// https://github.com/moskewcz/boda/issues/13

// Precision switch: define USE_FP16 (e.g. by uncommenting the next line) to
// select the half-precision vector type and image accessors instead of the
// single-precision ones.
//#define USE_FP16

// xtype        : 4-component vector type used for the kernel's working values
// read_imagep  : image read built-in that returns an xtype
// write_imagep : image write built-in that accepts an xtype
#ifndef USE_FP16
  #define xtype        float4
  #define read_imagep  read_imagef
  #define write_imagep write_imagef
#else
  #define xtype        half4
  #define read_imagep  read_imageh
  #define write_imagep write_imageh
#endif

// cl_khr_fp16 is only needed when the half-precision path is selected.
// Enabling an extension the device does not support is an error, so do not
// request it for the float build.
#ifdef USE_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

// Image-based blocked multiply-accumulate kernel.
//
// Each work-item reads along one row of A (row index = get_global_id(0)) and
// one row of B (row index = get_global_id(1)). The main loop runs K/4 times
// (integer division); every iteration fetches 4 consecutive texels from each
// row and accumulates, for i and j in 0..3,
//     c_r[i].<component j> += a_r[i].x*b_r[j].x + a_r[i].y*b_r[j].y
//                           + a_r[i].z*b_r[j].z + a_r[i].w*b_r[j].w
// The four accumulators are finally written to C at
//     (x = get_global_id(0), y = get_global_id(1)*4 + i).
//
// Parameters:
//   M, N  - not referenced by the kernel body.
//           NOTE(review): no bounds check is performed against them; the
//           launch size presumably has to match the image extents exactly -
//           confirm against the host code.
//   K     - controls the trip count of the main loop (K/4 iterations,
//           4 texels per iteration per input row).
//   A, B  - input images, sampled with unnormalized integer coordinates.
//   C     - output image.
__kernel void gemm(const int M, const int N, const int K,
  read_only image2d_t A,
  read_only image2d_t B,
  write_only image2d_t C)
{
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                        CLK_ADDRESS_CLAMP           |
                        CLK_FILTER_NEAREST;

  // Accumulators, explicitly zeroed per vector.
  xtype c_r[4] = { (xtype)(0), (xtype)(0), (xtype)(0), (xtype)(0) };
  xtype a_r[4], b_r[4];

  int const a_off_thr = get_global_id(0);
  int const b_off_thr = get_global_id(1);

  // x advances along the row as texels are consumed; y stays on this
  // work-item's row.
  int2 a_samp = {0, a_off_thr};
  int2 b_samp = {0, b_off_thr};

  // int (not short) counter: K is an int, and a short counter could never
  // reach the bound once K/4 exceeds 32767.
  for (int k = 0; k < K/4; k++) {
    // Fetch the next 4 texels from each input row.
    for (int i = 0; i < 4; ++i) {
      a_r[i] = read_imagep(A, smp, a_samp);
      b_r[i] = read_imagep(B, smp, b_samp);
      ++a_samp.x;
      ++b_samp.x;
    }

    for (int i = 0; i < 4; ++i) {
      // Use xtype rather than a hard-coded float4: OpenCL C has no implicit
      // conversion between vector types, so float4 here breaks the FP16 build.
      xtype ov = c_r[i];

      ov.x += a_r[i].x * b_r[0].x;
      ov.x += a_r[i].y * b_r[0].y;
      ov.x += a_r[i].z * b_r[0].z;
      ov.x += a_r[i].w * b_r[0].w;

      ov.y += a_r[i].x * b_r[1].x;
      ov.y += a_r[i].y * b_r[1].y;
      ov.y += a_r[i].z * b_r[1].z;
      ov.y += a_r[i].w * b_r[1].w;

      ov.z += a_r[i].x * b_r[2].x;
      ov.z += a_r[i].y * b_r[2].y;
      ov.z += a_r[i].z * b_r[2].z;
      ov.z += a_r[i].w * b_r[2].w;

      ov.w += a_r[i].x * b_r[3].x;
      ov.w += a_r[i].y * b_r[3].y;
      ov.w += a_r[i].z * b_r[3].z;
      ov.w += a_r[i].w * b_r[3].w;

      c_r[i] = ov;
    }
  }

  // Store the 4 accumulators in 4 consecutive rows of C, same column.
  int2 c_samp = {a_off_thr, b_off_thr*4};
  for (int i = 0; i < 4; i++) {
    write_imagep(C, c_samp, c_r[i]);
    ++c_samp.y;
  }
}