openpilot/selfdrive/modeld/thneed/debug/microbenchmark/gemm_image.cl

76 lines
1.8 KiB
Common Lisp

// https://github.com/moskewcz/boda/issues/13
//#define USE_FP16
#ifdef USE_FP16
#define xtype half4
#define read_imagep read_imageh
#define write_imagep write_imageh
#else
#define xtype float4
#define read_imagep read_imagef
#define write_imagep write_imagef
#endif
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void gemm(const int M, const int N, const int K,
read_only image2d_t A,
read_only image2d_t B,
write_only image2d_t C)
{
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
xtype c_r[4] = {0,0,0,0};
xtype a_r[4], b_r[4];
int const a_off_thr = get_global_id(0);
int const b_off_thr = get_global_id(1);
int2 a_samp = {0, a_off_thr};
int2 b_samp = {0, b_off_thr};
for (short k = 0; k < K/4; k++) {
for (short i = 0; i < 4; ++i) {
a_r[i] = read_imagep(A, smp, a_samp);
b_r[i] = read_imagep(B, smp, b_samp);
++a_samp.x;
++b_samp.x;
}
for (short i = 0; i < 4; ++i) {
float4 ov = c_r[i];
ov.x += a_r[i].x * b_r[0].x;
ov.x += a_r[i].y * b_r[0].y;
ov.x += a_r[i].z * b_r[0].z;
ov.x += a_r[i].w * b_r[0].w;
ov.y += a_r[i].x * b_r[1].x;
ov.y += a_r[i].y * b_r[1].y;
ov.y += a_r[i].z * b_r[1].z;
ov.y += a_r[i].w * b_r[1].w;
ov.z += a_r[i].x * b_r[2].x;
ov.z += a_r[i].y * b_r[2].y;
ov.z += a_r[i].z * b_r[2].z;
ov.z += a_r[i].w * b_r[2].w;
ov.w += a_r[i].x * b_r[3].x;
ov.w += a_r[i].y * b_r[3].y;
ov.w += a_r[i].z * b_r[3].z;
ov.w += a_r[i].w * b_r[3].w;
c_r[i] = ov;
}
}
int2 c_samp = {a_off_thr, b_off_thr*4};
for (short i = 0; i < 4; i++) {
write_imagep(C, c_samp, c_r[i]);
++c_samp.y;
}
}