Use thneed directly on the loaded YUV data (#22236)

* completely untested

* it builds now

* bug fixes, save 1ms

* using a kernel to copy works

* more sane API to loadyuv

Co-authored-by: Comma Device <device@comma.ai>
pull/22255/head
George Hotz 2021-09-16 14:13:03 -07:00 committed by GitHub
parent f10ac7d060
commit 83ff9ca331
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 61 additions and 21 deletions

View File

@ -22,16 +22,24 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
}
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, const mat3 &transform) {
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, const mat3 &transform, cl_mem *output) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, transform);
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr);
clFinish(q);
return &input_frames[0];
if (output == NULL) {
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}
}
ModelFrame::~ModelFrame() {

View File

@ -30,7 +30,7 @@ class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
float* prepare(cl_mem yuv_cl, int width, int height, const mat3& transform);
float* prepare(cl_mem yuv_cl, int width, int height, const mat3& transform, cl_mem *output);
const int buf_size = MODEL_FRAME_SIZE * 2;

View File

@ -105,7 +105,8 @@ ModelDataRaw model_eval_frame(ModelState* s, cl_mem yuv_cl, int width, int heigh
//for (int i = 0; i < OUTPUT_SIZE + TEMPORAL_SIZE; i++) { printf("%f ", s->output[i]); } printf("\n");
auto net_input_buf = s->frame->prepare(yuv_cl, width, height, transform);
// if getInputBuf is not NULL, net_input_buf will be
auto net_input_buf = s->frame->prepare(yuv_cl, width, height, transform, static_cast<cl_mem*>(s->m->getInputBuf()));
s->m->execute(net_input_buf, s->frame->buf_size);
// net outputs

View File

@ -5,5 +5,6 @@ public:
virtual void addDesire(float *state, int state_size) {}
virtual void addTrafficConvention(float *state, int state_size) {}
virtual void execute(float *net_input_buf, int buf_size) {}
virtual void* getInputBuf() { return nullptr; }
};

View File

@ -25,6 +25,11 @@ void ThneedModel::addDesire(float *state, int state_size) {
desire = state;
}
void* ThneedModel::getInputBuf() {
if (thneed->input_clmem.size() > 3) return &(thneed->input_clmem[3]);
else return nullptr;
}
void ThneedModel::execute(float *net_input_buf, int buf_size) {
float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf};
if (!recorded) {

View File

@ -10,6 +10,7 @@ public:
void addTrafficConvention(float *state, int state_size);
void addDesire(float *state, int state_size);
void execute(float *net_input_buf, int buf_size);
void* getInputBuf();
private:
Thneed *thneed = NULL;
bool recorded;

View File

@ -241,6 +241,7 @@ void Thneed::find_inputs_outputs() {
for (int i = 0; i < k->num_args; i++) {
if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") {
cl_mem aa = *(cl_mem*)(k->args[i].data());
input_clmem.push_back(aa);
size_t sz;
clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
@ -262,7 +263,7 @@ void Thneed::copy_inputs(float **finputs) {
//cl_int ret;
for (int idx = 0; idx < inputs.size(); ++idx) {
if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", input_sizes[idx], finputs[idx], inputs[idx]);
memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
}
}

View File

@ -98,6 +98,7 @@ class Thneed {
void wait();
int optimize();
vector<cl_mem> input_clmem;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;

View File

@ -19,6 +19,7 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w
s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
@ -27,33 +28,46 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w
void loadyuv_destroy(LoadYUVState* s) {
CL_CHECK(clReleaseKernel(s->loadys_krnl));
CL_CHECK(clReleaseKernel(s->loaduv_krnl));
CL_CHECK(clReleaseKernel(s->copy_krnl));
}
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl) {
cl_mem out_cl, bool do_shift) {
cl_int global_out_off = 0;
if (do_shift) {
// shift the image in slot 1 to slot 0, then place the new image in slot 1
global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2;
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off));
const size_t copy_work_size = global_out_off/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
&copy_work_size, NULL, 0, 0, NULL));
}
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
const size_t loadys_work_size = (s->width*s->height)/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
&loadys_work_size, NULL, 0, 0, NULL));
const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
cl_int loaduv_out_off = (s->width*s->height);
global_out_off += (s->width*s->height);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &loaduv_out_off));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
loaduv_out_off += (s->width/2)*(s->height/2);
global_out_off += (s->width/2)*(s->height/2);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &loaduv_out_off));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));

View File

@ -1,7 +1,8 @@
#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
__kernel void loadys(__global uchar8 const * const Y,
__global float * out)
__global float * out,
int out_offset)
{
const int gid = get_global_id(0);
const int ois = gid * 8;
@ -17,11 +18,11 @@ __kernel void loadys(__global uchar8 const * const Y,
__global float* outy0;
__global float* outy1;
if ((oy & 1) == 0) {
outy0 = out; //y0
outy1 = out + UV_SIZE*2; //y2
outy0 = out + out_offset; //y0
outy1 = out + out_offset + UV_SIZE*2; //y2
} else {
outy0 = out + UV_SIZE; //y1
outy1 = out + UV_SIZE*3; //y3
outy0 = out + out_offset + UV_SIZE; //y1
outy1 = out + out_offset + UV_SIZE*3; //y3
}
vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
@ -37,3 +38,10 @@ __kernel void loaduv(__global uchar8 const * const in,
const float8 outv = convert_float8(inv);
out[gid + out_offset / 8] = outv;
}
__kernel void copy(__global float8 * inout,
int in_offset)
{
const int gid = get_global_id(0);
inout[gid] = inout[gid + in_offset / 8];
}

View File

@ -4,7 +4,7 @@
typedef struct {
int width, height;
cl_kernel loadys_krnl, loaduv_krnl;
cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
} LoadYUVState;
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
@ -13,4 +13,4 @@ void loadyuv_destroy(LoadYUVState* s);
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl);
cl_mem out_cl, bool do_shift = false);