Use thneed directly on the loaded YUV data (#22236)
* completely untested * it builds now * bug fixes, save 1ms * using a kernel to copy works * more sane API to loadyuv Co-authored-by: Comma Device <device@comma.ai>pull/22255/head
parent
f10ac7d060
commit
83ff9ca331
|
@ -22,16 +22,24 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
|
|||
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
|
||||
}
|
||||
|
||||
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, const mat3 &transform) {
|
||||
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, const mat3 &transform, cl_mem *output) {
|
||||
transform_queue(&this->transform, q,
|
||||
yuv_cl, frame_width, frame_height,
|
||||
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, transform);
|
||||
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
|
||||
|
||||
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
|
||||
clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr);
|
||||
clFinish(q);
|
||||
return &input_frames[0];
|
||||
if (output == NULL) {
|
||||
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
|
||||
|
||||
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
|
||||
CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
|
||||
clFinish(q);
|
||||
return &input_frames[0];
|
||||
} else {
|
||||
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
|
||||
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
|
||||
clFinish(q);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
ModelFrame::~ModelFrame() {
|
||||
|
|
|
@ -30,7 +30,7 @@ class ModelFrame {
|
|||
public:
|
||||
ModelFrame(cl_device_id device_id, cl_context context);
|
||||
~ModelFrame();
|
||||
float* prepare(cl_mem yuv_cl, int width, int height, const mat3& transform);
|
||||
float* prepare(cl_mem yuv_cl, int width, int height, const mat3& transform, cl_mem *output);
|
||||
|
||||
const int buf_size = MODEL_FRAME_SIZE * 2;
|
||||
|
||||
|
|
|
@ -105,7 +105,8 @@ ModelDataRaw model_eval_frame(ModelState* s, cl_mem yuv_cl, int width, int heigh
|
|||
|
||||
//for (int i = 0; i < OUTPUT_SIZE + TEMPORAL_SIZE; i++) { printf("%f ", s->output[i]); } printf("\n");
|
||||
|
||||
auto net_input_buf = s->frame->prepare(yuv_cl, width, height, transform);
|
||||
// if getInputBuf is not NULL, net_input_buf will be
|
||||
auto net_input_buf = s->frame->prepare(yuv_cl, width, height, transform, static_cast<cl_mem*>(s->m->getInputBuf()));
|
||||
s->m->execute(net_input_buf, s->frame->buf_size);
|
||||
|
||||
// net outputs
|
||||
|
|
|
@ -5,5 +5,6 @@ public:
|
|||
virtual void addDesire(float *state, int state_size) {}
|
||||
virtual void addTrafficConvention(float *state, int state_size) {}
|
||||
virtual void execute(float *net_input_buf, int buf_size) {}
|
||||
virtual void* getInputBuf() { return nullptr; }
|
||||
};
|
||||
|
||||
|
|
|
@ -25,6 +25,11 @@ void ThneedModel::addDesire(float *state, int state_size) {
|
|||
desire = state;
|
||||
}
|
||||
|
||||
void* ThneedModel::getInputBuf() {
|
||||
if (thneed->input_clmem.size() > 3) return &(thneed->input_clmem[3]);
|
||||
else return nullptr;
|
||||
}
|
||||
|
||||
void ThneedModel::execute(float *net_input_buf, int buf_size) {
|
||||
float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf};
|
||||
if (!recorded) {
|
||||
|
|
|
@ -10,6 +10,7 @@ public:
|
|||
void addTrafficConvention(float *state, int state_size);
|
||||
void addDesire(float *state, int state_size);
|
||||
void execute(float *net_input_buf, int buf_size);
|
||||
void* getInputBuf();
|
||||
private:
|
||||
Thneed *thneed = NULL;
|
||||
bool recorded;
|
||||
|
|
|
@ -241,6 +241,7 @@ void Thneed::find_inputs_outputs() {
|
|||
for (int i = 0; i < k->num_args; i++) {
|
||||
if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") {
|
||||
cl_mem aa = *(cl_mem*)(k->args[i].data());
|
||||
input_clmem.push_back(aa);
|
||||
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
|
@ -262,7 +263,7 @@ void Thneed::copy_inputs(float **finputs) {
|
|||
//cl_int ret;
|
||||
for (int idx = 0; idx < inputs.size(); ++idx) {
|
||||
if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", input_sizes[idx], finputs[idx], inputs[idx]);
|
||||
memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
|
||||
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -98,6 +98,7 @@ class Thneed {
|
|||
void wait();
|
||||
int optimize();
|
||||
|
||||
vector<cl_mem> input_clmem;
|
||||
vector<void *> inputs;
|
||||
vector<size_t> input_sizes;
|
||||
cl_mem output = NULL;
|
||||
|
|
|
@ -19,6 +19,7 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w
|
|||
|
||||
s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
|
||||
s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
|
||||
s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
|
||||
|
||||
// done with this
|
||||
CL_CHECK(clReleaseProgram(prg));
|
||||
|
@ -27,33 +28,46 @@ void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int w
|
|||
void loadyuv_destroy(LoadYUVState* s) {
|
||||
CL_CHECK(clReleaseKernel(s->loadys_krnl));
|
||||
CL_CHECK(clReleaseKernel(s->loaduv_krnl));
|
||||
CL_CHECK(clReleaseKernel(s->copy_krnl));
|
||||
}
|
||||
|
||||
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
|
||||
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
|
||||
cl_mem out_cl) {
|
||||
cl_mem out_cl, bool do_shift) {
|
||||
cl_int global_out_off = 0;
|
||||
if (do_shift) {
|
||||
// shift the image in slot 1 to slot 0, then place the new image in slot 1
|
||||
global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2;
|
||||
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off));
|
||||
const size_t copy_work_size = global_out_off/8;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
|
||||
©_work_size, NULL, 0, 0, NULL));
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
|
||||
|
||||
const size_t loadys_work_size = (s->width*s->height)/8;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
|
||||
&loadys_work_size, NULL, 0, 0, NULL));
|
||||
|
||||
const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
|
||||
cl_int loaduv_out_off = (s->width*s->height);
|
||||
global_out_off += (s->width*s->height);
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &loaduv_out_off));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
|
||||
&loaduv_work_size, NULL, 0, 0, NULL));
|
||||
|
||||
loaduv_out_off += (s->width/2)*(s->height/2);
|
||||
global_out_off += (s->width/2)*(s->height/2);
|
||||
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &loaduv_out_off));
|
||||
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
|
||||
&loaduv_work_size, NULL, 0, 0, NULL));
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
#define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
|
||||
|
||||
__kernel void loadys(__global uchar8 const * const Y,
|
||||
__global float * out)
|
||||
__global float * out,
|
||||
int out_offset)
|
||||
{
|
||||
const int gid = get_global_id(0);
|
||||
const int ois = gid * 8;
|
||||
|
@ -17,11 +18,11 @@ __kernel void loadys(__global uchar8 const * const Y,
|
|||
__global float* outy0;
|
||||
__global float* outy1;
|
||||
if ((oy & 1) == 0) {
|
||||
outy0 = out; //y0
|
||||
outy1 = out + UV_SIZE*2; //y2
|
||||
outy0 = out + out_offset; //y0
|
||||
outy1 = out + out_offset + UV_SIZE*2; //y2
|
||||
} else {
|
||||
outy0 = out + UV_SIZE; //y1
|
||||
outy1 = out + UV_SIZE*3; //y3
|
||||
outy0 = out + out_offset + UV_SIZE; //y1
|
||||
outy1 = out + out_offset + UV_SIZE*3; //y3
|
||||
}
|
||||
|
||||
vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
|
||||
|
@ -37,3 +38,10 @@ __kernel void loaduv(__global uchar8 const * const in,
|
|||
const float8 outv = convert_float8(inv);
|
||||
out[gid + out_offset / 8] = outv;
|
||||
}
|
||||
|
||||
__kernel void copy(__global float8 * inout,
|
||||
int in_offset)
|
||||
{
|
||||
const int gid = get_global_id(0);
|
||||
inout[gid] = inout[gid + in_offset / 8];
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
typedef struct {
|
||||
int width, height;
|
||||
cl_kernel loadys_krnl, loaduv_krnl;
|
||||
cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
|
||||
} LoadYUVState;
|
||||
|
||||
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
|
||||
|
@ -13,4 +13,4 @@ void loadyuv_destroy(LoadYUVState* s);
|
|||
|
||||
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
|
||||
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
|
||||
cl_mem out_cl);
|
||||
cl_mem out_cl, bool do_shift = false);
|
||||
|
|
Loading…
Reference in New Issue