diff --git a/.gitignore b/.gitignore index ec33cd9f7..1f729d9db 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,6 @@ flycheck_* cppcheck_report.txt comma.sh + +selfdrive/modeld/thneed/compile +models/*.thneed diff --git a/release/files_common b/release/files_common index 76058366d..21f96f861 100644 --- a/release/files_common +++ b/release/files_common @@ -407,10 +407,14 @@ selfdrive/modeld/transforms/transform.h selfdrive/modeld/transforms/transform.cl selfdrive/modeld/thneed/thneed.* +selfdrive/modeld/thneed/serialize.cc +selfdrive/modeld/thneed/compile.cc selfdrive/modeld/thneed/include/* selfdrive/modeld/runners/snpemodel.cc selfdrive/modeld/runners/snpemodel.h +selfdrive/modeld/runners/thneedmodel.cc +selfdrive/modeld/runners/thneedmodel.h selfdrive/modeld/runners/runmodel.h selfdrive/modeld/runners/run.h diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 4fcded106..68b067f35 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -10,14 +10,17 @@ common_src = [ "transforms/transform.cc" ] -if arch == "aarch64": - libs += ['gsl', 'CB', 'gnustl_shared'] - common_src += ["thneed/thneed.cc"] - lenv['CFLAGS'].append("-DUSE_THNEED") - lenv['CXXFLAGS'].append("-DUSE_THNEED") -elif arch == "larch64": - libs += ['gsl', 'CB', 'pthread', 'dl'] - common_src += ["thneed/thneed.cc"] +thneed_src = [ + "thneed/thneed.cc", + "thneed/serialize.cc", + "runners/thneedmodel.cc", +] + +if arch == "aarch64" or arch == "larch64": + libs += ['gsl', 'CB'] + libs += ['gnustl_shared'] if arch == "aarch64" else ['pthread', 'dl'] + + common_src += thneed_src lenv['CFLAGS'].append("-DUSE_THNEED") lenv['CXXFLAGS'].append("-DUSE_THNEED") else: @@ -40,15 +43,23 @@ else: del libs[libs.index('symphony-cpu')] del common_src[common_src.index('runners/snpemodel.cc')] -common = lenv.Object(common_src) +common_model = lenv.Object(common_src) + +# build thneed model +if arch == "aarch64" or arch == "larch64": + compiler = lenv.Program('thneed/compile', ["thneed/compile.cc" ]+common_model, LIBS=libs) + cmd = f"cd {Dir('.').get_abspath()} && {compiler[0].get_abspath()} ../../models/supercombo.dlc ../../models/supercombo.thneed --binary" + snpe_path = "/data/pythonpath/phonelibs/snpe/"+arch + cenv = Environment(ENV = {'LD_LIBRARY_PATH' : snpe_path+":"+lenv["ENV"]["LD_LIBRARY_PATH"]}) + cenv.Command("../../models/supercombo.thneed", ["../../models/supercombo.dlc", compiler], cmd) lenv.Program('_dmonitoringmodeld', [ "dmonitoringmodeld.cc", "models/dmonitoring.cc", - ]+common, LIBS=libs) + ]+common_model, LIBS=libs) lenv.Program('_modeld', [ "modeld.cc", "models/driving.cc", - ]+common, LIBS=libs) + ]+common_model, LIBS=libs) diff --git a/selfdrive/modeld/models/driving.cc b/selfdrive/modeld/models/driving.cc index bad5b3cce..bce849101 100644 --- a/selfdrive/modeld/models/driving.cc +++ b/selfdrive/modeld/models/driving.cc @@ -54,7 +54,13 @@ void model_init(ModelState* s, cl_device_id device_id, cl_context context) { constexpr int output_size = OUTPUT_SIZE + TEMPORAL_SIZE; s->output = std::make_unique(output_size); + memset(&s->output[0], 0, output_size*sizeof(float)); + +#if defined(QCOM) || defined(QCOM2) + s->m = std::make_unique("../../models/supercombo.thneed", &s->output[0], output_size, USE_GPU_RUNTIME); +#else s->m = std::make_unique("../../models/supercombo.dlc", &s->output[0], output_size, USE_GPU_RUNTIME); +#endif #ifdef TEMPORAL s->m->addRecurrent(&s->output[OUTPUT_SIZE], TEMPORAL_SIZE); diff --git a/selfdrive/modeld/runners/run.h b/selfdrive/modeld/runners/run.h index dea340a0a..98a3fb8b6 100644 --- a/selfdrive/modeld/runners/run.h +++ b/selfdrive/modeld/runners/run.h @@ -1,10 +1,10 @@ -#ifndef RUN_H -#define RUN_H +#pragma once #include "runmodel.h" #include "snpemodel.h" -#ifdef QCOM +#if defined(QCOM) || defined(QCOM2) + #include "thneedmodel.h" #define DefaultRunModel SNPEModel #else #ifdef USE_ONNX_MODEL @@ -14,5 +14,3 @@ #define DefaultRunModel SNPEModel #endif #endif - -#endif diff --git a/selfdrive/modeld/runners/snpemodel.h b/selfdrive/modeld/runners/snpemodel.h index 90c26664f..76339642f 100644 --- a/selfdrive/modeld/runners/snpemodel.h +++ b/selfdrive/modeld/runners/snpemodel.h @@ -31,13 +31,14 @@ public: void addTrafficConvention(float *state, int state_size); void addDesire(float *state, int state_size); void execute(float *net_input_buf, int buf_size); -private: - uint8_t *model_data = NULL; #ifdef USE_THNEED Thneed *thneed = NULL; #endif +private: + uint8_t *model_data = NULL; + #if defined(QCOM) || defined(QCOM2) zdl::DlSystem::Runtime_t Runtime; #endif diff --git a/selfdrive/modeld/runners/thneedmodel.cc b/selfdrive/modeld/runners/thneedmodel.cc new file mode 100644 index 000000000..0ebe7226e --- /dev/null +++ b/selfdrive/modeld/runners/thneedmodel.cc @@ -0,0 +1,41 @@ +#include "thneedmodel.h" +#include + +ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime) { + thneed = new Thneed(true); + thneed->record = 0; + thneed->load(path); + thneed->clexec(); + thneed->find_inputs_outputs(); + + recorded = false; + output = loutput; +} + +void ThneedModel::addRecurrent(float *state, int state_size) { + recurrent = state; +} + +void ThneedModel::addTrafficConvention(float *state, int state_size) { + trafficConvention = state; +} + +void ThneedModel::addDesire(float *state, int state_size) { + desire = state; +} + +void ThneedModel::execute(float *net_input_buf, int buf_size) { + float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf}; + if (!recorded) { + thneed->record = THNEED_RECORD; + thneed->copy_inputs(inputs); + thneed->clexec(); + thneed->copy_output(output); + thneed->stop(); + + recorded = true; + } else { + thneed->execute(inputs, output); + } +} + diff --git a/selfdrive/modeld/runners/thneedmodel.h b/selfdrive/modeld/runners/thneedmodel.h new file mode 100644 index 000000000..05cb2438e --- /dev/null +++ b/selfdrive/modeld/runners/thneedmodel.h @@ -0,0 +1,24 @@ +#pragma once + +#include "runmodel.h" +#include "thneed/thneed.h" + +class ThneedModel : public RunModel { +public: + ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime); + void addRecurrent(float *state, int state_size); + void addTrafficConvention(float *state, int state_size); + void addDesire(float *state, int state_size); + void execute(float *net_input_buf, int buf_size); +private: + Thneed *thneed = NULL; + bool recorded; + + float *output; + + // recurrent and desire + float *recurrent; + float *trafficConvention; + float *desire; +}; + diff --git a/selfdrive/modeld/thneed/compile.cc b/selfdrive/modeld/thneed/compile.cc new file mode 100644 index 000000000..63955a6f9 --- /dev/null +++ b/selfdrive/modeld/thneed/compile.cc @@ -0,0 +1,34 @@ +#include +#include "thneed.h" +#include "../runners/snpemodel.h" + +#define TEMPORAL_SIZE 512 +#define DESIRE_LEN 8 +#define TRAFFIC_CONVENTION_LEN 2 + +// TODO: This should probably use SNPE directly. +int main(int argc, char* argv[]) { + #define OUTPUT_SIZE 0x10000 + float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float)); + SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME); + + float state[TEMPORAL_SIZE] = {0}; + float desire[DESIRE_LEN] = {0}; + float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0}; + float *input = (float*)calloc(0x1000000, sizeof(float));; + + mdl.addRecurrent(state, TEMPORAL_SIZE); + mdl.addDesire(desire, DESIRE_LEN); + mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN); + + // first run + printf("************** execute 1 **************\n"); + memset(output, 0, OUTPUT_SIZE * sizeof(float)); + mdl.execute(input, 0); + + // save model + bool save_binaries = (argc > 3) && (strcmp(argv[3], "--binary") == 0); + mdl.thneed->save(argv[2], save_binaries); + return 0; +} + diff --git a/selfdrive/modeld/thneed/serialize.cc b/selfdrive/modeld/thneed/serialize.cc new file mode 100644 index 000000000..7f22d631f --- /dev/null +++ b/selfdrive/modeld/thneed/serialize.cc @@ -0,0 +1,290 @@ +#include +#include +#include "thneed.h" +#include "json11.hpp" +using namespace json11; + +extern map g_program_source; + +void Thneed::load(const char *filename) { + printf("Thneed::load: loading from %s\n", filename); + + FILE *f = fopen(filename, "rb"); + fseek(f, 0L, SEEK_END); + int sz = ftell(f); + fseek(f, 0L, SEEK_SET); + char *buf = (char*)malloc(sz); + fread(buf, 1, sz, f); + fclose(f); + + int jsz = *(int *)buf; + string jj(buf+4, jsz); + string err; + Json jdat = Json::parse(jj, err); + + map real_mem; + real_mem[NULL] = NULL; + + int ptr = 4+jsz; + for (auto &obj : jdat["objects"].array_items()) { + auto mobj = obj.object_items(); + int sz = mobj["size"].int_value(); + cl_mem clbuf = NULL; + + if (mobj["buffer_id"].string_value().size() > 0) { + // image buffer must already be allocated + clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())]; + assert(mobj["needs_load"].bool_value() == false); + } else { + if (mobj["needs_load"].bool_value()) { + //printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr); + clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL); + ptr += sz; + } else { + clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL); + } + } + assert(clbuf != NULL); + + if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") { + cl_image_desc desc = {0}; + desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = mobj["width"].int_value(); + desc.image_height = mobj["height"].int_value(); + desc.image_row_pitch = mobj["row_pitch"].int_value(); + desc.buffer = clbuf; + + cl_image_format format; + format.image_channel_order = CL_RGBA; + format.image_channel_data_type = CL_HALF_FLOAT; + + clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, NULL); + assert(clbuf != NULL); + } + + real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf; + } + + map g_programs; + for (auto &obj : jdat["programs"].object_items()) { + const char *srcs[1]; + srcs[0] = (const char *)obj.second.string_value().c_str(); + size_t length = obj.second.string_value().size(); + + if (record & THNEED_DEBUG) printf("building %s with size %zu\n", obj.first.c_str(), length); + + cl_program program = clCreateProgramWithSource(context, 1, srcs, &length, NULL); + int err = clBuildProgram(program, 1, &device_id, "", NULL, NULL); + if (err != 0) { + printf("got err %d\n", err); + size_t length; + char buffer[2048]; + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); + buffer[length] = '\0'; + printf("%s\n", buffer); + } + assert(err == 0); + + g_programs[obj.first] = program; + } + + for (auto &obj : jdat["binaries"].array_items()) { + string name = obj["name"].string_value(); + size_t length = obj["length"].int_value(); + const unsigned char *srcs[1]; + srcs[0] = (const unsigned char *)&buf[ptr]; + ptr += length; + + if (record & THNEED_DEBUG) printf("binary %s with size %zu\n", name.c_str(), length); + + cl_int err; + cl_program program = clCreateProgramWithBinary(context, 1, &device_id, &length, srcs, NULL, &err); + assert(program != NULL && err == CL_SUCCESS); + err = clBuildProgram(program, 1, &device_id, "", NULL, NULL); + assert(err == CL_SUCCESS); + + g_programs[name] = program; + } + + for (auto &obj : jdat["kernels"].array_items()) { + auto gws = obj["global_work_size"]; + auto lws = obj["local_work_size"]; + auto kk = shared_ptr(new CLQueuedKernel(this)); + + kk->name = obj["name"].string_value(); + kk->program = g_programs[kk->name]; + kk->work_dim = obj["work_dim"].int_value(); + for (int i = 0; i < kk->work_dim; i++) { + kk->global_work_size[i] = gws[i].int_value(); + kk->local_work_size[i] = lws[i].int_value(); + } + kk->num_args = obj["num_args"].int_value(); + for (int i = 0; i < kk->num_args; i++) { + string arg = obj["args"].array_items()[i].string_value(); + int arg_size = obj["args_size"].array_items()[i].int_value(); + kk->args_size.push_back(arg_size); + if (arg_size == 8) { + cl_mem val = *(cl_mem*)(arg.data()); + val = real_mem[val]; + kk->args.push_back(string((char*)&val, sizeof(val))); + } else { + kk->args.push_back(arg); + } + } + kq.push_back(kk); + } + + free(buf); + clFinish(command_queue); +} + +void Thneed::save(const char *filename, bool save_binaries) { + printf("Thneed::save: saving to %s\n", filename); + + // get kernels + std::vector kernels; + std::set saved_objects; + std::vector objects; + std::map programs; + std::map binaries; + + for (auto &k : kq) { + kernels.push_back(k->to_json()); + + // check args for objects + int i = 0; + for (auto &a : k->args) { + if (a.size() == 8) { + if (saved_objects.find(a) == saved_objects.end()) { + saved_objects.insert(a); + cl_mem val = *(cl_mem*)(a.data()); + if (val != NULL) { + bool needs_load = k->arg_names[i] == "weights" || k->arg_names[i] == "biases"; + + auto jj = Json::object({ + {"id", a}, + {"arg_type", k->arg_types[i]}, + }); + + if (k->arg_types[i] == "image2d_t" || k->arg_types[i] == "image1d_t") { + cl_mem buf; + clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL); + string aa = string((char *)&buf, sizeof(buf)); + jj["buffer_id"] = aa; + + size_t width, height, row_pitch; + clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL); + clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); + clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL); + jj["width"] = (int)width; + jj["height"] = (int)height; + jj["row_pitch"] = (int)row_pitch; + jj["size"] = (int)(height * row_pitch); + jj["needs_load"] = false; + + if (saved_objects.find(aa) == saved_objects.end()) { + saved_objects.insert(aa); + size_t sz; + clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + // save the buffer + objects.push_back(Json::object({ + {"id", aa}, + {"arg_type", ""}, + {"needs_load", needs_load}, + {"size", (int)sz} + })); + if (needs_load) assert(sz == height * row_pitch); + } + } else { + size_t sz = 0; + clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + jj["size"] = (int)sz; + jj["needs_load"] = needs_load; + } + + objects.push_back(jj); + } + } + } + i++; + } + + if (save_binaries) { + int err; + size_t binary_size = 0; + err = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL); + assert(err == 0); + assert(binary_size > 0); + string sv(binary_size, '\x00'); + + uint8_t* bufs[1] = { (uint8_t*)sv.data(), }; + err = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL); + assert(err == 0); + + binaries[k->name] = sv; + } else { + programs[k->name] = g_program_source[k->program]; + } + } + + vector saved_buffers; + for (auto &obj : objects) { + auto mobj = obj.object_items(); + cl_mem val = *(cl_mem*)(mobj["id"].string_value().data()); + int sz = mobj["size"].int_value(); + if (mobj["needs_load"].bool_value()) { + char *buf = (char *)malloc(sz); + if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") { + assert(false); + } else { + // buffers alloced with CL_MEM_HOST_WRITE_ONLY, hence this hack + //hexdump((uint32_t*)val, 0x100); + + // the worst hack in thneed, the flags are at 0x14 + ((uint32_t*)val)[0x14] &= ~CL_MEM_HOST_WRITE_ONLY; + cl_int ret = clEnqueueReadBuffer(command_queue, val, CL_TRUE, 0, sz, buf, 0, NULL, NULL); + assert(ret == CL_SUCCESS); + } + //printf("saving buffer: %d %p %s\n", sz, buf, mobj["arg_type"].string_value().c_str()); + saved_buffers.push_back(string(buf, sz)); + free(buf); + } + } + + std::vector jbinaries; + for (auto &obj : binaries) { + jbinaries.push_back(Json::object({{"name", obj.first}, {"length", (int)obj.second.size()}})); + saved_buffers.push_back(obj.second); + } + + Json jdat = Json::object({ + {"kernels", kernels}, + {"objects", objects}, + {"programs", programs}, + {"binaries", jbinaries}, + }); + + string str = jdat.dump(); + int jsz = str.length(); + + FILE *f = fopen(filename, "wb"); + fwrite(&jsz, 1, sizeof(jsz), f); + fwrite(str.data(), 1, jsz, f); + for (auto &s : saved_buffers) { + fwrite(s.data(), 1, s.length(), f); + } + fclose(f); +} + +Json CLQueuedKernel::to_json() const { + return Json::object { + { "name", name }, + { "work_dim", (int)work_dim }, + { "global_work_size", Json::array { (int)global_work_size[0], (int)global_work_size[1], (int)global_work_size[2] } }, + { "local_work_size", Json::array { (int)local_work_size[0], (int)local_work_size[1], (int)local_work_size[2] } }, + { "num_args", (int)num_args }, + { "args", args }, + { "args_size", args_size }, + }; +} + diff --git a/selfdrive/modeld/thneed/thneed.cc b/selfdrive/modeld/thneed/thneed.cc index 443285653..4442cf682 100644 --- a/selfdrive/modeld/thneed/thneed.cc +++ b/selfdrive/modeld/thneed/thneed.cc @@ -7,8 +7,6 @@ #include #include "thneed.h" -//#define SAVE_KERNELS - //#define RUN_DISASSEMBLER //#define RUN_OPTIMIZER @@ -83,7 +81,8 @@ int ioctl(int filedes, unsigned long request, void *argp) { } if (thneed->record & THNEED_RECORD) { - thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count)); + thneed->cmds.push_back(unique_ptr(new + CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count)))); } } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) { struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp; @@ -103,6 +102,14 @@ int ioctl(int filedes, unsigned long request, void *argp) { } } } + } else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) { + // this happens + } else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) { + // this happens + } else { + if (thneed->record & THNEED_DEBUG) { + printf("other ioctl %lx\n", request); + } } } @@ -133,13 +140,27 @@ GPUMalloc::~GPUMalloc() { } void *GPUMalloc::alloc(int size) { - if (size > remaining) return NULL; - remaining -= size; void *ret = (void*)base; - base += (size+0xff) & (~0xFF); + size = (size+0xff) & (~0xFF); + assert(size <= remaining); + remaining -= size; + base += size; return ret; } +// *********** CachedSync, at the ioctl layer *********** + +void CachedSync::exec() { + struct kgsl_gpuobj_sync cmd; + + cmd.objs = (uint64_t)data.data(); + cmd.obj_len = data.length(); + cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj); + + int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd); + assert(ret == 0); +} + // *********** CachedCommand, at the ioctl layer *********** CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) { @@ -174,24 +195,11 @@ CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) { thneed->ckq.clear(); } -void CachedCommand::exec(bool wait) { +void CachedCommand::exec() { cache.timestamp = ++thneed->timestamp; int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache); - if (wait) { - struct kgsl_device_waittimestamp_ctxtid wait; - wait.context_id = cache.context_id; - wait.timestamp = cache.timestamp; - wait.timeout = -1; - - uint64_t tb = nanos_since_boot(); - int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait); - uint64_t te = nanos_since_boot(); - - if (thneed->record & THNEED_DEBUG) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000); - } else { - if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret); - } + if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret); if (thneed->record & THNEED_VERBOSE_DEBUG) { for (auto &it : kq) { @@ -213,32 +221,85 @@ Thneed::Thneed(bool do_clinit) { if (do_clinit) clinit(); assert(g_fd != -1); fd = g_fd; - ram = make_unique(0x40000, fd); + ram = make_unique(0x80000, fd); record = THNEED_RECORD; timestamp = -1; g_thneed = this; } void Thneed::stop() { + find_inputs_outputs(); + printf("Thneed::stop: recorded %lu commands\n", cmds.size()); record = 0; } +void Thneed::find_inputs_outputs() { + cl_int err; + if (inputs.size() > 0) return; + + // save the global inputs/outputs + for (auto &k : kq) { + for (int i = 0; i < k->num_args; i++) { + if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") { + cl_mem aa = *(cl_mem*)(k->args[i].data()); + + size_t sz; + clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + input_sizes.push_back(sz); + + void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &err); + assert(err == CL_SUCCESS); + inputs.push_back(ret); + } + + if (k->name == "image2d_to_buffer_float" && k->arg_names[i] == "output") { + output = *(cl_mem*)(k->args[i].data()); + } + } + } +} + +void Thneed::copy_inputs(float **finputs) { + //cl_int ret; + for (int idx = 0; idx < inputs.size(); ++idx) { + if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", input_sizes[idx], finputs[idx], inputs[idx]); + memcpy(inputs[idx], finputs[idx], input_sizes[idx]); + } +} + +void Thneed::copy_output(float *foutput) { + if (output != NULL) { + size_t sz; + clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL); + if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput); + clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL); + } else { + printf("CAUTION: model output is NULL, does it have no outputs?\n"); + } +} + +void Thneed::wait() { + struct kgsl_device_waittimestamp_ctxtid wait; + wait.context_id = context_id; + wait.timestamp = timestamp; + wait.timeout = -1; + + uint64_t tb = nanos_since_boot(); + int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait); + uint64_t te = nanos_since_boot(); + + if (record & THNEED_DEBUG) printf("wait %d after %lu us\n", wret, (te-tb)/1000); +} + void Thneed::execute(float **finputs, float *foutput, bool slow) { - int ret; uint64_t tb, te; if (record & THNEED_DEBUG) tb = nanos_since_boot(); // ****** copy inputs - for (int idx = 0; idx < inputs.size(); ++idx) { - size_t sz; - clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL); - - if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]); - // TODO: This shouldn't have to block - clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL); - } + copy_inputs(finputs); // ****** set power constraint + int ret; struct kgsl_device_constraint_pwrlevel pwrlevel; pwrlevel.level = KGSL_CONSTRAINT_PWR_MAX; @@ -260,30 +321,12 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) { for (auto &it : cmds) { ++i; if (record & THNEED_DEBUG) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000); - it->exec((i == cmds.size()) || slow); - } - - // ****** sync objects - for (auto &it : syncobjs) { - struct kgsl_gpuobj_sync cmd; - - cmd.objs = (uint64_t)it.data(); - cmd.obj_len = it.length(); - cmd.count = it.length() / sizeof(struct kgsl_gpuobj_sync_obj); - - ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd); - assert(ret == 0); + it->exec(); + if ((i == cmds.size()) || slow) wait(); } // ****** copy outputs - if (output != NULL) { - size_t sz; - clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL); - if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput); - clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL); - } else { - printf("CAUTION: model output is NULL, does it have no outputs?\n"); - } + copy_output(foutput); // ****** unset power constraint constraint.type = KGSL_CONSTRAINT_NONE; @@ -316,7 +359,7 @@ void Thneed::clinit() { assert(err == 0); //cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; - cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0}; + cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0}; command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err); assert(err == 0); @@ -453,6 +496,9 @@ CLQueuedKernel::CLQueuedKernel(Thneed *lthneed, char arg_name[0x100]; clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); arg_names.push_back(string(arg_name)); + clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL); + arg_types.push_back(string(arg_name)); + args.push_back(g_args[make_pair(kernel, i)]); args_size.push_back(g_args_size[make_pair(kernel, i)]); } @@ -473,11 +519,14 @@ cl_int CLQueuedKernel::exec() { if (kernel == NULL) { kernel = clCreateKernel(program, name.c_str(), NULL); arg_names.clear(); + arg_types.clear(); for (int j = 0; j < num_args; j++) { char arg_name[0x100]; clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); arg_names.push_back(string(arg_name)); + clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL); + arg_types.push_back(string(arg_name)); cl_int ret; if (args[j].size() != 0) { @@ -490,19 +539,6 @@ cl_int CLQueuedKernel::exec() { } } - // save the global inputs/outputs - if (thneed->record & THNEED_RECORD) { - for (int i = 0; i < num_args; i++) { - if (name == "zero_pad_image_float" && arg_names[i] == "input") { - thneed->inputs.push_back(*(cl_mem*)(args[i].data())); - } - - if (name == "image2d_to_buffer_float" && arg_names[i] == "output") { - thneed->output = *(cl_mem*)(args[i].data()); - } - } - } - if (thneed->record & THNEED_DEBUG) { debug_print(thneed->record & THNEED_VERBOSE_DEBUG); } @@ -524,10 +560,8 @@ void CLQueuedKernel::debug_print(bool verbose) { if (verbose) { for (int i = 0; i < num_args; i++) { - char arg_type[0x100]; - clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL); string arg = args[i]; - printf(" %s %s", arg_type, arg_names[i].c_str()); + printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str()); void *arg_value = (void*)arg.data(); int arg_size = arg.size(); if (arg_size == 0) { @@ -537,7 +571,7 @@ void CLQueuedKernel::debug_print(bool verbose) { } else if (arg_size == 2) { printf(" = %d", *((short*)arg_value)); } else if (arg_size == 4) { - if (strcmp(arg_type, "float") == 0) { + if (arg_types[i] == "float") { printf(" = %f", *((float*)arg_value)); } else { printf(" = %d", *((int*)arg_value)); @@ -546,7 +580,7 @@ void CLQueuedKernel::debug_print(bool verbose) { cl_mem val = (cl_mem)(*((uintptr_t*)arg_value)); printf(" = %p", val); if (val != NULL) { - if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) { + if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t") { cl_image_format format; size_t width, height, depth, array_size, row_pitch, slice_pitch; cl_mem buf; diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h index e1039efdf..c36aaff70 100644 --- a/selfdrive/modeld/thneed/thneed.h +++ b/selfdrive/modeld/thneed/thneed.h @@ -48,6 +48,7 @@ class CLQueuedKernel { string name; cl_uint num_args; vector arg_names; + vector arg_types; vector args; vector args_size; cl_kernel kernel = NULL; @@ -60,12 +61,26 @@ class CLQueuedKernel { Thneed *thneed; }; -class CachedCommand { +class CachedIoctl { + public: + virtual void exec() {} +}; + +class CachedSync: public CachedIoctl { + public: + CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; } + void exec(); + private: + Thneed *thneed; + string data; +}; + +class CachedCommand: public CachedIoctl { public: CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd); - void exec(bool wait); - void disassemble(int cmd_index); + void exec(); private: + void disassemble(int cmd_index); struct kgsl_gpu_command cache; unique_ptr cmds; unique_ptr objs; @@ -78,9 +93,11 @@ class Thneed { Thneed(bool do_clinit=false); void stop(); void execute(float **finputs, float *foutput, bool slow=false); + void wait(); int optimize(); - vector inputs; + vector inputs; + vector input_sizes; cl_mem output = NULL; cl_context context = NULL; @@ -92,11 +109,13 @@ class Thneed { int record; int timestamp; unique_ptr ram; - vector > cmds; - vector syncobjs; + vector > cmds; int fd; // all CL kernels + void find_inputs_outputs(); + void copy_inputs(float **finputs); + void copy_output(float *foutput); cl_int clexec(); vector > kq; @@ -105,9 +124,8 @@ class Thneed { // loading and saving void load(const char *filename); - void save(const char *filename); + void save(const char *filename, bool save_binaries=false); private: void clinit(); - json11::Json to_json(); };