Thneed load/save (#19700)
* start thneed load/save * compiling * fix loading * build thneed model in scons * don't hardcode /data/openpilot * release files * those too * support for loading/saving binary kernels * save binaries out of json band * make binary a command line flag to the compiler * need include assert * fix shadowed common in SConscript * cleanup run.h * hmm, the recurrent buffer wasn't 0ed * ugh, unique ptr * remove power constraint, refactor record * Revert "remove power constraint, refactor record" This reverts commit bb6fa52db6df59cd9d6420a6f630430e35af8a5e. * print on thneed stop * fingers crossed for this one * recorded * just curious * okay okay, pass tests? * cleanups * refactor wait Co-authored-by: Comma Device <device@comma.ai> Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com>albatross
parent
124100d0fa
commit
59fac9fdc6
|
@ -71,3 +71,6 @@ flycheck_*
|
|||
|
||||
cppcheck_report.txt
|
||||
comma.sh
|
||||
|
||||
selfdrive/modeld/thneed/compile
|
||||
models/*.thneed
|
||||
|
|
|
@ -407,10 +407,14 @@ selfdrive/modeld/transforms/transform.h
|
|||
selfdrive/modeld/transforms/transform.cl
|
||||
|
||||
selfdrive/modeld/thneed/thneed.*
|
||||
selfdrive/modeld/thneed/serialize.cc
|
||||
selfdrive/modeld/thneed/compile.cc
|
||||
selfdrive/modeld/thneed/include/*
|
||||
|
||||
selfdrive/modeld/runners/snpemodel.cc
|
||||
selfdrive/modeld/runners/snpemodel.h
|
||||
selfdrive/modeld/runners/thneedmodel.cc
|
||||
selfdrive/modeld/runners/thneedmodel.h
|
||||
selfdrive/modeld/runners/runmodel.h
|
||||
selfdrive/modeld/runners/run.h
|
||||
|
||||
|
|
|
@ -10,14 +10,17 @@ common_src = [
|
|||
"transforms/transform.cc"
|
||||
]
|
||||
|
||||
if arch == "aarch64":
|
||||
libs += ['gsl', 'CB', 'gnustl_shared']
|
||||
common_src += ["thneed/thneed.cc"]
|
||||
lenv['CFLAGS'].append("-DUSE_THNEED")
|
||||
lenv['CXXFLAGS'].append("-DUSE_THNEED")
|
||||
elif arch == "larch64":
|
||||
libs += ['gsl', 'CB', 'pthread', 'dl']
|
||||
common_src += ["thneed/thneed.cc"]
|
||||
thneed_src = [
|
||||
"thneed/thneed.cc",
|
||||
"thneed/serialize.cc",
|
||||
"runners/thneedmodel.cc",
|
||||
]
|
||||
|
||||
if arch == "aarch64" or arch == "larch64":
|
||||
libs += ['gsl', 'CB']
|
||||
libs += ['gnustl_shared'] if arch == "aarch64" else ['pthread', 'dl']
|
||||
|
||||
common_src += thneed_src
|
||||
lenv['CFLAGS'].append("-DUSE_THNEED")
|
||||
lenv['CXXFLAGS'].append("-DUSE_THNEED")
|
||||
else:
|
||||
|
@ -40,15 +43,23 @@ else:
|
|||
del libs[libs.index('symphony-cpu')]
|
||||
del common_src[common_src.index('runners/snpemodel.cc')]
|
||||
|
||||
common = lenv.Object(common_src)
|
||||
common_model = lenv.Object(common_src)
|
||||
|
||||
# build thneed model
|
||||
if arch == "aarch64" or arch == "larch64":
|
||||
compiler = lenv.Program('thneed/compile', ["thneed/compile.cc" ]+common_model, LIBS=libs)
|
||||
cmd = f"cd {Dir('.').get_abspath()} && {compiler[0].get_abspath()} ../../models/supercombo.dlc ../../models/supercombo.thneed --binary"
|
||||
snpe_path = "/data/pythonpath/phonelibs/snpe/"+arch
|
||||
cenv = Environment(ENV = {'LD_LIBRARY_PATH' : snpe_path+":"+lenv["ENV"]["LD_LIBRARY_PATH"]})
|
||||
cenv.Command("../../models/supercombo.thneed", ["../../models/supercombo.dlc", compiler], cmd)
|
||||
|
||||
lenv.Program('_dmonitoringmodeld', [
|
||||
"dmonitoringmodeld.cc",
|
||||
"models/dmonitoring.cc",
|
||||
]+common, LIBS=libs)
|
||||
]+common_model, LIBS=libs)
|
||||
|
||||
lenv.Program('_modeld', [
|
||||
"modeld.cc",
|
||||
"models/driving.cc",
|
||||
]+common, LIBS=libs)
|
||||
]+common_model, LIBS=libs)
|
||||
|
||||
|
|
|
@ -54,7 +54,13 @@ void model_init(ModelState* s, cl_device_id device_id, cl_context context) {
|
|||
|
||||
constexpr int output_size = OUTPUT_SIZE + TEMPORAL_SIZE;
|
||||
s->output = std::make_unique<float[]>(output_size);
|
||||
memset(&s->output[0], 0, output_size*sizeof(float));
|
||||
|
||||
#if defined(QCOM) || defined(QCOM2)
|
||||
s->m = std::make_unique<ThneedModel>("../../models/supercombo.thneed", &s->output[0], output_size, USE_GPU_RUNTIME);
|
||||
#else
|
||||
s->m = std::make_unique<DefaultRunModel>("../../models/supercombo.dlc", &s->output[0], output_size, USE_GPU_RUNTIME);
|
||||
#endif
|
||||
|
||||
#ifdef TEMPORAL
|
||||
s->m->addRecurrent(&s->output[OUTPUT_SIZE], TEMPORAL_SIZE);
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
#ifndef RUN_H
|
||||
#define RUN_H
|
||||
#pragma once
|
||||
|
||||
#include "runmodel.h"
|
||||
#include "snpemodel.h"
|
||||
|
||||
#ifdef QCOM
|
||||
#if defined(QCOM) || defined(QCOM2)
|
||||
#include "thneedmodel.h"
|
||||
#define DefaultRunModel SNPEModel
|
||||
#else
|
||||
#ifdef USE_ONNX_MODEL
|
||||
|
@ -14,5 +14,3 @@
|
|||
#define DefaultRunModel SNPEModel
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -31,13 +31,14 @@ public:
|
|||
void addTrafficConvention(float *state, int state_size);
|
||||
void addDesire(float *state, int state_size);
|
||||
void execute(float *net_input_buf, int buf_size);
|
||||
private:
|
||||
uint8_t *model_data = NULL;
|
||||
|
||||
#ifdef USE_THNEED
|
||||
Thneed *thneed = NULL;
|
||||
#endif
|
||||
|
||||
private:
|
||||
uint8_t *model_data = NULL;
|
||||
|
||||
#if defined(QCOM) || defined(QCOM2)
|
||||
zdl::DlSystem::Runtime_t Runtime;
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
#include "thneedmodel.h"
|
||||
#include <assert.h>
|
||||
|
||||
// Builds a runner around a serialized thneed model.
//
// A Thneed instance is created with OpenCL initialization requested and with
// recording switched off, the model stored at `path` is loaded, the loaded
// kernels are run once through clexec(), and the input/output buffers are then
// looked up.
//
// `loutput` is kept as the destination buffer for execute(); `loutput_size`
// and `runtime` are accepted but not used by this runner.
ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime)
    : thneed(new Thneed(true)), recorded(false), output(loutput) {
  thneed->record = 0;
  thneed->load(path);
  thneed->clexec();
  thneed->find_inputs_outputs();
}
|
||||
|
||||
void ThneedModel::addRecurrent(float *state, int state_size) {
|
||||
recurrent = state;
|
||||
}
|
||||
|
||||
void ThneedModel::addTrafficConvention(float *state, int state_size) {
|
||||
trafficConvention = state;
|
||||
}
|
||||
|
||||
void ThneedModel::addDesire(float *state, int state_size) {
|
||||
desire = state;
|
||||
}
|
||||
|
||||
void ThneedModel::execute(float *net_input_buf, int buf_size) {
|
||||
float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf};
|
||||
if (!recorded) {
|
||||
thneed->record = THNEED_RECORD;
|
||||
thneed->copy_inputs(inputs);
|
||||
thneed->clexec();
|
||||
thneed->copy_output(output);
|
||||
thneed->stop();
|
||||
|
||||
recorded = true;
|
||||
} else {
|
||||
thneed->execute(inputs, output);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
#pragma once
|
||||
|
||||
#include "runmodel.h"
|
||||
#include "thneed/thneed.h"
|
||||
|
||||
class ThneedModel : public RunModel {
|
||||
public:
|
||||
ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime);
|
||||
void addRecurrent(float *state, int state_size);
|
||||
void addTrafficConvention(float *state, int state_size);
|
||||
void addDesire(float *state, int state_size);
|
||||
void execute(float *net_input_buf, int buf_size);
|
||||
private:
|
||||
Thneed *thneed = NULL;
|
||||
bool recorded;
|
||||
|
||||
float *output;
|
||||
|
||||
// recurrent and desire
|
||||
float *recurrent;
|
||||
float *trafficConvention;
|
||||
float *desire;
|
||||
};
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
#include <string.h>
|
||||
#include "thneed.h"
|
||||
#include "../runners/snpemodel.h"
|
||||
|
||||
#define TEMPORAL_SIZE 512
|
||||
#define DESIRE_LEN 8
|
||||
#define TRAFFIC_CONVENTION_LEN 2
|
||||
|
||||
// TODO: This should probably use SNPE directly.
|
||||
// Compiles an SNPE model into a thneed file.
//
// usage: compile <input.dlc> <output.thneed> [--binary]
//
// The model is executed once on zeroed inputs and the resulting Thneed state
// is then written out with Thneed::save(). With --binary the save is asked to
// store program binaries. Returns 0 on success, 1 on bad usage or allocation
// failure.
int main(int argc, char* argv[]) {
  if (argc < 3) {
    fprintf(stderr, "usage: compile <input.dlc> <output.thneed> [--binary]\n");
    return 1;
  }

  #define OUTPUT_SIZE 0x10000
  // Both buffers are zero-filled and are intentionally not freed: `mdl` keeps
  // the output pointer until it is destroyed at the end of main.
  float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
  float *input = (float*)calloc(0x1000000, sizeof(float));
  if (output == NULL || input == NULL) {
    fprintf(stderr, "compile: out of memory\n");
    return 1;
  }

  SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME);

  float state[TEMPORAL_SIZE] = {0};
  float desire[DESIRE_LEN] = {0};
  float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0};

  mdl.addRecurrent(state, TEMPORAL_SIZE);
  mdl.addDesire(desire, DESIRE_LEN);
  mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);

  // first run
  printf("************** execute 1 **************\n");
  memset(output, 0, OUTPUT_SIZE * sizeof(float));
  mdl.execute(input, 0);

  // save model
  bool save_binaries = (argc > 3) && (strcmp(argv[3], "--binary") == 0);
  mdl.thneed->save(argv[2], save_binaries);
  return 0;
}
|
||||
|
|
@ -0,0 +1,290 @@
|
|||
#include <set>
|
||||
#include <assert.h>
|
||||
#include "thneed.h"
|
||||
#include "json11.hpp"
|
||||
using namespace json11;
|
||||
|
||||
extern map<cl_program, string> g_program_source;
|
||||
|
||||
void Thneed::load(const char *filename) {
|
||||
printf("Thneed::load: loading from %s\n", filename);
|
||||
|
||||
FILE *f = fopen(filename, "rb");
|
||||
fseek(f, 0L, SEEK_END);
|
||||
int sz = ftell(f);
|
||||
fseek(f, 0L, SEEK_SET);
|
||||
char *buf = (char*)malloc(sz);
|
||||
fread(buf, 1, sz, f);
|
||||
fclose(f);
|
||||
|
||||
int jsz = *(int *)buf;
|
||||
string jj(buf+4, jsz);
|
||||
string err;
|
||||
Json jdat = Json::parse(jj, err);
|
||||
|
||||
map<cl_mem, cl_mem> real_mem;
|
||||
real_mem[NULL] = NULL;
|
||||
|
||||
int ptr = 4+jsz;
|
||||
for (auto &obj : jdat["objects"].array_items()) {
|
||||
auto mobj = obj.object_items();
|
||||
int sz = mobj["size"].int_value();
|
||||
cl_mem clbuf = NULL;
|
||||
|
||||
if (mobj["buffer_id"].string_value().size() > 0) {
|
||||
// image buffer must already be allocated
|
||||
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
|
||||
assert(mobj["needs_load"].bool_value() == false);
|
||||
} else {
|
||||
if (mobj["needs_load"].bool_value()) {
|
||||
//printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
|
||||
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
|
||||
ptr += sz;
|
||||
} else {
|
||||
clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL);
|
||||
}
|
||||
}
|
||||
assert(clbuf != NULL);
|
||||
|
||||
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
|
||||
cl_image_desc desc = {0};
|
||||
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
desc.image_width = mobj["width"].int_value();
|
||||
desc.image_height = mobj["height"].int_value();
|
||||
desc.image_row_pitch = mobj["row_pitch"].int_value();
|
||||
desc.buffer = clbuf;
|
||||
|
||||
cl_image_format format;
|
||||
format.image_channel_order = CL_RGBA;
|
||||
format.image_channel_data_type = CL_HALF_FLOAT;
|
||||
|
||||
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, NULL);
|
||||
assert(clbuf != NULL);
|
||||
}
|
||||
|
||||
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
|
||||
}
|
||||
|
||||
map<string, cl_program> g_programs;
|
||||
for (auto &obj : jdat["programs"].object_items()) {
|
||||
const char *srcs[1];
|
||||
srcs[0] = (const char *)obj.second.string_value().c_str();
|
||||
size_t length = obj.second.string_value().size();
|
||||
|
||||
if (record & THNEED_DEBUG) printf("building %s with size %zu\n", obj.first.c_str(), length);
|
||||
|
||||
cl_program program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
|
||||
int err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
|
||||
if (err != 0) {
|
||||
printf("got err %d\n", err);
|
||||
size_t length;
|
||||
char buffer[2048];
|
||||
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
|
||||
buffer[length] = '\0';
|
||||
printf("%s\n", buffer);
|
||||
}
|
||||
assert(err == 0);
|
||||
|
||||
g_programs[obj.first] = program;
|
||||
}
|
||||
|
||||
for (auto &obj : jdat["binaries"].array_items()) {
|
||||
string name = obj["name"].string_value();
|
||||
size_t length = obj["length"].int_value();
|
||||
const unsigned char *srcs[1];
|
||||
srcs[0] = (const unsigned char *)&buf[ptr];
|
||||
ptr += length;
|
||||
|
||||
if (record & THNEED_DEBUG) printf("binary %s with size %zu\n", name.c_str(), length);
|
||||
|
||||
cl_int err;
|
||||
cl_program program = clCreateProgramWithBinary(context, 1, &device_id, &length, srcs, NULL, &err);
|
||||
assert(program != NULL && err == CL_SUCCESS);
|
||||
err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
|
||||
assert(err == CL_SUCCESS);
|
||||
|
||||
g_programs[name] = program;
|
||||
}
|
||||
|
||||
for (auto &obj : jdat["kernels"].array_items()) {
|
||||
auto gws = obj["global_work_size"];
|
||||
auto lws = obj["local_work_size"];
|
||||
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
|
||||
|
||||
kk->name = obj["name"].string_value();
|
||||
kk->program = g_programs[kk->name];
|
||||
kk->work_dim = obj["work_dim"].int_value();
|
||||
for (int i = 0; i < kk->work_dim; i++) {
|
||||
kk->global_work_size[i] = gws[i].int_value();
|
||||
kk->local_work_size[i] = lws[i].int_value();
|
||||
}
|
||||
kk->num_args = obj["num_args"].int_value();
|
||||
for (int i = 0; i < kk->num_args; i++) {
|
||||
string arg = obj["args"].array_items()[i].string_value();
|
||||
int arg_size = obj["args_size"].array_items()[i].int_value();
|
||||
kk->args_size.push_back(arg_size);
|
||||
if (arg_size == 8) {
|
||||
cl_mem val = *(cl_mem*)(arg.data());
|
||||
val = real_mem[val];
|
||||
kk->args.push_back(string((char*)&val, sizeof(val)));
|
||||
} else {
|
||||
kk->args.push_back(arg);
|
||||
}
|
||||
}
|
||||
kq.push_back(kk);
|
||||
}
|
||||
|
||||
free(buf);
|
||||
clFinish(command_queue);
|
||||
}
|
||||
|
||||
void Thneed::save(const char *filename, bool save_binaries) {
|
||||
printf("Thneed::save: saving to %s\n", filename);
|
||||
|
||||
// get kernels
|
||||
std::vector<Json> kernels;
|
||||
std::set<string> saved_objects;
|
||||
std::vector<Json> objects;
|
||||
std::map<string, string> programs;
|
||||
std::map<string, string> binaries;
|
||||
|
||||
for (auto &k : kq) {
|
||||
kernels.push_back(k->to_json());
|
||||
|
||||
// check args for objects
|
||||
int i = 0;
|
||||
for (auto &a : k->args) {
|
||||
if (a.size() == 8) {
|
||||
if (saved_objects.find(a) == saved_objects.end()) {
|
||||
saved_objects.insert(a);
|
||||
cl_mem val = *(cl_mem*)(a.data());
|
||||
if (val != NULL) {
|
||||
bool needs_load = k->arg_names[i] == "weights" || k->arg_names[i] == "biases";
|
||||
|
||||
auto jj = Json::object({
|
||||
{"id", a},
|
||||
{"arg_type", k->arg_types[i]},
|
||||
});
|
||||
|
||||
if (k->arg_types[i] == "image2d_t" || k->arg_types[i] == "image1d_t") {
|
||||
cl_mem buf;
|
||||
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
|
||||
string aa = string((char *)&buf, sizeof(buf));
|
||||
jj["buffer_id"] = aa;
|
||||
|
||||
size_t width, height, row_pitch;
|
||||
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
|
||||
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
|
||||
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
|
||||
jj["width"] = (int)width;
|
||||
jj["height"] = (int)height;
|
||||
jj["row_pitch"] = (int)row_pitch;
|
||||
jj["size"] = (int)(height * row_pitch);
|
||||
jj["needs_load"] = false;
|
||||
|
||||
if (saved_objects.find(aa) == saved_objects.end()) {
|
||||
saved_objects.insert(aa);
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
// save the buffer
|
||||
objects.push_back(Json::object({
|
||||
{"id", aa},
|
||||
{"arg_type", "<image buffer>"},
|
||||
{"needs_load", needs_load},
|
||||
{"size", (int)sz}
|
||||
}));
|
||||
if (needs_load) assert(sz == height * row_pitch);
|
||||
}
|
||||
} else {
|
||||
size_t sz = 0;
|
||||
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
jj["size"] = (int)sz;
|
||||
jj["needs_load"] = needs_load;
|
||||
}
|
||||
|
||||
objects.push_back(jj);
|
||||
}
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (save_binaries) {
|
||||
int err;
|
||||
size_t binary_size = 0;
|
||||
err = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
|
||||
assert(err == 0);
|
||||
assert(binary_size > 0);
|
||||
string sv(binary_size, '\x00');
|
||||
|
||||
uint8_t* bufs[1] = { (uint8_t*)sv.data(), };
|
||||
err = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL);
|
||||
assert(err == 0);
|
||||
|
||||
binaries[k->name] = sv;
|
||||
} else {
|
||||
programs[k->name] = g_program_source[k->program];
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> saved_buffers;
|
||||
for (auto &obj : objects) {
|
||||
auto mobj = obj.object_items();
|
||||
cl_mem val = *(cl_mem*)(mobj["id"].string_value().data());
|
||||
int sz = mobj["size"].int_value();
|
||||
if (mobj["needs_load"].bool_value()) {
|
||||
char *buf = (char *)malloc(sz);
|
||||
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
|
||||
assert(false);
|
||||
} else {
|
||||
// buffers alloced with CL_MEM_HOST_WRITE_ONLY, hence this hack
|
||||
//hexdump((uint32_t*)val, 0x100);
|
||||
|
||||
// the worst hack in thneed, the flags are at 0x14
|
||||
((uint32_t*)val)[0x14] &= ~CL_MEM_HOST_WRITE_ONLY;
|
||||
cl_int ret = clEnqueueReadBuffer(command_queue, val, CL_TRUE, 0, sz, buf, 0, NULL, NULL);
|
||||
assert(ret == CL_SUCCESS);
|
||||
}
|
||||
//printf("saving buffer: %d %p %s\n", sz, buf, mobj["arg_type"].string_value().c_str());
|
||||
saved_buffers.push_back(string(buf, sz));
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Json> jbinaries;
|
||||
for (auto &obj : binaries) {
|
||||
jbinaries.push_back(Json::object({{"name", obj.first}, {"length", (int)obj.second.size()}}));
|
||||
saved_buffers.push_back(obj.second);
|
||||
}
|
||||
|
||||
Json jdat = Json::object({
|
||||
{"kernels", kernels},
|
||||
{"objects", objects},
|
||||
{"programs", programs},
|
||||
{"binaries", jbinaries},
|
||||
});
|
||||
|
||||
string str = jdat.dump();
|
||||
int jsz = str.length();
|
||||
|
||||
FILE *f = fopen(filename, "wb");
|
||||
fwrite(&jsz, 1, sizeof(jsz), f);
|
||||
fwrite(str.data(), 1, jsz, f);
|
||||
for (auto &s : saved_buffers) {
|
||||
fwrite(s.data(), 1, s.length(), f);
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
// Describes this kernel invocation as a JSON object with the keys "name",
// "work_dim", "global_work_size", "local_work_size", "num_args", "args" and
// "args_size". Both work-size arrays always carry three entries.
Json CLQueuedKernel::to_json() const {
  Json::array global_sizes;
  Json::array local_sizes;
  for (int dim = 0; dim < 3; dim++) {
    global_sizes.push_back((int)global_work_size[dim]);
    local_sizes.push_back((int)local_work_size[dim]);
  }

  Json::object out;
  out["name"] = name;
  out["work_dim"] = (int)work_dim;
  out["global_work_size"] = global_sizes;
  out["local_work_size"] = local_sizes;
  out["num_args"] = (int)num_args;
  out["args"] = args;
  out["args_size"] = args_size;
  return Json(out);
}
|
||||
|
|
@ -7,8 +7,6 @@
|
|||
#include <errno.h>
|
||||
#include "thneed.h"
|
||||
|
||||
//#define SAVE_KERNELS
|
||||
|
||||
//#define RUN_DISASSEMBLER
|
||||
//#define RUN_OPTIMIZER
|
||||
|
||||
|
@ -83,7 +81,8 @@ int ioctl(int filedes, unsigned long request, void *argp) {
|
|||
}
|
||||
|
||||
if (thneed->record & THNEED_RECORD) {
|
||||
thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count));
|
||||
thneed->cmds.push_back(unique_ptr<CachedSync>(new
|
||||
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
|
||||
}
|
||||
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
|
||||
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
|
||||
|
@ -103,6 +102,14 @@ int ioctl(int filedes, unsigned long request, void *argp) {
|
|||
}
|
||||
}
|
||||
}
|
||||
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
|
||||
// this happens
|
||||
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
|
||||
// this happens
|
||||
} else {
|
||||
if (thneed->record & THNEED_DEBUG) {
|
||||
printf("other ioctl %lx\n", request);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -133,13 +140,27 @@ GPUMalloc::~GPUMalloc() {
|
|||
}
|
||||
|
||||
void *GPUMalloc::alloc(int size) {
|
||||
if (size > remaining) return NULL;
|
||||
remaining -= size;
|
||||
void *ret = (void*)base;
|
||||
base += (size+0xff) & (~0xFF);
|
||||
size = (size+0xff) & (~0xFF);
|
||||
assert(size <= remaining);
|
||||
remaining -= size;
|
||||
base += size;
|
||||
return ret;
|
||||
}
|
||||
|
||||
// *********** CachedSync, at the ioctl layer ***********
|
||||
|
||||
void CachedSync::exec() {
|
||||
struct kgsl_gpuobj_sync cmd;
|
||||
|
||||
cmd.objs = (uint64_t)data.data();
|
||||
cmd.obj_len = data.length();
|
||||
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
|
||||
|
||||
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
|
||||
assert(ret == 0);
|
||||
}
|
||||
|
||||
// *********** CachedCommand, at the ioctl layer ***********
|
||||
|
||||
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
|
||||
|
@ -174,24 +195,11 @@ CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
|
|||
thneed->ckq.clear();
|
||||
}
|
||||
|
||||
void CachedCommand::exec(bool wait) {
|
||||
void CachedCommand::exec() {
|
||||
cache.timestamp = ++thneed->timestamp;
|
||||
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
|
||||
|
||||
if (wait) {
|
||||
struct kgsl_device_waittimestamp_ctxtid wait;
|
||||
wait.context_id = cache.context_id;
|
||||
wait.timestamp = cache.timestamp;
|
||||
wait.timeout = -1;
|
||||
|
||||
uint64_t tb = nanos_since_boot();
|
||||
int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
|
||||
uint64_t te = nanos_since_boot();
|
||||
|
||||
if (thneed->record & THNEED_DEBUG) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
|
||||
} else {
|
||||
if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret);
|
||||
}
|
||||
if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret);
|
||||
|
||||
if (thneed->record & THNEED_VERBOSE_DEBUG) {
|
||||
for (auto &it : kq) {
|
||||
|
@ -213,32 +221,85 @@ Thneed::Thneed(bool do_clinit) {
|
|||
if (do_clinit) clinit();
|
||||
assert(g_fd != -1);
|
||||
fd = g_fd;
|
||||
ram = make_unique<GPUMalloc>(0x40000, fd);
|
||||
ram = make_unique<GPUMalloc>(0x80000, fd);
|
||||
record = THNEED_RECORD;
|
||||
timestamp = -1;
|
||||
g_thneed = this;
|
||||
}
|
||||
|
||||
void Thneed::stop() {
|
||||
find_inputs_outputs();
|
||||
printf("Thneed::stop: recorded %lu commands\n", cmds.size());
|
||||
record = 0;
|
||||
}
|
||||
|
||||
void Thneed::find_inputs_outputs() {
|
||||
cl_int err;
|
||||
if (inputs.size() > 0) return;
|
||||
|
||||
// save the global inputs/outputs
|
||||
for (auto &k : kq) {
|
||||
for (int i = 0; i < k->num_args; i++) {
|
||||
if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") {
|
||||
cl_mem aa = *(cl_mem*)(k->args[i].data());
|
||||
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
input_sizes.push_back(sz);
|
||||
|
||||
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &err);
|
||||
assert(err == CL_SUCCESS);
|
||||
inputs.push_back(ret);
|
||||
}
|
||||
|
||||
if (k->name == "image2d_to_buffer_float" && k->arg_names[i] == "output") {
|
||||
output = *(cl_mem*)(k->args[i].data());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Thneed::copy_inputs(float **finputs) {
|
||||
//cl_int ret;
|
||||
for (int idx = 0; idx < inputs.size(); ++idx) {
|
||||
if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", input_sizes[idx], finputs[idx], inputs[idx]);
|
||||
memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
void Thneed::copy_output(float *foutput) {
|
||||
if (output != NULL) {
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
|
||||
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
|
||||
} else {
|
||||
printf("CAUTION: model output is NULL, does it have no outputs?\n");
|
||||
}
|
||||
}
|
||||
|
||||
void Thneed::wait() {
|
||||
struct kgsl_device_waittimestamp_ctxtid wait;
|
||||
wait.context_id = context_id;
|
||||
wait.timestamp = timestamp;
|
||||
wait.timeout = -1;
|
||||
|
||||
uint64_t tb = nanos_since_boot();
|
||||
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
|
||||
uint64_t te = nanos_since_boot();
|
||||
|
||||
if (record & THNEED_DEBUG) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
|
||||
}
|
||||
|
||||
void Thneed::execute(float **finputs, float *foutput, bool slow) {
|
||||
int ret;
|
||||
uint64_t tb, te;
|
||||
if (record & THNEED_DEBUG) tb = nanos_since_boot();
|
||||
|
||||
// ****** copy inputs
|
||||
for (int idx = 0; idx < inputs.size(); ++idx) {
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
|
||||
if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
|
||||
// TODO: This shouldn't have to block
|
||||
clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
|
||||
}
|
||||
copy_inputs(finputs);
|
||||
|
||||
// ****** set power constraint
|
||||
int ret;
|
||||
struct kgsl_device_constraint_pwrlevel pwrlevel;
|
||||
pwrlevel.level = KGSL_CONSTRAINT_PWR_MAX;
|
||||
|
||||
|
@ -260,30 +321,12 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
|
|||
for (auto &it : cmds) {
|
||||
++i;
|
||||
if (record & THNEED_DEBUG) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
|
||||
it->exec((i == cmds.size()) || slow);
|
||||
}
|
||||
|
||||
// ****** sync objects
|
||||
for (auto &it : syncobjs) {
|
||||
struct kgsl_gpuobj_sync cmd;
|
||||
|
||||
cmd.objs = (uint64_t)it.data();
|
||||
cmd.obj_len = it.length();
|
||||
cmd.count = it.length() / sizeof(struct kgsl_gpuobj_sync_obj);
|
||||
|
||||
ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
|
||||
assert(ret == 0);
|
||||
it->exec();
|
||||
if ((i == cmds.size()) || slow) wait();
|
||||
}
|
||||
|
||||
// ****** copy outputs
|
||||
if (output != NULL) {
|
||||
size_t sz;
|
||||
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
|
||||
if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
|
||||
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
|
||||
} else {
|
||||
printf("CAUTION: model output is NULL, does it have no outputs?\n");
|
||||
}
|
||||
copy_output(foutput);
|
||||
|
||||
// ****** unset power constraint
|
||||
constraint.type = KGSL_CONSTRAINT_NONE;
|
||||
|
@ -316,7 +359,7 @@ void Thneed::clinit() {
|
|||
assert(err == 0);
|
||||
|
||||
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
|
||||
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
|
||||
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
|
||||
command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
|
||||
assert(err == 0);
|
||||
|
||||
|
@ -453,6 +496,9 @@ CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
|
|||
char arg_name[0x100];
|
||||
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
|
||||
arg_names.push_back(string(arg_name));
|
||||
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
|
||||
arg_types.push_back(string(arg_name));
|
||||
|
||||
args.push_back(g_args[make_pair(kernel, i)]);
|
||||
args_size.push_back(g_args_size[make_pair(kernel, i)]);
|
||||
}
|
||||
|
@ -473,11 +519,14 @@ cl_int CLQueuedKernel::exec() {
|
|||
if (kernel == NULL) {
|
||||
kernel = clCreateKernel(program, name.c_str(), NULL);
|
||||
arg_names.clear();
|
||||
arg_types.clear();
|
||||
|
||||
for (int j = 0; j < num_args; j++) {
|
||||
char arg_name[0x100];
|
||||
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
|
||||
arg_names.push_back(string(arg_name));
|
||||
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
|
||||
arg_types.push_back(string(arg_name));
|
||||
|
||||
cl_int ret;
|
||||
if (args[j].size() != 0) {
|
||||
|
@ -490,19 +539,6 @@ cl_int CLQueuedKernel::exec() {
|
|||
}
|
||||
}
|
||||
|
||||
// save the global inputs/outputs
|
||||
if (thneed->record & THNEED_RECORD) {
|
||||
for (int i = 0; i < num_args; i++) {
|
||||
if (name == "zero_pad_image_float" && arg_names[i] == "input") {
|
||||
thneed->inputs.push_back(*(cl_mem*)(args[i].data()));
|
||||
}
|
||||
|
||||
if (name == "image2d_to_buffer_float" && arg_names[i] == "output") {
|
||||
thneed->output = *(cl_mem*)(args[i].data());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (thneed->record & THNEED_DEBUG) {
|
||||
debug_print(thneed->record & THNEED_VERBOSE_DEBUG);
|
||||
}
|
||||
|
@ -524,10 +560,8 @@ void CLQueuedKernel::debug_print(bool verbose) {
|
|||
|
||||
if (verbose) {
|
||||
for (int i = 0; i < num_args; i++) {
|
||||
char arg_type[0x100];
|
||||
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
|
||||
string arg = args[i];
|
||||
printf(" %s %s", arg_type, arg_names[i].c_str());
|
||||
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
|
||||
void *arg_value = (void*)arg.data();
|
||||
int arg_size = arg.size();
|
||||
if (arg_size == 0) {
|
||||
|
@ -537,7 +571,7 @@ void CLQueuedKernel::debug_print(bool verbose) {
|
|||
} else if (arg_size == 2) {
|
||||
printf(" = %d", *((short*)arg_value));
|
||||
} else if (arg_size == 4) {
|
||||
if (strcmp(arg_type, "float") == 0) {
|
||||
if (arg_types[i] == "float") {
|
||||
printf(" = %f", *((float*)arg_value));
|
||||
} else {
|
||||
printf(" = %d", *((int*)arg_value));
|
||||
|
@ -546,7 +580,7 @@ void CLQueuedKernel::debug_print(bool verbose) {
|
|||
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
|
||||
printf(" = %p", val);
|
||||
if (val != NULL) {
|
||||
if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) {
|
||||
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t") {
|
||||
cl_image_format format;
|
||||
size_t width, height, depth, array_size, row_pitch, slice_pitch;
|
||||
cl_mem buf;
|
||||
|
|
|
@ -48,6 +48,7 @@ class CLQueuedKernel {
|
|||
string name;
|
||||
cl_uint num_args;
|
||||
vector<string> arg_names;
|
||||
vector<string> arg_types;
|
||||
vector<string> args;
|
||||
vector<int> args_size;
|
||||
cl_kernel kernel = NULL;
|
||||
|
@ -60,12 +61,26 @@ class CLQueuedKernel {
|
|||
Thneed *thneed;
|
||||
};
|
||||
|
||||
class CachedCommand {
|
||||
// Base class for a recorded ioctl that can be replayed with exec().
class CachedIoctl {
  public:
    // Instances are owned and destroyed through pointers to this base, so the
    // destructor must be virtual for the derived destructors to run.
    virtual ~CachedIoctl() = default;

    // Replays the ioctl; the base implementation does nothing.
    virtual void exec() {}
};
|
||||
|
||||
class CachedSync: public CachedIoctl {
|
||||
public:
|
||||
CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
|
||||
void exec();
|
||||
private:
|
||||
Thneed *thneed;
|
||||
string data;
|
||||
};
|
||||
|
||||
class CachedCommand: public CachedIoctl {
|
||||
public:
|
||||
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
|
||||
void exec(bool wait);
|
||||
void disassemble(int cmd_index);
|
||||
void exec();
|
||||
private:
|
||||
void disassemble(int cmd_index);
|
||||
struct kgsl_gpu_command cache;
|
||||
unique_ptr<kgsl_command_object[]> cmds;
|
||||
unique_ptr<kgsl_command_object[]> objs;
|
||||
|
@ -78,9 +93,11 @@ class Thneed {
|
|||
Thneed(bool do_clinit=false);
|
||||
void stop();
|
||||
void execute(float **finputs, float *foutput, bool slow=false);
|
||||
void wait();
|
||||
int optimize();
|
||||
|
||||
vector<cl_mem> inputs;
|
||||
vector<void *> inputs;
|
||||
vector<size_t> input_sizes;
|
||||
cl_mem output = NULL;
|
||||
|
||||
cl_context context = NULL;
|
||||
|
@ -92,11 +109,13 @@ class Thneed {
|
|||
int record;
|
||||
int timestamp;
|
||||
unique_ptr<GPUMalloc> ram;
|
||||
vector<unique_ptr<CachedCommand> > cmds;
|
||||
vector<string> syncobjs;
|
||||
vector<unique_ptr<CachedIoctl> > cmds;
|
||||
int fd;
|
||||
|
||||
// all CL kernels
|
||||
void find_inputs_outputs();
|
||||
void copy_inputs(float **finputs);
|
||||
void copy_output(float *foutput);
|
||||
cl_int clexec();
|
||||
vector<shared_ptr<CLQueuedKernel> > kq;
|
||||
|
||||
|
@ -105,9 +124,8 @@ class Thneed {
|
|||
|
||||
// loading and saving
|
||||
void load(const char *filename);
|
||||
void save(const char *filename);
|
||||
void save(const char *filename, bool save_binaries=false);
|
||||
private:
|
||||
void clinit();
|
||||
json11::Json to_json();
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue