Thneed load/save (#19700)

* start thneed load/save

* compiling

* fix loading

* build thneed model in scons

* don't hardcode /data/openpilot

* release files

* those too

* support for loading/saving binary kernels

* save binaries out of json band

* make binary a command line flag to the compiler

* need include assert

* fix shadowed common in SConscript

* cleanup run.h

* hmm, the recurrent buffer wasn't 0ed

* ugh, unique ptr

* remove power constraint, refactor record

* Revert "remove power constraint, refactor record"

This reverts commit bb6fa52db6df59cd9d6420a6f630430e35af8a5e.

* print on thneed stop

* fingers crossed for this one

* recorded

* just curious

* okay okay, pass tests?

* cleanups

* refactor wait

Co-authored-by: Comma Device <device@comma.ai>
Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com>
albatross
George Hotz 2021-01-19 18:08:31 -08:00 committed by GitHub
parent 124100d0fa
commit 59fac9fdc6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 561 additions and 97 deletions

3
.gitignore vendored
View File

@ -71,3 +71,6 @@ flycheck_*
cppcheck_report.txt
comma.sh
selfdrive/modeld/thneed/compile
models/*.thneed

View File

@ -407,10 +407,14 @@ selfdrive/modeld/transforms/transform.h
selfdrive/modeld/transforms/transform.cl
selfdrive/modeld/thneed/thneed.*
selfdrive/modeld/thneed/serialize.cc
selfdrive/modeld/thneed/compile.cc
selfdrive/modeld/thneed/include/*
selfdrive/modeld/runners/snpemodel.cc
selfdrive/modeld/runners/snpemodel.h
selfdrive/modeld/runners/thneedmodel.cc
selfdrive/modeld/runners/thneedmodel.h
selfdrive/modeld/runners/runmodel.h
selfdrive/modeld/runners/run.h

View File

@ -10,14 +10,17 @@ common_src = [
"transforms/transform.cc"
]
if arch == "aarch64":
libs += ['gsl', 'CB', 'gnustl_shared']
common_src += ["thneed/thneed.cc"]
lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED")
elif arch == "larch64":
libs += ['gsl', 'CB', 'pthread', 'dl']
common_src += ["thneed/thneed.cc"]
thneed_src = [
"thneed/thneed.cc",
"thneed/serialize.cc",
"runners/thneedmodel.cc",
]
if arch == "aarch64" or arch == "larch64":
libs += ['gsl', 'CB']
libs += ['gnustl_shared'] if arch == "aarch64" else ['pthread', 'dl']
common_src += thneed_src
lenv['CFLAGS'].append("-DUSE_THNEED")
lenv['CXXFLAGS'].append("-DUSE_THNEED")
else:
@ -40,15 +43,23 @@ else:
del libs[libs.index('symphony-cpu')]
del common_src[common_src.index('runners/snpemodel.cc')]
common = lenv.Object(common_src)
common_model = lenv.Object(common_src)
# build thneed model
if arch == "aarch64" or arch == "larch64":
compiler = lenv.Program('thneed/compile', ["thneed/compile.cc" ]+common_model, LIBS=libs)
cmd = f"cd {Dir('.').get_abspath()} && {compiler[0].get_abspath()} ../../models/supercombo.dlc ../../models/supercombo.thneed --binary"
snpe_path = "/data/pythonpath/phonelibs/snpe/"+arch
cenv = Environment(ENV = {'LD_LIBRARY_PATH' : snpe_path+":"+lenv["ENV"]["LD_LIBRARY_PATH"]})
cenv.Command("../../models/supercombo.thneed", ["../../models/supercombo.dlc", compiler], cmd)
lenv.Program('_dmonitoringmodeld', [
"dmonitoringmodeld.cc",
"models/dmonitoring.cc",
]+common, LIBS=libs)
]+common_model, LIBS=libs)
lenv.Program('_modeld', [
"modeld.cc",
"models/driving.cc",
]+common, LIBS=libs)
]+common_model, LIBS=libs)

View File

@ -54,7 +54,13 @@ void model_init(ModelState* s, cl_device_id device_id, cl_context context) {
constexpr int output_size = OUTPUT_SIZE + TEMPORAL_SIZE;
s->output = std::make_unique<float[]>(output_size);
memset(&s->output[0], 0, output_size*sizeof(float));
#if defined(QCOM) || defined(QCOM2)
s->m = std::make_unique<ThneedModel>("../../models/supercombo.thneed", &s->output[0], output_size, USE_GPU_RUNTIME);
#else
s->m = std::make_unique<DefaultRunModel>("../../models/supercombo.dlc", &s->output[0], output_size, USE_GPU_RUNTIME);
#endif
#ifdef TEMPORAL
s->m->addRecurrent(&s->output[OUTPUT_SIZE], TEMPORAL_SIZE);

View File

@ -1,10 +1,10 @@
#ifndef RUN_H
#define RUN_H
#pragma once
#include "runmodel.h"
#include "snpemodel.h"
#ifdef QCOM
#if defined(QCOM) || defined(QCOM2)
#include "thneedmodel.h"
#define DefaultRunModel SNPEModel
#else
#ifdef USE_ONNX_MODEL
@ -14,5 +14,3 @@
#define DefaultRunModel SNPEModel
#endif
#endif
#endif

View File

@ -31,13 +31,14 @@ public:
void addTrafficConvention(float *state, int state_size);
void addDesire(float *state, int state_size);
void execute(float *net_input_buf, int buf_size);
private:
uint8_t *model_data = NULL;
#ifdef USE_THNEED
Thneed *thneed = NULL;
#endif
private:
uint8_t *model_data = NULL;
#if defined(QCOM) || defined(QCOM2)
zdl::DlSystem::Runtime_t Runtime;
#endif

View File

@ -0,0 +1,41 @@
#include "thneedmodel.h"
#include <assert.h>
// Creates the underlying Thneed instance, loads the model stored at `path`
// into it and remembers `loutput` as the buffer that results are copied to.
// `loutput_size` and `runtime` are accepted but not used by this constructor.
ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime) {
// true => the Thneed constructor also performs its own CL initialization
thneed = new Thneed(true);
// clear all record flags before the model is loaded
thneed->record = 0;
thneed->load(path);
thneed->clexec();
thneed->find_inputs_outputs();
// the first execute() call will take the recording path
recorded = false;
output = loutput;
}
// Remembers the pointer to the recurrent-state buffer, which execute() passes
// to Thneed as the first input. `state_size` is not used here.
void ThneedModel::addRecurrent(float *state, int state_size) {
recurrent = state;
}
// Remembers the pointer to the traffic-convention buffer, which execute()
// passes to Thneed as the second input. `state_size` is not used here.
void ThneedModel::addTrafficConvention(float *state, int state_size) {
trafficConvention = state;
}
// Remembers the pointer to the desire buffer, which execute() passes to
// Thneed as the third input. `state_size` is not used here.
void ThneedModel::addDesire(float *state, int state_size) {
desire = state;
}
// Runs the model once on `net_input_buf` plus the previously registered
// recurrent / traffic-convention / desire buffers. `buf_size` is not used.
void ThneedModel::execute(float *net_input_buf, int buf_size) {
  // order of the inputs handed to Thneed
  float *inputs[4] = {recurrent, trafficConvention, desire, net_input_buf};

  // fast path: the command stream has already been recorded
  if (recorded) {
    thneed->execute(inputs, output);
    return;
  }

  // first call: run through the CL kernels with recording enabled,
  // then stop recording so later calls take the fast path above
  thneed->record = THNEED_RECORD;
  thneed->copy_inputs(inputs);
  thneed->clexec();
  thneed->copy_output(output);
  thneed->stop();

  recorded = true;
}

View File

@ -0,0 +1,24 @@
#pragma once
#include "runmodel.h"
#include "thneed/thneed.h"
// RunModel implementation backed by a Thneed instance.
class ThneedModel : public RunModel {
public:
  // path: model file handed to Thneed::load
  // loutput: buffer that receives the model output on every execute()
  ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime);

  // register the extra input buffers; only the pointers are kept
  void addRecurrent(float *state, int state_size);
  void addTrafficConvention(float *state, int state_size);
  void addDesire(float *state, int state_size);

  // run the model on net_input_buf together with the registered buffers
  void execute(float *net_input_buf, int buf_size);
private:
  Thneed *thneed = NULL;
  // false until the first execute() has recorded the command stream
  bool recorded = false;
  float *output = NULL;

  // recurrent and desire
  // All default to NULL so an input that was never registered is a
  // well-defined null pointer rather than an indeterminate value.
  float *recurrent = NULL;
  float *trafficConvention = NULL;
  float *desire = NULL;
};

View File

@ -0,0 +1,34 @@
#include <string.h>
#include "thneed.h"
#include "../runners/snpemodel.h"
#define TEMPORAL_SIZE 512
#define DESIRE_LEN 8
#define TRAFFIC_CONVENTION_LEN 2
// TODO: This should probably use SNPE directly.
// Command line tool: runs the SNPE model given as argv[1] once on zeroed
// inputs, then saves the resulting Thneed state to argv[2].
// An optional third argument "--binary" asks Thneed::save to store program
// binaries. Returns 0 on success and 1 on bad usage or allocation failure.
int main(int argc, char* argv[]) {
  // both the input and the output path are required
  if (argc < 3) {
    printf("usage: %s <input model> <output thneed> [--binary]\n", argv[0]);
    return 1;
  }

  #define OUTPUT_SIZE 0x10000
  float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
  float *input = (float*)calloc(0x1000000, sizeof(float));
  if (output == NULL || input == NULL) {
    printf("failed to allocate model buffers\n");
    return 1;
  }

  SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME);

  // all auxiliary inputs are zero for this run
  float state[TEMPORAL_SIZE] = {0};
  float desire[DESIRE_LEN] = {0};
  float traffic_convention[TRAFFIC_CONVENTION_LEN] = {0};

  mdl.addRecurrent(state, TEMPORAL_SIZE);
  mdl.addDesire(desire, DESIRE_LEN);
  mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN);

  // first run
  printf("************** execute 1 **************\n");
  memset(output, 0, OUTPUT_SIZE * sizeof(float));
  mdl.execute(input, 0);

  // save model
  bool save_binaries = (argc > 3) && (strcmp(argv[3], "--binary") == 0);
  mdl.thneed->save(argv[2], save_binaries);

  // input/output are intentionally not freed: mdl still holds `output`
  // and the process exits right after this point.
  return 0;
}

View File

@ -0,0 +1,290 @@
#include <set>
#include <assert.h>
#include "thneed.h"
#include "json11.hpp"
using namespace json11;
extern map<cl_program, string> g_program_source;
void Thneed::load(const char *filename) {
printf("Thneed::load: loading from %s\n", filename);
FILE *f = fopen(filename, "rb");
fseek(f, 0L, SEEK_END);
int sz = ftell(f);
fseek(f, 0L, SEEK_SET);
char *buf = (char*)malloc(sz);
fread(buf, 1, sz, f);
fclose(f);
int jsz = *(int *)buf;
string jj(buf+4, jsz);
string err;
Json jdat = Json::parse(jj, err);
map<cl_mem, cl_mem> real_mem;
real_mem[NULL] = NULL;
int ptr = 4+jsz;
for (auto &obj : jdat["objects"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem clbuf = NULL;
if (mobj["buffer_id"].string_value().size() > 0) {
// image buffer must already be allocated
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
//printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
ptr += sz;
} else {
clbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sz, NULL, NULL);
}
}
assert(clbuf != NULL);
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
cl_image_desc desc = {0};
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
desc.image_width = mobj["width"].int_value();
desc.image_height = mobj["height"].int_value();
desc.image_row_pitch = mobj["row_pitch"].int_value();
desc.buffer = clbuf;
cl_image_format format;
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_HALF_FLOAT;
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, NULL);
assert(clbuf != NULL);
}
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
}
map<string, cl_program> g_programs;
for (auto &obj : jdat["programs"].object_items()) {
const char *srcs[1];
srcs[0] = (const char *)obj.second.string_value().c_str();
size_t length = obj.second.string_value().size();
if (record & THNEED_DEBUG) printf("building %s with size %zu\n", obj.first.c_str(), length);
cl_program program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
int err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
if (err != 0) {
printf("got err %d\n", err);
size_t length;
char buffer[2048];
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
buffer[length] = '\0';
printf("%s\n", buffer);
}
assert(err == 0);
g_programs[obj.first] = program;
}
for (auto &obj : jdat["binaries"].array_items()) {
string name = obj["name"].string_value();
size_t length = obj["length"].int_value();
const unsigned char *srcs[1];
srcs[0] = (const unsigned char *)&buf[ptr];
ptr += length;
if (record & THNEED_DEBUG) printf("binary %s with size %zu\n", name.c_str(), length);
cl_int err;
cl_program program = clCreateProgramWithBinary(context, 1, &device_id, &length, srcs, NULL, &err);
assert(program != NULL && err == CL_SUCCESS);
err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
assert(err == CL_SUCCESS);
g_programs[name] = program;
}
for (auto &obj : jdat["kernels"].array_items()) {
auto gws = obj["global_work_size"];
auto lws = obj["local_work_size"];
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
kk->name = obj["name"].string_value();
kk->program = g_programs[kk->name];
kk->work_dim = obj["work_dim"].int_value();
for (int i = 0; i < kk->work_dim; i++) {
kk->global_work_size[i] = gws[i].int_value();
kk->local_work_size[i] = lws[i].int_value();
}
kk->num_args = obj["num_args"].int_value();
for (int i = 0; i < kk->num_args; i++) {
string arg = obj["args"].array_items()[i].string_value();
int arg_size = obj["args_size"].array_items()[i].int_value();
kk->args_size.push_back(arg_size);
if (arg_size == 8) {
cl_mem val = *(cl_mem*)(arg.data());
val = real_mem[val];
kk->args.push_back(string((char*)&val, sizeof(val)));
} else {
kk->args.push_back(arg);
}
}
kq.push_back(kk);
}
free(buf);
clFinish(command_queue);
}
// Writes the recorded kernel queue to `filename`.
//
// File layout as written here:
//   [int jsz][jsz bytes of JSON][raw blobs]
// The JSON holds "kernels", "objects", "programs" (source, keyed by kernel
// name) and "binaries" (name + length). The blobs are the contents of every
// object flagged "needs_load", followed by the program binaries.
//
// save_binaries: when true the program binary of each kernel is stored,
// otherwise its source from g_program_source.
// Any CL or file error trips an assert.
void Thneed::save(const char *filename, bool save_binaries) {
  printf("Thneed::save: saving to %s\n", filename);

  // get kernels
  std::vector<Json> kernels;
  std::set<string> saved_objects;
  std::vector<Json> objects;
  std::map<string, string> programs;
  std::map<string, string> binaries;

  for (auto &k : kq) {
    kernels.push_back(k->to_json());

    // check args for objects
    int i = 0;
    for (auto &a : k->args) {
      if (a.size() == 8) {
        if (saved_objects.find(a) == saved_objects.end()) {
          saved_objects.insert(a);
          cl_mem val = *(cl_mem*)(a.data());
          if (val != NULL) {
            // only arguments with these names carry data worth storing
            bool needs_load = k->arg_names[i] == "weights" || k->arg_names[i] == "biases";

            auto jj = Json::object({
              {"id", a},
              {"arg_type", k->arg_types[i]},
            });

            if (k->arg_types[i] == "image2d_t" || k->arg_types[i] == "image1d_t") {
              // images are stored as geometry plus a reference to their backing buffer
              cl_mem buf;
              clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
              string aa = string((char *)&buf, sizeof(buf));
              jj["buffer_id"] = aa;

              size_t width, height, row_pitch;
              clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
              clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
              clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
              jj["width"] = (int)width;
              jj["height"] = (int)height;
              jj["row_pitch"] = (int)row_pitch;
              jj["size"] = (int)(height * row_pitch);
              jj["needs_load"] = false;

              if (saved_objects.find(aa) == saved_objects.end()) {
                saved_objects.insert(aa);
                size_t sz;
                clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
                // save the buffer
                objects.push_back(Json::object({
                  {"id", aa},
                  {"arg_type", "<image buffer>"},
                  {"needs_load", needs_load},
                  {"size", (int)sz}
                }));
                if (needs_load) assert(sz == height * row_pitch);
              }
            } else {
              size_t sz = 0;
              clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
              jj["size"] = (int)sz;
              jj["needs_load"] = needs_load;
            }

            objects.push_back(jj);
          }
        }
      }
      i++;
    }

    if (save_binaries) {
      int err;
      size_t binary_size = 0;
      err = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
      assert(err == 0);
      assert(binary_size > 0);
      string sv(binary_size, '\x00');

      uint8_t* bufs[1] = { (uint8_t*)sv.data(), };
      err = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL);
      assert(err == 0);

      binaries[k->name] = sv;
    } else {
      programs[k->name] = g_program_source[k->program];
    }
  }

  // read back the contents of every object that has to be restored on load
  vector<string> saved_buffers;
  for (auto &obj : objects) {
    auto mobj = obj.object_items();
    cl_mem val = *(cl_mem*)(mobj["id"].string_value().data());
    int sz = mobj["size"].int_value();
    if (mobj["needs_load"].bool_value()) {
      char *buf = (char *)malloc(sz);
      if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
        assert(false);
      } else {
        // buffers alloced with CL_MEM_HOST_WRITE_ONLY, hence this hack
        //hexdump((uint32_t*)val, 0x100);

        // the worst hack in thneed, the flags are at 0x14
        ((uint32_t*)val)[0x14] &= ~CL_MEM_HOST_WRITE_ONLY;
        cl_int ret = clEnqueueReadBuffer(command_queue, val, CL_TRUE, 0, sz, buf, 0, NULL, NULL);
        assert(ret == CL_SUCCESS);
      }
      //printf("saving buffer: %d %p %s\n", sz, buf, mobj["arg_type"].string_value().c_str());
      saved_buffers.push_back(string(buf, sz));
      free(buf);
    }
  }

  // binaries are appended after the object blobs, in the order listed in the JSON
  std::vector<Json> jbinaries;
  for (auto &obj : binaries) {
    jbinaries.push_back(Json::object({{"name", obj.first}, {"length", (int)obj.second.size()}}));
    saved_buffers.push_back(obj.second);
  }

  Json jdat = Json::object({
    {"kernels", kernels},
    {"objects", objects},
    {"programs", programs},
    {"binaries", jbinaries},
  });

  string str = jdat.dump();
  int jsz = str.length();

  FILE *f = fopen(filename, "wb");
  // fail loudly instead of passing a NULL stream to fwrite
  if (f == NULL) printf("Thneed::save: failed to open %s for writing\n", filename);
  assert(f != NULL);
  fwrite(&jsz, 1, sizeof(jsz), f);
  fwrite(str.data(), 1, jsz, f);
  for (auto &s : saved_buffers) {
    fwrite(s.data(), 1, s.length(), f);
  }
  fclose(f);
}
// Serializes the launch parameters and raw argument bytes of this kernel
// into a JSON object (the form consumed by Thneed::load).
Json CLQueuedKernel::to_json() const {
  // always three entries, regardless of work_dim
  const Json::array gws_json = { (int)global_work_size[0], (int)global_work_size[1], (int)global_work_size[2] };
  const Json::array lws_json = { (int)local_work_size[0], (int)local_work_size[1], (int)local_work_size[2] };

  Json::object kernel_json;
  kernel_json["name"] = Json(name);
  kernel_json["work_dim"] = Json((int)work_dim);
  kernel_json["global_work_size"] = Json(gws_json);
  kernel_json["local_work_size"] = Json(lws_json);
  kernel_json["num_args"] = Json((int)num_args);
  kernel_json["args"] = Json(args);
  kernel_json["args_size"] = Json(args_size);
  return Json(kernel_json);
}

View File

@ -7,8 +7,6 @@
#include <errno.h>
#include "thneed.h"
//#define SAVE_KERNELS
//#define RUN_DISASSEMBLER
//#define RUN_OPTIMIZER
@ -83,7 +81,8 @@ int ioctl(int filedes, unsigned long request, void *argp) {
}
if (thneed->record & THNEED_RECORD) {
thneed->syncobjs.push_back(string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count));
thneed->cmds.push_back(unique_ptr<CachedSync>(new
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
@ -103,6 +102,14 @@ int ioctl(int filedes, unsigned long request, void *argp) {
}
}
}
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
// this happens
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
// this happens
} else {
if (thneed->record & THNEED_DEBUG) {
printf("other ioctl %lx\n", request);
}
}
}
@ -133,13 +140,27 @@ GPUMalloc::~GPUMalloc() {
}
void *GPUMalloc::alloc(int size) {
if (size > remaining) return NULL;
remaining -= size;
void *ret = (void*)base;
base += (size+0xff) & (~0xFF);
size = (size+0xff) & (~0xFF);
assert(size <= remaining);
remaining -= size;
base += size;
return ret;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync::exec() {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)data.data();
cmd.obj_len = data.length();
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
@ -174,24 +195,11 @@ CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed->ckq.clear();
}
void CachedCommand::exec(bool wait) {
void CachedCommand::exec() {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (wait) {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = cache.context_id;
wait.timestamp = cache.timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (thneed->record & THNEED_DEBUG) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000);
} else {
if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret);
}
if (thneed->record & THNEED_DEBUG) printf("CachedCommand::exec got %d\n", ret);
if (thneed->record & THNEED_VERBOSE_DEBUG) {
for (auto &it : kq) {
@ -213,32 +221,85 @@ Thneed::Thneed(bool do_clinit) {
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x40000, fd);
ram = make_unique<GPUMalloc>(0x80000, fd);
record = THNEED_RECORD;
timestamp = -1;
g_thneed = this;
}
// Ends the recording phase: resolves the model inputs/outputs, prints how
// many commands are held in cmds and clears every flag in record.
void Thneed::stop() {
find_inputs_outputs();
printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = 0;
}
void Thneed::find_inputs_outputs() {
cl_int err;
if (inputs.size() > 0) return;
// save the global inputs/outputs
for (auto &k : kq) {
for (int i = 0; i < k->num_args; i++) {
if (k->name == "zero_pad_image_float" && k->arg_names[i] == "input") {
cl_mem aa = *(cl_mem*)(k->args[i].data());
size_t sz;
clGetMemObjectInfo(aa, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
input_sizes.push_back(sz);
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &err);
assert(err == CL_SUCCESS);
inputs.push_back(ret);
}
if (k->name == "image2d_to_buffer_float" && k->arg_names[i] == "output") {
output = *(cl_mem*)(k->args[i].data());
}
}
}
}
// Copies each caller-supplied input into the matching mapped input buffer.
// finputs[k] must provide at least input_sizes[k] bytes for every k.
void Thneed::copy_inputs(float **finputs) {
  const size_t num_inputs = inputs.size();
  for (size_t k = 0; k != num_inputs; ++k) {
    void *dst = inputs[k];
    const size_t nbytes = input_sizes[k];
    if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", nbytes, finputs[k], dst);
    memcpy(dst, finputs[k], nbytes);
  }
}
// Reads the whole model output buffer back into foutput with a blocking
// read. Prints a warning and leaves foutput untouched when no output
// buffer is known.
void Thneed::copy_output(float *foutput) {
  if (output == NULL) {
    printf("CAUTION: model output is NULL, does it have no outputs?\n");
    return;
  }

  size_t out_bytes;
  clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(out_bytes), &out_bytes, NULL);
  if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", out_bytes, output, foutput);
  clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, out_bytes, foutput, 0, NULL, NULL);
}
void Thneed::wait() {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = context_id;
wait.timestamp = timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (record & THNEED_DEBUG) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
int ret;
uint64_t tb, te;
if (record & THNEED_DEBUG) tb = nanos_since_boot();
// ****** copy inputs
for (int idx = 0; idx < inputs.size(); ++idx) {
size_t sz;
clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (record & THNEED_DEBUG) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
// TODO: This shouldn't have to block
clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
}
copy_inputs(finputs);
// ****** set power constraint
int ret;
struct kgsl_device_constraint_pwrlevel pwrlevel;
pwrlevel.level = KGSL_CONSTRAINT_PWR_MAX;
@ -260,30 +321,12 @@ void Thneed::execute(float **finputs, float *foutput, bool slow) {
for (auto &it : cmds) {
++i;
if (record & THNEED_DEBUG) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec((i == cmds.size()) || slow);
}
// ****** sync objects
for (auto &it : syncobjs) {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)it.data();
cmd.obj_len = it.length();
cmd.count = it.length() / sizeof(struct kgsl_gpuobj_sync_obj);
ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
it->exec();
if ((i == cmds.size()) || slow) wait();
}
// ****** copy outputs
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (record & THNEED_DEBUG) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL);
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
copy_output(foutput);
// ****** unset power constraint
constraint.type = KGSL_CONSTRAINT_NONE;
@ -316,7 +359,7 @@ void Thneed::clinit() {
assert(err == 0);
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
assert(err == 0);
@ -453,6 +496,9 @@ CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
char arg_name[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
@ -473,11 +519,14 @@ cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
arg_types.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100];
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
@ -490,19 +539,6 @@ cl_int CLQueuedKernel::exec() {
}
}
// save the global inputs/outputs
if (thneed->record & THNEED_RECORD) {
for (int i = 0; i < num_args; i++) {
if (name == "zero_pad_image_float" && arg_names[i] == "input") {
thneed->inputs.push_back(*(cl_mem*)(args[i].data()));
}
if (name == "image2d_to_buffer_float" && arg_names[i] == "output") {
thneed->output = *(cl_mem*)(args[i].data());
}
}
}
if (thneed->record & THNEED_DEBUG) {
debug_print(thneed->record & THNEED_VERBOSE_DEBUG);
}
@ -524,10 +560,8 @@ void CLQueuedKernel::debug_print(bool verbose) {
if (verbose) {
for (int i = 0; i < num_args; i++) {
char arg_type[0x100];
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL);
string arg = args[i];
printf(" %s %s", arg_type, arg_names[i].c_str());
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 0) {
@ -537,7 +571,7 @@ void CLQueuedKernel::debug_print(bool verbose) {
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (strcmp(arg_type, "float") == 0) {
if (arg_types[i] == "float") {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
@ -546,7 +580,7 @@ void CLQueuedKernel::debug_print(bool verbose) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) {
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t") {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;

View File

@ -48,6 +48,7 @@ class CLQueuedKernel {
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> arg_types;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
@ -60,12 +61,26 @@ class CLQueuedKernel {
Thneed *thneed;
};
class CachedCommand {
// Base class for a recorded ioctl that can be replayed with exec().
class CachedIoctl {
  public:
    // Instances are owned and destroyed through base-class pointers
    // (unique_ptr<CachedIoctl>), so the destructor must be virtual for the
    // derived members to be released.
    virtual ~CachedIoctl() {}

    // Replays the ioctl; the base implementation does nothing.
    virtual void exec() {}
};
// Recorded sync ioctl: keeps a copy of the raw sync-object bytes and the
// Thneed instance it is replayed on.
class CachedSync: public CachedIoctl {
  public:
    // ldata: raw bytes of the sync-object array, copied into this object
    CachedSync(Thneed *lthneed, string ldata) : thneed(lthneed), data(ldata) {}

    // replays the sync ioctl
    void exec();
  private:
    Thneed *thneed;
    string data;
};
class CachedCommand: public CachedIoctl {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec(bool wait);
void disassemble(int cmd_index);
void exec();
private:
void disassemble(int cmd_index);
struct kgsl_gpu_command cache;
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
@ -78,9 +93,11 @@ class Thneed {
Thneed(bool do_clinit=false);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
void wait();
int optimize();
vector<cl_mem> inputs;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;
cl_context context = NULL;
@ -92,11 +109,13 @@ class Thneed {
int record;
int timestamp;
unique_ptr<GPUMalloc> ram;
vector<unique_ptr<CachedCommand> > cmds;
vector<string> syncobjs;
vector<unique_ptr<CachedIoctl> > cmds;
int fd;
// all CL kernels
void find_inputs_outputs();
void copy_inputs(float **finputs);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
@ -105,9 +124,8 @@ class Thneed {
// loading and saving
void load(const char *filename);
void save(const char *filename);
void save(const char *filename, bool save_binaries=false);
private:
void clinit();
json11::Json to_json();
};