Refactor convolutions (#23807)
* one conv with defines * add conv * building works on C3 * this is num_outputs too, process replay is so useful Co-authored-by: Comma Device <device@comma.ai>testing-closet
parent
719801845b
commit
2c7542d34e
|
@ -0,0 +1,272 @@
|
|||
read_only image2d_t input,
|
||||
#ifndef DEPTHWISE
|
||||
short startPackedInputChannel,
|
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
|
||||
// typo required for API compatibility
|
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
|
||||
#else
|
||||
short totalNumPackedChannels,
|
||||
#endif
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
#ifdef SUPPORT_DILATION
|
||||
short dilationX, short dilationY,
|
||||
#endif
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
#ifndef DEPTHWISE
|
||||
// note: these are not supported
|
||||
__constant float *parameters, __constant float *batchNormBiases,
|
||||
#endif
|
||||
short numOutputColumns
|
||||
#ifdef SUPPORT_ACCUMULATION
|
||||
, short doAccumulate, read_only image2d_t accumulator
|
||||
#endif
|
||||
) {
|
||||
|
||||
#ifndef NUM_OUTPUTS
|
||||
#define NUM_OUTPUTS 4
|
||||
#endif
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedOutputChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
|
||||
short outputRow = get_global_id(2);
|
||||
|
||||
#ifdef DEPTHWISE
|
||||
short totalNumPackedInputChannels = totalNumPackedChannels;
|
||||
short totalNumPackedOutputChannels = totalNumPackedChannels;
|
||||
short startPackedInputChannel = packedOutputChannel;
|
||||
#endif
|
||||
|
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
|
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
|
||||
|
||||
float4 outputValues[NUM_OUTPUTS];
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedOutputChannel;
|
||||
|
||||
#ifdef DEPTHWISE
|
||||
|
||||
#ifdef SUPPORT_DILATION
|
||||
|
||||
// depthwise convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
|
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
|
||||
float4 inputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
float4 weightValues = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
outputValues[0] += inputValues[0] * weightValues;
|
||||
outputValues[1] += inputValues[1] * weightValues;
|
||||
outputValues[2] += inputValues[2] * weightValues;
|
||||
outputValues[3] += inputValues[3] * weightValues;
|
||||
}
|
||||
inputLocation.y += dilationY;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// depthwise unstrided convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
float4 inputValues[4];
|
||||
inputLocation.x = startX;
|
||||
for (short i = 1; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += totalNumPackedOutputChannels;
|
||||
}
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
inputValues[0] = inputValues[1];
|
||||
inputValues[1] = inputValues[2];
|
||||
inputValues[2] = inputValues[3];
|
||||
inputValues[3] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += totalNumPackedChannels;
|
||||
float4 weightValues = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
outputValues[0] += inputValues[0] * weightValues;
|
||||
outputValues[1] += inputValues[1] * weightValues;
|
||||
outputValues[2] += inputValues[2] * weightValues;
|
||||
outputValues[3] += inputValues[3] * weightValues;
|
||||
}
|
||||
++inputLocation.y;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#elif defined(ONLY_1X1_CONV)
|
||||
|
||||
// 1x1 convolution
|
||||
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
|
||||
for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
|
||||
inputLocation.x = startX + packedInputChannel;
|
||||
float4 inputValues[NUM_OUTPUTS];
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
|
||||
#else
|
||||
|
||||
// normal convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
|
||||
short startXForChannel = startX + packedInputChannel;
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
|
||||
#ifdef SUPPORT_DILATION
|
||||
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
|
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
|
||||
#else
|
||||
inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
|
||||
#endif
|
||||
float4 inputValues[NUM_OUTPUTS];
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef SUPPORT_DILATION
|
||||
inputLocation.y += dilationY;
|
||||
#else
|
||||
++inputLocation.y;
|
||||
#endif
|
||||
}
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
#endif
|
||||
|
||||
// bias
|
||||
short outputChannel = mul24(packedOutputChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
#ifdef SUPPORT_ACCUMULATION
|
||||
// accumulate
|
||||
if (doAccumulate) {
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
outputValues[i] += read_imagef(accumulator, smp, outputLocation);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < NUM_OUTPUTS; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
|
@ -1,129 +1,4 @@
|
|||
#define SUPPORT_DILATION
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads(
|
||||
read_only image2d_t input,
|
||||
short startPackedInputChannel,
|
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
|
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
short dilationX, short dilationY,
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
__constant float *parameters, __constant float *batchNormBiases,
|
||||
short numOutputColumns) {
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedOutputChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), 4);
|
||||
short outputRow = get_global_id(2);
|
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
|
||||
totalNumPackedInputChannels, startPackedInputChannel);
|
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
|
||||
|
||||
float4 outputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedOutputChannel;
|
||||
|
||||
// convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short packedInputChannel = 0;
|
||||
packedInputChannel < numPackedInputChannelsForGroup;
|
||||
++packedInputChannel) {
|
||||
short startXForChannel = startX + packedInputChannel;
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
|
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
|
||||
float4 inputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
}
|
||||
inputLocation.y += dilationY;
|
||||
}
|
||||
|
||||
// bias
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
short outputChannel = mul24(packedOutputChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
#include "convolution_.cl"
|
||||
|
|
|
@ -1,140 +1,5 @@
|
|||
#define ONLY_1X1_CONV
|
||||
#define SUPPORT_ACCUMULATION
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_1x1(
|
||||
read_only image2d_t input,
|
||||
short startPackedInputChannel,
|
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
|
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
__constant float *parameters, __constant float *batchNormBiases,
|
||||
short numOutputColumns,
|
||||
short doAccumulate, read_only image2d_t accumulator) {
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedOutputChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), 4);
|
||||
short outputRow = get_global_id(2);
|
||||
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
|
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
|
||||
totalNumPackedInputChannels, startPackedInputChannel);
|
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
|
||||
|
||||
float4 outputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedOutputChannel;
|
||||
|
||||
// convolution
|
||||
for (short packedInputChannel = startPackedInputChannel;
|
||||
packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
|
||||
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
|
||||
inputLocation.x = startX + packedInputChannel;
|
||||
float4 inputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
|
||||
// bias
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
short outputChannel = mul24(packedOutputChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
// accumulate
|
||||
if (doAccumulate) {
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
outputValues[i] += read_imagef(accumulator, smp, outputLocation);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
|
||||
#include "convolution_.cl"
|
||||
|
|
|
@ -1,130 +1,4 @@
|
|||
#define NUM_OUTPUTS 5
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_5_outputs(
|
||||
read_only image2d_t input,
|
||||
short startPackedInputChannel,
|
||||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
|
||||
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
__constant float *parameters, __constant float *batchNormBiases,
|
||||
short numOutputColumns) {
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedOutputChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), 5);
|
||||
short outputRow = get_global_id(2);
|
||||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
|
||||
totalNumPackedInputChannels, startPackedInputChannel);
|
||||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);
|
||||
|
||||
float4 outputValues[5];
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedOutputChannel;
|
||||
|
||||
// convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short packedInputChannel = 0;
|
||||
packedInputChannel < numPackedInputChannelsForGroup;
|
||||
++packedInputChannel) {
|
||||
short startXForChannel = startX + packedInputChannel;
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
float4 weightValues[4];
|
||||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
|
||||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
}
|
||||
inputLocation.x =
|
||||
mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
|
||||
|
||||
float4 inputValues[5];
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
float4 curOutputValues = outputValues[i];
|
||||
curOutputValues.x += inputValues[i].x * weightValues[0].x;
|
||||
curOutputValues.x += inputValues[i].y * weightValues[0].y;
|
||||
curOutputValues.x += inputValues[i].z * weightValues[0].z;
|
||||
curOutputValues.x += inputValues[i].w * weightValues[0].w;
|
||||
curOutputValues.y += inputValues[i].x * weightValues[1].x;
|
||||
curOutputValues.y += inputValues[i].y * weightValues[1].y;
|
||||
curOutputValues.y += inputValues[i].z * weightValues[1].z;
|
||||
curOutputValues.y += inputValues[i].w * weightValues[1].w;
|
||||
curOutputValues.z += inputValues[i].x * weightValues[2].x;
|
||||
curOutputValues.z += inputValues[i].y * weightValues[2].y;
|
||||
curOutputValues.z += inputValues[i].z * weightValues[2].z;
|
||||
curOutputValues.z += inputValues[i].w * weightValues[2].w;
|
||||
curOutputValues.w += inputValues[i].x * weightValues[3].x;
|
||||
curOutputValues.w += inputValues[i].y * weightValues[3].y;
|
||||
curOutputValues.w += inputValues[i].z * weightValues[3].z;
|
||||
curOutputValues.w += inputValues[i].w * weightValues[3].w;
|
||||
outputValues[i] = curOutputValues;
|
||||
}
|
||||
}
|
||||
}
|
||||
++inputLocation.y;
|
||||
}
|
||||
|
||||
// bias
|
||||
packedOutputChannel += packedOuputChannelOffset;
|
||||
short outputChannel = mul24(packedOutputChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < 5; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
#include "convolution_.cl"
|
||||
|
|
|
@ -1,101 +1,5 @@
|
|||
#define DEPTHWISE
|
||||
#define SUPPORT_DILATION
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_depthwise(
|
||||
read_only image2d_t input,
|
||||
short totalNumPackedChannels,
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
short dilationX, short dilationY,
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
short numOutputColumns) {
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), 4);
|
||||
short outputRow = get_global_id(2);
|
||||
short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
|
||||
totalNumPackedChannels, packedChannel);
|
||||
short strideWithChannels = mul24(strideX, totalNumPackedChannels);
|
||||
|
||||
float4 outputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedChannel;
|
||||
|
||||
// convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
|
||||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
|
||||
float4 inputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += strideWithChannels;
|
||||
}
|
||||
float4 weightValues = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
outputValues[0] += inputValues[0] * weightValues;
|
||||
outputValues[1] += inputValues[1] * weightValues;
|
||||
outputValues[2] += inputValues[2] * weightValues;
|
||||
outputValues[3] += inputValues[3] * weightValues;
|
||||
}
|
||||
inputLocation.y += dilationY;
|
||||
}
|
||||
|
||||
// bias
|
||||
short outputChannel = mul24(packedChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
#include "convolution_.cl"
|
||||
|
|
|
@ -1,103 +1,4 @@
|
|||
#define DEPTHWISE
|
||||
|
||||
__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
|
||||
read_only image2d_t input,
|
||||
short totalNumPackedChannels,
|
||||
read_only image2d_t weights, __constant float *biases,
|
||||
short filterSizeX, short filterSizeY,
|
||||
write_only image2d_t output,
|
||||
short paddingX, short paddingY, short strideX, short strideY,
|
||||
short neuron, float a, float b, float min_clamp, float max_clamp,
|
||||
short numOutputColumns) {
|
||||
|
||||
// init
|
||||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
short packedChannel = get_global_id(0);
|
||||
short startOutputColumn = mul24((short)get_global_id(1), 4);
|
||||
short outputRow = get_global_id(2);
|
||||
short startXForChannel = mad24(mad24(startOutputColumn, strideX, -paddingX),
|
||||
totalNumPackedChannels, packedChannel);
|
||||
|
||||
float4 outputValues[4];
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = (float4)(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
int2 inputLocation;
|
||||
inputLocation.y = mad24(outputRow, strideY, -paddingY);
|
||||
|
||||
int2 weightLocation;
|
||||
weightLocation.x = 0;
|
||||
weightLocation.y = packedChannel;
|
||||
|
||||
// convolution
|
||||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
|
||||
float4 inputValues[4];
|
||||
inputLocation.x = startXForChannel;
|
||||
for (short i = 1; i < 4; ++i) {
|
||||
inputValues[i] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += totalNumPackedChannels;
|
||||
}
|
||||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
|
||||
inputValues[0] = inputValues[1];
|
||||
inputValues[1] = inputValues[2];
|
||||
inputValues[2] = inputValues[3];
|
||||
inputValues[3] = read_imagef(input, smp, inputLocation);
|
||||
inputLocation.x += totalNumPackedChannels;
|
||||
float4 weightValues = read_imagef(weights, smp, weightLocation);
|
||||
++weightLocation.x;
|
||||
outputValues[0] += inputValues[0] * weightValues;
|
||||
outputValues[1] += inputValues[1] * weightValues;
|
||||
outputValues[2] += inputValues[2] * weightValues;
|
||||
outputValues[3] += inputValues[3] * weightValues;
|
||||
}
|
||||
++inputLocation.y;
|
||||
}
|
||||
|
||||
// bias
|
||||
short outputChannel = mul24(packedChannel, 4);
|
||||
float4 biasValues = vload4(0, biases + outputChannel);
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] += biasValues;
|
||||
}
|
||||
|
||||
// activation
|
||||
switch (neuron) {
|
||||
case 1:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = a * tanh(b * outputValues[i]);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], min_clamp);
|
||||
outputValues[i] = min(outputValues[i], max_clamp);
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// output
|
||||
int2 outputLocation;
|
||||
short outputColumn = startOutputColumn;
|
||||
outputLocation.y = outputRow;
|
||||
for (short i = 0; i < 4; ++i) {
|
||||
outputLocation.x = mad24(outputColumn, totalNumPackedChannels, packedChannel);
|
||||
if (outputColumn < numOutputColumns) {
|
||||
write_imagef(output, outputLocation, outputValues[i]);
|
||||
}
|
||||
++outputColumn;
|
||||
}
|
||||
}
|
||||
#include "convolution_.cl"
|
||||
|
|
|
@ -80,7 +80,9 @@ int Thneed::optimize() {
|
|||
|
||||
printf("building kernel %s\n", k->name.c_str());
|
||||
k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
|
||||
int err = clBuildProgram(k->program, 1, &device_id, "", NULL, NULL);
|
||||
char options[0x100];
|
||||
snprintf(options, sizeof(options)-1, "-I %s", kernel_path);
|
||||
int err = clBuildProgram(k->program, 1, &device_id, options, NULL, NULL);
|
||||
|
||||
if (err != 0) {
|
||||
printf("got err %d\n", err);
|
||||
|
|
Loading…
Reference in New Issue