This isn't bigmodel, but there's a lot of good stuff here (#1532)

* bigmodel

* more debug print

* debugging bigmodel

* remove the tanh, debugging

* print images/buffers

* disassemble the command queues

* decompiler

* dump the shaders

* full disasm

* support patching kernel and fixing convolution_horizontal_reduced_reads_1x1

* microbenchmark

* 42 GFLOPS, 1 GB/s

* gemm benchmark

* 75 GFLOPS vs 42 GFLOPS

* 115 GFLOPS

* oops, never mind

* gemm image is slow

* this is pretty hopeless

* gemm image gets 62 GFLOPS

* this is addictive and still a waste of time

* cleanup cleanup

* that hook was dumb

* tabbing

* more tabbing

Co-authored-by: Comma Device <device@comma.ai>
albatross
George Hotz 2020-05-17 23:13:17 -07:00 committed by GitHub
parent 81686547cc
commit 78a352a8ca
15 changed files with 6770 additions and 23 deletions

Diffs for 3 files suppressed because they are too large.


@@ -0,0 +1,906 @@
/*
* Mesa 3-D graphics library
*
* Copyright (C) 1999-2008 Brian Paul All Rights Reserved.
* Copyright (C) 2009 VMware, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef SHADER_ENUMS_H
#define SHADER_ENUMS_H
#include <stdbool.h>
/* Project-wide (GL and Vulkan) maximum. */
#define MAX_DRAW_BUFFERS 8
#ifdef __cplusplus
extern "C" {
#endif
/**
* Shader stages.
*
* The order must match how shaders are ordered in the pipeline.
* The GLSL linker assumes that if i<j, then the j-th shader is
* executed later than the i-th shader.
*/
typedef enum
{
MESA_SHADER_NONE = -1,
MESA_SHADER_VERTEX = 0,
MESA_SHADER_TESS_CTRL = 1,
MESA_SHADER_TESS_EVAL = 2,
MESA_SHADER_GEOMETRY = 3,
MESA_SHADER_FRAGMENT = 4,
MESA_SHADER_COMPUTE = 5,
/* must be last so it doesn't affect the GL pipeline */
MESA_SHADER_KERNEL = 6,
} gl_shader_stage;
static inline bool
gl_shader_stage_is_compute(gl_shader_stage stage)
{
return stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL;
}
/**
* Number of STATE_* values we need to address any GL state.
* Used to dimension arrays.
*/
#define STATE_LENGTH 5
typedef short gl_state_index16; /* see enum gl_state_index */
const char *gl_shader_stage_name(gl_shader_stage stage);
/**
* Translate a gl_shader_stage to a short shader stage name for debug
* printouts and error messages.
*/
const char *_mesa_shader_stage_to_string(unsigned stage);
/**
* Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
* for debug printouts and error messages.
*/
const char *_mesa_shader_stage_to_abbrev(unsigned stage);
/**
* GL related stages (not including CL)
*/
#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
/**
* All stages
*/
#define MESA_ALL_SHADER_STAGES (MESA_SHADER_KERNEL + 1)
/**
* Indexes for vertex program attributes.
* GL_NV_vertex_program aliases generic attributes over the conventional
* attributes. In GL_ARB_vertex_program shaders the aliasing is optional.
* In GL_ARB_vertex_shader / OpenGL 2.0 the aliasing is disallowed (the
* generic attributes are distinct/separate).
*/
typedef enum
{
VERT_ATTRIB_POS,
VERT_ATTRIB_NORMAL,
VERT_ATTRIB_COLOR0,
VERT_ATTRIB_COLOR1,
VERT_ATTRIB_FOG,
VERT_ATTRIB_COLOR_INDEX,
VERT_ATTRIB_EDGEFLAG,
VERT_ATTRIB_TEX0,
VERT_ATTRIB_TEX1,
VERT_ATTRIB_TEX2,
VERT_ATTRIB_TEX3,
VERT_ATTRIB_TEX4,
VERT_ATTRIB_TEX5,
VERT_ATTRIB_TEX6,
VERT_ATTRIB_TEX7,
VERT_ATTRIB_POINT_SIZE,
VERT_ATTRIB_GENERIC0,
VERT_ATTRIB_GENERIC1,
VERT_ATTRIB_GENERIC2,
VERT_ATTRIB_GENERIC3,
VERT_ATTRIB_GENERIC4,
VERT_ATTRIB_GENERIC5,
VERT_ATTRIB_GENERIC6,
VERT_ATTRIB_GENERIC7,
VERT_ATTRIB_GENERIC8,
VERT_ATTRIB_GENERIC9,
VERT_ATTRIB_GENERIC10,
VERT_ATTRIB_GENERIC11,
VERT_ATTRIB_GENERIC12,
VERT_ATTRIB_GENERIC13,
VERT_ATTRIB_GENERIC14,
VERT_ATTRIB_GENERIC15,
VERT_ATTRIB_MAX
} gl_vert_attrib;
const char *gl_vert_attrib_name(gl_vert_attrib attrib);
/**
* Symbolic constants to help iterate over
* specific blocks of vertex attributes.
*
* VERT_ATTRIB_FF
* includes all fixed function attributes as well as
* the aliased GL_NV_vertex_program shader attributes.
* VERT_ATTRIB_TEX
* include the classic texture coordinate attributes.
* Is a subset of VERT_ATTRIB_FF.
* VERT_ATTRIB_GENERIC
* include the OpenGL 2.0+ GLSL generic shader attributes.
* These alias the generic GL_ARB_vertex_shader attributes.
* VERT_ATTRIB_MAT
* include the generic shader attributes used to alias
* varying material values for the TNL shader programs.
* They are located at the end of the generic attribute
* block not to overlap with the generic 0 attribute.
*/
#define VERT_ATTRIB_FF(i) (VERT_ATTRIB_POS + (i))
#define VERT_ATTRIB_FF_MAX VERT_ATTRIB_GENERIC0
#define VERT_ATTRIB_TEX(i) (VERT_ATTRIB_TEX0 + (i))
#define VERT_ATTRIB_TEX_MAX MAX_TEXTURE_COORD_UNITS
#define VERT_ATTRIB_GENERIC(i) (VERT_ATTRIB_GENERIC0 + (i))
#define VERT_ATTRIB_GENERIC_MAX MAX_VERTEX_GENERIC_ATTRIBS
#define VERT_ATTRIB_MAT0 \
(VERT_ATTRIB_GENERIC_MAX - VERT_ATTRIB_MAT_MAX)
#define VERT_ATTRIB_MAT(i) \
VERT_ATTRIB_GENERIC((i) + VERT_ATTRIB_MAT0)
#define VERT_ATTRIB_MAT_MAX MAT_ATTRIB_MAX
/**
* Bitflags for vertex attributes.
* These are used in bitfields in many places.
*/
/*@{*/
#define VERT_BIT_POS BITFIELD_BIT(VERT_ATTRIB_POS)
#define VERT_BIT_NORMAL BITFIELD_BIT(VERT_ATTRIB_NORMAL)
#define VERT_BIT_COLOR0 BITFIELD_BIT(VERT_ATTRIB_COLOR0)
#define VERT_BIT_COLOR1 BITFIELD_BIT(VERT_ATTRIB_COLOR1)
#define VERT_BIT_FOG BITFIELD_BIT(VERT_ATTRIB_FOG)
#define VERT_BIT_COLOR_INDEX BITFIELD_BIT(VERT_ATTRIB_COLOR_INDEX)
#define VERT_BIT_EDGEFLAG BITFIELD_BIT(VERT_ATTRIB_EDGEFLAG)
#define VERT_BIT_TEX0 BITFIELD_BIT(VERT_ATTRIB_TEX0)
#define VERT_BIT_TEX1 BITFIELD_BIT(VERT_ATTRIB_TEX1)
#define VERT_BIT_TEX2 BITFIELD_BIT(VERT_ATTRIB_TEX2)
#define VERT_BIT_TEX3 BITFIELD_BIT(VERT_ATTRIB_TEX3)
#define VERT_BIT_TEX4 BITFIELD_BIT(VERT_ATTRIB_TEX4)
#define VERT_BIT_TEX5 BITFIELD_BIT(VERT_ATTRIB_TEX5)
#define VERT_BIT_TEX6 BITFIELD_BIT(VERT_ATTRIB_TEX6)
#define VERT_BIT_TEX7 BITFIELD_BIT(VERT_ATTRIB_TEX7)
#define VERT_BIT_POINT_SIZE BITFIELD_BIT(VERT_ATTRIB_POINT_SIZE)
#define VERT_BIT_GENERIC0 BITFIELD_BIT(VERT_ATTRIB_GENERIC0)
#define VERT_BIT(i) BITFIELD_BIT(i)
#define VERT_BIT_ALL BITFIELD_RANGE(0, VERT_ATTRIB_MAX)
#define VERT_BIT_FF(i) VERT_BIT(i)
#define VERT_BIT_FF_ALL BITFIELD_RANGE(0, VERT_ATTRIB_FF_MAX)
#define VERT_BIT_TEX(i) VERT_BIT(VERT_ATTRIB_TEX(i))
#define VERT_BIT_TEX_ALL \
BITFIELD_RANGE(VERT_ATTRIB_TEX(0), VERT_ATTRIB_TEX_MAX)
#define VERT_BIT_GENERIC(i) VERT_BIT(VERT_ATTRIB_GENERIC(i))
#define VERT_BIT_GENERIC_ALL \
BITFIELD_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
#define VERT_BIT_MAT(i) VERT_BIT(VERT_ATTRIB_MAT(i))
#define VERT_BIT_MAT_ALL \
BITFIELD_RANGE(VERT_ATTRIB_MAT(0), VERT_ATTRIB_MAT_MAX)
/*@}*/
#define MAX_VARYING 32 /**< number of float[4] vectors */
/**
* Indexes for vertex shader outputs, geometry shader inputs/outputs, and
* fragment shader inputs.
*
* Note that some of these values are not available to all pipeline stages.
*
* When this enum is updated, the following code must be updated too:
* - vertResults (in prog_print.c's arb_output_attrib_string())
* - fragAttribs (in prog_print.c's arb_input_attrib_string())
* - _mesa_varying_slot_in_fs()
*/
typedef enum
{
VARYING_SLOT_POS,
VARYING_SLOT_COL0, /* COL0 and COL1 must be contiguous */
VARYING_SLOT_COL1,
VARYING_SLOT_FOGC,
VARYING_SLOT_TEX0, /* TEX0-TEX7 must be contiguous */
VARYING_SLOT_TEX1,
VARYING_SLOT_TEX2,
VARYING_SLOT_TEX3,
VARYING_SLOT_TEX4,
VARYING_SLOT_TEX5,
VARYING_SLOT_TEX6,
VARYING_SLOT_TEX7,
VARYING_SLOT_PSIZ, /* Does not appear in FS */
VARYING_SLOT_BFC0, /* Does not appear in FS */
VARYING_SLOT_BFC1, /* Does not appear in FS */
VARYING_SLOT_EDGE, /* Does not appear in FS */
VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
VARYING_SLOT_CLIP_DIST0,
VARYING_SLOT_CLIP_DIST1,
VARYING_SLOT_CULL_DIST0,
VARYING_SLOT_CULL_DIST1,
VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
VARYING_SLOT_LAYER, /* Appears as VS or GS output */
VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
VARYING_SLOT_FACE, /* FS only */
VARYING_SLOT_PNTC, /* FS only */
VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
VARYING_SLOT_BOUNDING_BOX0, /* Only appears as TCS output. */
VARYING_SLOT_BOUNDING_BOX1, /* Only appears as TCS output. */
VARYING_SLOT_VIEW_INDEX,
VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
VARYING_SLOT_VAR0, /* First generic varying slot */
/* the remaining are simply for the benefit of gl_varying_slot_name()
* and not to be construed as an upper bound:
*/
VARYING_SLOT_VAR1,
VARYING_SLOT_VAR2,
VARYING_SLOT_VAR3,
VARYING_SLOT_VAR4,
VARYING_SLOT_VAR5,
VARYING_SLOT_VAR6,
VARYING_SLOT_VAR7,
VARYING_SLOT_VAR8,
VARYING_SLOT_VAR9,
VARYING_SLOT_VAR10,
VARYING_SLOT_VAR11,
VARYING_SLOT_VAR12,
VARYING_SLOT_VAR13,
VARYING_SLOT_VAR14,
VARYING_SLOT_VAR15,
VARYING_SLOT_VAR16,
VARYING_SLOT_VAR17,
VARYING_SLOT_VAR18,
VARYING_SLOT_VAR19,
VARYING_SLOT_VAR20,
VARYING_SLOT_VAR21,
VARYING_SLOT_VAR22,
VARYING_SLOT_VAR23,
VARYING_SLOT_VAR24,
VARYING_SLOT_VAR25,
VARYING_SLOT_VAR26,
VARYING_SLOT_VAR27,
VARYING_SLOT_VAR28,
VARYING_SLOT_VAR29,
VARYING_SLOT_VAR30,
VARYING_SLOT_VAR31,
} gl_varying_slot;
#define VARYING_SLOT_MAX (VARYING_SLOT_VAR0 + MAX_VARYING)
#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX)
#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING)
#define MAX_VARYINGS_INCL_PATCH (VARYING_SLOT_TESS_MAX - VARYING_SLOT_VAR0)
const char *gl_varying_slot_name(gl_varying_slot slot);
/**
* Bitflags for varying slots.
*/
/*@{*/
#define VARYING_BIT_POS BITFIELD64_BIT(VARYING_SLOT_POS)
#define VARYING_BIT_COL0 BITFIELD64_BIT(VARYING_SLOT_COL0)
#define VARYING_BIT_COL1 BITFIELD64_BIT(VARYING_SLOT_COL1)
#define VARYING_BIT_FOGC BITFIELD64_BIT(VARYING_SLOT_FOGC)
#define VARYING_BIT_TEX0 BITFIELD64_BIT(VARYING_SLOT_TEX0)
#define VARYING_BIT_TEX1 BITFIELD64_BIT(VARYING_SLOT_TEX1)
#define VARYING_BIT_TEX2 BITFIELD64_BIT(VARYING_SLOT_TEX2)
#define VARYING_BIT_TEX3 BITFIELD64_BIT(VARYING_SLOT_TEX3)
#define VARYING_BIT_TEX4 BITFIELD64_BIT(VARYING_SLOT_TEX4)
#define VARYING_BIT_TEX5 BITFIELD64_BIT(VARYING_SLOT_TEX5)
#define VARYING_BIT_TEX6 BITFIELD64_BIT(VARYING_SLOT_TEX6)
#define VARYING_BIT_TEX7 BITFIELD64_BIT(VARYING_SLOT_TEX7)
#define VARYING_BIT_TEX(U) BITFIELD64_BIT(VARYING_SLOT_TEX0 + (U))
#define VARYING_BITS_TEX_ANY BITFIELD64_RANGE(VARYING_SLOT_TEX0, \
MAX_TEXTURE_COORD_UNITS)
#define VARYING_BIT_PSIZ BITFIELD64_BIT(VARYING_SLOT_PSIZ)
#define VARYING_BIT_BFC0 BITFIELD64_BIT(VARYING_SLOT_BFC0)
#define VARYING_BIT_BFC1 BITFIELD64_BIT(VARYING_SLOT_BFC1)
#define VARYING_BITS_COLOR (VARYING_BIT_COL0 | \
VARYING_BIT_COL1 | \
VARYING_BIT_BFC0 | \
VARYING_BIT_BFC1)
#define VARYING_BIT_EDGE BITFIELD64_BIT(VARYING_SLOT_EDGE)
#define VARYING_BIT_CLIP_VERTEX BITFIELD64_BIT(VARYING_SLOT_CLIP_VERTEX)
#define VARYING_BIT_CLIP_DIST0 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)
#define VARYING_BIT_CLIP_DIST1 BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)
#define VARYING_BIT_CULL_DIST0 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0)
#define VARYING_BIT_CULL_DIST1 BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1)
#define VARYING_BIT_PRIMITIVE_ID BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID)
#define VARYING_BIT_LAYER BITFIELD64_BIT(VARYING_SLOT_LAYER)
#define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
#define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
#define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
#define VARYING_BIT_BOUNDING_BOX0 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX0)
#define VARYING_BIT_BOUNDING_BOX1 BITFIELD64_BIT(VARYING_SLOT_BOUNDING_BOX1)
#define VARYING_BIT_VIEWPORT_MASK BITFIELD64_BIT(VARYING_SLOT_VIEWPORT_MASK)
#define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
/*@}*/
/**
* Bitflags for system values.
*/
#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
#define SYSTEM_BIT_LOCAL_INVOCATION_ID ((uint64_t)1 << SYSTEM_VALUE_LOCAL_INVOCATION_ID)
/**
* If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
* one of these values. If a NIR variable's mode is nir_var_system_value, it
* will be one of these values.
*/
typedef enum
{
/**
* \name System values applicable to all shaders
*/
/*@{*/
/**
* Builtin variables added by GL_ARB_shader_ballot.
*/
/*@{*/
/**
* From the GL_ARB_shader_ballot spec:
*
* "A sub-group is a collection of invocations which execute in lockstep.
* The variable <gl_SubGroupSizeARB> is the maximum number of
* invocations in a sub-group. The maximum <gl_SubGroupSizeARB>
* supported in this extension is 64."
*
* The spec defines this as a uniform. However, it's highly unlikely that
* implementations actually treat it as a uniform (which is loaded from a
* constant buffer). Most likely, this is an implementation-wide constant,
* or perhaps something that depends on the shader stage.
*/
SYSTEM_VALUE_SUBGROUP_SIZE,
/**
* From the GL_ARB_shader_ballot spec:
*
* "The variable <gl_SubGroupInvocationARB> holds the index of the
* invocation within the sub-group. This variable is in the range 0 to
* <gl_SubGroupSizeARB>-1, where <gl_SubGroupSizeARB> is the total
* number of invocations in a sub-group."
*/
SYSTEM_VALUE_SUBGROUP_INVOCATION,
/**
* From the GL_ARB_shader_ballot spec:
*
* "The <gl_SubGroup??MaskARB> variables provide a bitmask for all
* invocations, with one bit per invocation starting with the least
* significant bit, according to the following table,
*
* variable equation for bit values
* -------------------- ------------------------------------
* gl_SubGroupEqMaskARB bit index == gl_SubGroupInvocationARB
* gl_SubGroupGeMaskARB bit index >= gl_SubGroupInvocationARB
* gl_SubGroupGtMaskARB bit index > gl_SubGroupInvocationARB
* gl_SubGroupLeMaskARB bit index <= gl_SubGroupInvocationARB
* gl_SubGroupLtMaskARB bit index < gl_SubGroupInvocationARB
*/
SYSTEM_VALUE_SUBGROUP_EQ_MASK,
SYSTEM_VALUE_SUBGROUP_GE_MASK,
SYSTEM_VALUE_SUBGROUP_GT_MASK,
SYSTEM_VALUE_SUBGROUP_LE_MASK,
SYSTEM_VALUE_SUBGROUP_LT_MASK,
/*@}*/
/**
* Builtin variables added by VK_KHR_subgroups
*/
/*@{*/
SYSTEM_VALUE_NUM_SUBGROUPS,
SYSTEM_VALUE_SUBGROUP_ID,
/*@}*/
/*@}*/
/**
* \name Vertex shader system values
*/
/*@{*/
/**
* OpenGL-style vertex ID.
*
* Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
* OpenGL 3.3 core profile spec says:
*
* "gl_VertexID holds the integer index i implicitly passed by
* DrawArrays or one of the other drawing commands defined in section
* 2.8.3."
*
* Section 2.8.3 (Drawing Commands) of the same spec says:
*
* "The commands....are equivalent to the commands with the same base
* name (without the BaseVertex suffix), except that the ith element
* transferred by the corresponding draw call will be taken from
* element indices[i] + basevertex of each enabled array."
*
* Additionally, the overview in the GL_ARB_shader_draw_parameters spec
* says:
*
* "In unextended GL, vertex shaders have inputs named gl_VertexID and
* gl_InstanceID, which contain, respectively the index of the vertex
* and instance. The value of gl_VertexID is the implicitly passed
* index of the vertex being processed, which includes the value of
* baseVertex, for those commands that accept it."
*
* gl_VertexID gets basevertex added in. This differs from DirectX where
* SV_VertexID does \b not get basevertex added in.
*
* \note
* If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
* equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
* \c SYSTEM_VALUE_BASE_VERTEX.
*
* \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
*/
SYSTEM_VALUE_VERTEX_ID,
/**
* Instanced ID as supplied to gl_InstanceID
*
* Values assigned to gl_InstanceID always begin with zero, regardless of
* the value of baseinstance.
*
* Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
* says:
*
* "gl_InstanceID holds the integer instance number of the current
* primitive in an instanced draw call (see section 10.5)."
*
* Through a big chain of pseudocode, section 10.5 describes that
* baseinstance is not counted by gl_InstanceID. In that section, notice
*
* "If an enabled vertex attribute array is instanced (it has a
* non-zero divisor as specified by VertexAttribDivisor), the element
* index that is transferred to the GL, for all vertices, is given by
*
* floor(instance/divisor) + baseinstance
*
* If an array corresponding to an attribute required by a vertex
* shader is not enabled, then the corresponding element is taken from
* the current attribute state (see section 10.2)."
*
* Note that baseinstance is \b not included in the value of instance.
*/
SYSTEM_VALUE_INSTANCE_ID,
/**
* Vulkan InstanceIndex.
*
* InstanceIndex = gl_InstanceID + gl_BaseInstance
*/
SYSTEM_VALUE_INSTANCE_INDEX,
/**
* DirectX-style vertex ID.
*
* Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
* the value of basevertex.
*
* \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
*/
SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
/**
* Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
* functions.
*
* \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
*/
SYSTEM_VALUE_BASE_VERTEX,
/**
* Depending on the type of the draw call (indexed or non-indexed), this is
* the value of \c basevertex passed to \c glDrawElementsBaseVertex and
* similar, or the value of \c first passed to \c glDrawArrays and similar.
*
* \note
* It can be used to calculate the \c SYSTEM_VALUE_VERTEX_ID as
* \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus \c SYSTEM_VALUE_FIRST_VERTEX.
*
* \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_VERTEX_ID
*/
SYSTEM_VALUE_FIRST_VERTEX,
/**
* Whether the draw command used to start the rendering was an indexed
* draw (~0) or not (0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
* \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
*/
SYSTEM_VALUE_IS_INDEXED_DRAW,
/**
* Value of \c baseinstance passed to instanced draw entry points
*
* \sa SYSTEM_VALUE_INSTANCE_ID
*/
SYSTEM_VALUE_BASE_INSTANCE,
/**
* From _ARB_shader_draw_parameters:
*
* "Additionally, this extension adds a further built-in variable,
* gl_DrawID to the shading language. This variable contains the index
* of the draw currently being processed by a Multi* variant of a
* drawing command (such as MultiDrawElements or
* MultiDrawArraysIndirect)."
*
* If GL_ARB_multi_draw_indirect is not supported, this is always 0.
*/
SYSTEM_VALUE_DRAW_ID,
/*@}*/
/**
* \name Geometry shader system values
*/
/*@{*/
SYSTEM_VALUE_INVOCATION_ID, /**< (Also in Tessellation Control shader) */
/*@}*/
/**
* \name Fragment shader system values
*/
/*@{*/
SYSTEM_VALUE_FRAG_COORD,
SYSTEM_VALUE_POINT_COORD,
SYSTEM_VALUE_FRONT_FACE,
SYSTEM_VALUE_SAMPLE_ID,
SYSTEM_VALUE_SAMPLE_POS,
SYSTEM_VALUE_SAMPLE_MASK_IN,
SYSTEM_VALUE_HELPER_INVOCATION,
SYSTEM_VALUE_COLOR0,
SYSTEM_VALUE_COLOR1,
/*@}*/
/**
* \name Tessellation Evaluation shader system values
*/
/*@{*/
SYSTEM_VALUE_TESS_COORD,
SYSTEM_VALUE_VERTICES_IN, /**< Tessellation vertices in input patch */
SYSTEM_VALUE_PRIMITIVE_ID,
SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
SYSTEM_VALUE_TESS_LEVEL_OUTER_DEFAULT, /**< TCS input for passthru TCS */
SYSTEM_VALUE_TESS_LEVEL_INNER_DEFAULT, /**< TCS input for passthru TCS */
/*@}*/
/**
* \name Compute shader system values
*/
/*@{*/
SYSTEM_VALUE_LOCAL_INVOCATION_ID,
SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
SYSTEM_VALUE_GLOBAL_INVOCATION_ID,
SYSTEM_VALUE_GLOBAL_INVOCATION_INDEX,
SYSTEM_VALUE_WORK_GROUP_ID,
SYSTEM_VALUE_NUM_WORK_GROUPS,
SYSTEM_VALUE_LOCAL_GROUP_SIZE,
SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
SYSTEM_VALUE_WORK_DIM,
SYSTEM_VALUE_USER_DATA_AMD,
/*@}*/
/** Required for VK_KHR_device_group */
SYSTEM_VALUE_DEVICE_INDEX,
/** Required for VK_KHX_multiview */
SYSTEM_VALUE_VIEW_INDEX,
/**
* Driver internal vertex-count, used (for example) for drivers to
* calculate stride for stream-out outputs. Not externally visible.
*/
SYSTEM_VALUE_VERTEX_CNT,
/**
* Required for AMD_shader_explicit_vertex_parameter and also used for
* varying-fetch instructions.
*
* The _SIZE value is "primitive size", used to scale i/j in primitive
* space to pixel space.
*/
SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE,
SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID,
SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID,
SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE,
SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL,
/**
* IR3-specific geometry shader and tessellation control shader system
* values that pack the invocation ID, thread ID, and vertex ID. Having this
* as a NIR-level system value lets us do the unpacking in NIR.
*/
SYSTEM_VALUE_GS_HEADER_IR3,
SYSTEM_VALUE_TCS_HEADER_IR3,
SYSTEM_VALUE_MAX /**< Number of values */
} gl_system_value;
const char *gl_system_value_name(gl_system_value sysval);
/**
* The possible interpolation qualifiers that can be applied to a fragment
* shader input in GLSL.
*
* Note: INTERP_MODE_NONE must be 0 so that memsetting the
* ir_variable data structure to 0 causes the default behavior.
*/
enum glsl_interp_mode
{
INTERP_MODE_NONE = 0,
INTERP_MODE_SMOOTH,
INTERP_MODE_FLAT,
INTERP_MODE_NOPERSPECTIVE,
INTERP_MODE_EXPLICIT,
INTERP_MODE_COUNT /**< Number of interpolation qualifiers */
};
enum glsl_interface_packing {
GLSL_INTERFACE_PACKING_STD140,
GLSL_INTERFACE_PACKING_SHARED,
GLSL_INTERFACE_PACKING_PACKED,
GLSL_INTERFACE_PACKING_STD430
};
const char *glsl_interp_mode_name(enum glsl_interp_mode qual);
/**
* Fragment program results
*/
typedef enum
{
FRAG_RESULT_DEPTH = 0,
FRAG_RESULT_STENCIL = 1,
/* If a single color should be written to all render targets, this
* register is written. No FRAG_RESULT_DATAn will be written.
*/
FRAG_RESULT_COLOR = 2,
FRAG_RESULT_SAMPLE_MASK = 3,
/* FRAG_RESULT_DATAn are the per-render-target (GLSL gl_FragData[n]
* or ARB_fragment_program fragment.color[n]) color results. If
* any are written, FRAG_RESULT_COLOR will not be written.
* FRAG_RESULT_DATA1 and up are simply for the benefit of
* gl_frag_result_name() and not to be construed as an upper bound
*/
FRAG_RESULT_DATA0 = 4,
FRAG_RESULT_DATA1,
FRAG_RESULT_DATA2,
FRAG_RESULT_DATA3,
FRAG_RESULT_DATA4,
FRAG_RESULT_DATA5,
FRAG_RESULT_DATA6,
FRAG_RESULT_DATA7,
} gl_frag_result;
const char *gl_frag_result_name(gl_frag_result result);
#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
/**
* \brief Layout qualifiers for gl_FragDepth.
*
* Extension AMD_conservative_depth allows gl_FragDepth to be redeclared with
* a layout qualifier.
*
* \see enum ir_depth_layout
*/
enum gl_frag_depth_layout
{
FRAG_DEPTH_LAYOUT_NONE, /**< No layout is specified. */
FRAG_DEPTH_LAYOUT_ANY,
FRAG_DEPTH_LAYOUT_GREATER,
FRAG_DEPTH_LAYOUT_LESS,
FRAG_DEPTH_LAYOUT_UNCHANGED
};
/**
* \brief Buffer access qualifiers
*/
enum gl_access_qualifier
{
ACCESS_COHERENT = (1 << 0),
ACCESS_RESTRICT = (1 << 1),
ACCESS_VOLATILE = (1 << 2),
ACCESS_NON_READABLE = (1 << 3),
ACCESS_NON_WRITEABLE = (1 << 4),
/** The access may use a non-uniform buffer or image index */
ACCESS_NON_UNIFORM = (1 << 5),
/* This has the same semantics as NIR_INTRINSIC_CAN_REORDER, only to be
* used with loads. In other words, it means that the load can be
* arbitrarily reordered, or combined with other loads to the same address.
* It is implied by ACCESS_NON_WRITEABLE together with ACCESS_RESTRICT, and
* a lack of ACCESS_COHERENT and ACCESS_VOLATILE.
*/
ACCESS_CAN_REORDER = (1 << 6),
/** Use as little cache space as possible. */
ACCESS_STREAM_CACHE_POLICY = (1 << 7),
};
/**
* \brief Blend support qualifiers
*/
enum gl_advanced_blend_mode
{
BLEND_NONE = 0x0000,
BLEND_MULTIPLY = 0x0001,
BLEND_SCREEN = 0x0002,
BLEND_OVERLAY = 0x0004,
BLEND_DARKEN = 0x0008,
BLEND_LIGHTEN = 0x0010,
BLEND_COLORDODGE = 0x0020,
BLEND_COLORBURN = 0x0040,
BLEND_HARDLIGHT = 0x0080,
BLEND_SOFTLIGHT = 0x0100,
BLEND_DIFFERENCE = 0x0200,
BLEND_EXCLUSION = 0x0400,
BLEND_HSL_HUE = 0x0800,
BLEND_HSL_SATURATION = 0x1000,
BLEND_HSL_COLOR = 0x2000,
BLEND_HSL_LUMINOSITY = 0x4000,
BLEND_ALL = 0x7fff,
};
enum blend_func
{
BLEND_FUNC_ADD,
BLEND_FUNC_SUBTRACT,
BLEND_FUNC_REVERSE_SUBTRACT,
BLEND_FUNC_MIN,
BLEND_FUNC_MAX,
};
enum blend_factor
{
BLEND_FACTOR_ZERO,
BLEND_FACTOR_SRC_COLOR,
BLEND_FACTOR_DST_COLOR,
BLEND_FACTOR_SRC_ALPHA,
BLEND_FACTOR_DST_ALPHA,
BLEND_FACTOR_CONSTANT_COLOR,
BLEND_FACTOR_CONSTANT_ALPHA,
BLEND_FACTOR_SRC_ALPHA_SATURATE,
};
enum gl_tess_spacing
{
TESS_SPACING_UNSPECIFIED,
TESS_SPACING_EQUAL,
TESS_SPACING_FRACTIONAL_ODD,
TESS_SPACING_FRACTIONAL_EVEN,
};
/**
* A compare function enum for use in compiler lowering passes. This is in
* the same order as GL's compare functions (shifted down by GL_NEVER), and is
* exactly the same as gallium's PIPE_FUNC_*.
*/
enum compare_func
{
COMPARE_FUNC_NEVER,
COMPARE_FUNC_LESS,
COMPARE_FUNC_EQUAL,
COMPARE_FUNC_LEQUAL,
COMPARE_FUNC_GREATER,
COMPARE_FUNC_NOTEQUAL,
COMPARE_FUNC_GEQUAL,
COMPARE_FUNC_ALWAYS,
};
/**
* Arrangements for grouping invocations from NV_compute_shader_derivatives.
*
* The extension provides new layout qualifiers that support two different
* arrangements of compute shader invocations for the purpose of derivative
* computation. When specifying
*
* layout(derivative_group_quadsNV) in;
*
* compute shader invocations are grouped into 2x2x1 arrays whose four local
* invocation ID values follow the pattern:
*
* +-----------------+------------------+
* | (2x+0, 2y+0, z) | (2x+1, 2y+0, z) |
* +-----------------+------------------+
* | (2x+0, 2y+1, z) | (2x+1, 2y+1, z) |
* +-----------------+------------------+
*
* where Y increases from bottom to top. When specifying
*
* layout(derivative_group_linearNV) in;
*
* compute shader invocations are grouped into 2x2x1 arrays whose four local
* invocation index values follow the pattern:
*
* +------+------+
* | 4n+0 | 4n+1 |
* +------+------+
* | 4n+2 | 4n+3 |
* +------+------+
*
* If neither layout qualifier is specified, derivatives in compute shaders
* return zero, which is consistent with the handling of built-in texture
* functions like texture() in GLSL 4.50 compute shaders.
*/
enum gl_derivative_group {
DERIVATIVE_GROUP_NONE = 0,
DERIVATIVE_GROUP_QUADS,
DERIVATIVE_GROUP_LINEAR,
};
enum float_controls
{
FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE = 0x0000,
FLOAT_CONTROLS_DENORM_PRESERVE_FP16 = 0x0001,
FLOAT_CONTROLS_DENORM_PRESERVE_FP32 = 0x0002,
FLOAT_CONTROLS_DENORM_PRESERVE_FP64 = 0x0004,
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 = 0x0008,
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32 = 0x0010,
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64 = 0x0020,
FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 = 0x0040,
FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32 = 0x0080,
FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64 = 0x0100,
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 = 0x0200,
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 = 0x0400,
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64 = 0x0800,
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 = 0x1000,
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 = 0x2000,
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 = 0x4000,
};
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* SHADER_ENUMS_H */
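
A usage sketch, not part of the commit: how the stage bounds and the compute check above are typically combined. It assumes gl_shader_stage_name() is linked in from its implementation, which is not in this diff.

#include <stdio.h>

void print_stages(void)
{
  /* MESA_SHADER_STAGES bounds the GL pipeline; MESA_ALL_SHADER_STAGES also
   * covers MESA_SHADER_KERNEL (CL). */
  for (int s = 0; s < MESA_ALL_SHADER_STAGES; s++) {
    gl_shader_stage stage = (gl_shader_stage)s;
    printf("%-24s gl_stage=%d compute=%d\n",
           gl_shader_stage_name(stage),
           s < MESA_SHADER_STAGES,
           gl_shader_stage_is_compute(stage));
  }
}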


@@ -0,0 +1,261 @@
/*
* Mesa 3-D graphics library
*
* Copyright (C) 2006 Brian Paul All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* \file bitset.h
* \brief Bitset of arbitrary size definitions.
* \author Michal Krol
*/
#ifndef BITSET_H
#define BITSET_H
//#include "util/bitscan.h"
//#include "util/macros.h"
/****************************************************************************
* generic bitset implementation
*/
#define BITSET_WORD unsigned int
#define BITSET_WORDBITS (sizeof (BITSET_WORD) * 8)
/* bitset declarations
*/
#define BITSET_WORDS(bits) (((bits) + BITSET_WORDBITS - 1) / BITSET_WORDBITS)
#define BITSET_DECLARE(name, bits) BITSET_WORD name[BITSET_WORDS(bits)]
/* bitset operations
*/
#define BITSET_COPY(x, y) memcpy( (x), (y), sizeof (x) )
#define BITSET_EQUAL(x, y) (memcmp( (x), (y), sizeof (x) ) == 0)
#define BITSET_ZERO(x) memset( (x), 0, sizeof (x) )
#define BITSET_ONES(x) memset( (x), 0xff, sizeof (x) )
#define BITSET_BITWORD(b) ((b) / BITSET_WORDBITS)
#define BITSET_BIT(b) (1u << ((b) % BITSET_WORDBITS))
/* single bit operations
*/
#define BITSET_TEST(x, b) (((x)[BITSET_BITWORD(b)] & BITSET_BIT(b)) != 0)
#define BITSET_SET(x, b) ((x)[BITSET_BITWORD(b)] |= BITSET_BIT(b))
#define BITSET_CLEAR(x, b) ((x)[BITSET_BITWORD(b)] &= ~BITSET_BIT(b))
#define BITSET_MASK(b) (((b) % BITSET_WORDBITS == 0) ? ~0 : BITSET_BIT(b) - 1)
#define BITSET_RANGE(b, e) ((BITSET_MASK((e) + 1)) & ~(BITSET_BIT(b) - 1))
/* bit range operations
*/
#define BITSET_TEST_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
(((x)[BITSET_BITWORD(b)] & BITSET_RANGE(b, e)) != 0) : \
(assert (!"BITSET_TEST_RANGE: bit range crosses word boundary"), 0))
#define BITSET_SET_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
((x)[BITSET_BITWORD(b)] |= BITSET_RANGE(b, e)) : \
(assert (!"BITSET_SET_RANGE: bit range crosses word boundary"), 0))
#define BITSET_CLEAR_RANGE(x, b, e) \
(BITSET_BITWORD(b) == BITSET_BITWORD(e) ? \
((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
(assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
/* Get first bit set in a bitset.
*/
static inline int
__bitset_ffs(const BITSET_WORD *x, int n)
{
int i;
for (i = 0; i < n; i++) {
if (x[i])
return ffs(x[i]) + BITSET_WORDBITS * i;
}
return 0;
}
#define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
static inline unsigned
__bitset_next_set(unsigned i, BITSET_WORD *tmp,
const BITSET_WORD *set, unsigned size)
{
unsigned bit, word;
/* NOTE: The initial conditions for this function are very specific. At
* the start of the loop, the tmp variable must be set to *set and the
* initial i value set to 0. This way, if there is a bit set in the first
* word, we ignore the i-value and just grab that bit (so 0 is ok, even
* though 0 may be returned). If the first word is 0, then the value of
* `word` will be 0 and we will go on to look at the second word.
*/
word = BITSET_BITWORD(i);
while (*tmp == 0) {
word++;
if (word >= BITSET_WORDS(size))
return size;
*tmp = set[word];
}
/* Find the next set bit in the non-zero word */
bit = ffs(*tmp) - 1;
/* Unset the bit */
*tmp &= ~(1ull << bit);
return word * BITSET_WORDBITS + bit;
}
/**
* Iterates over each set bit in a set
*
* @param __i iteration variable, bit number
* @param __set the bitset to iterate (will not be modified)
* @param __size number of bits in the set to consider
*/
#define BITSET_FOREACH_SET(__i, __set, __size) \
for (BITSET_WORD __tmp = *(__set), *__foo = &__tmp; __foo != NULL; __foo = NULL) \
for (__i = 0; \
(__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
#ifdef __cplusplus
/**
* Simple C++ wrapper of a bitset type of static size, with value semantics
* and basic bitwise arithmetic operators. The operators defined below are
* expected to have the same semantics as the same operator applied to other
* fundamental integer types. T is the name of the struct to instantiate
* it as, and N is the number of bits in the bitset.
*/
#define DECLARE_BITSET_T(T, N) struct T { \
EXPLICIT_CONVERSION \
operator bool() const \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
if (words[i]) \
return true; \
return false; \
} \
\
T & \
operator=(int x) \
{ \
const T c = {{ (BITSET_WORD)x }}; \
return *this = c; \
} \
\
friend bool \
operator==(const T &b, const T &c) \
{ \
return BITSET_EQUAL(b.words, c.words); \
} \
\
friend bool \
operator!=(const T &b, const T &c) \
{ \
return !(b == c); \
} \
\
friend bool \
operator==(const T &b, int x) \
{ \
const T c = {{ (BITSET_WORD)x }}; \
return b == c; \
} \
\
friend bool \
operator!=(const T &b, int x) \
{ \
return !(b == x); \
} \
\
friend T \
operator~(const T &b) \
{ \
T c; \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
c.words[i] = ~b.words[i]; \
return c; \
} \
\
T & \
operator|=(const T &b) \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
words[i] |= b.words[i]; \
return *this; \
} \
\
friend T \
operator|(const T &b, const T &c) \
{ \
T d = b; \
d |= c; \
return d; \
} \
\
T & \
operator&=(const T &b) \
{ \
for (unsigned i = 0; i < BITSET_WORDS(N); i++) \
words[i] &= b.words[i]; \
return *this; \
} \
\
friend T \
operator&(const T &b, const T &c) \
{ \
T d = b; \
d &= c; \
return d; \
} \
\
bool \
test(unsigned i) const \
{ \
return BITSET_TEST(words, i); \
} \
\
T & \
set(unsigned i) \
{ \
BITSET_SET(words, i); \
return *this; \
} \
\
T & \
clear(unsigned i) \
{ \
BITSET_CLEAR(words, i); \
return *this; \
} \
\
BITSET_WORD words[BITSET_WORDS(N)]; \
}
#endif
#endif
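
A usage sketch, not part of the commit: the declare/set/iterate macros above working together.

#include <stdio.h>

static void bitset_demo(void)
{
  BITSET_DECLARE(mask, 128); /* 128 bits -> four 32-bit words */
  BITSET_ZERO(mask);
  BITSET_SET(mask, 3);
  BITSET_SET(mask, 70);

  unsigned i;
  BITSET_FOREACH_SET(i, mask, 128)
    printf("bit %u is set\n", i); /* prints 3, then 70 */
}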


@@ -0,0 +1,262 @@
/**************************************************************************
*
* Copyright 2006 VMware, Inc., Bismarck, ND. USA.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
**************************************************************************/
/**
* \file
* List macros heavily inspired by the Linux kernel
* list handling, including simple looping macros.
*
* Is not threadsafe, so common operations need to
* be protected using an external mutex.
*/
#ifndef _UTIL_LIST_H_
#define _UTIL_LIST_H_
#include <stdbool.h>
#include <stddef.h>
#include <assert.h>
#ifdef DEBUG
# define list_assert(cond, msg) assert(cond && msg)
#else
# define list_assert(cond, msg) (void)(0 && (cond))
#endif
struct list_head
{
struct list_head *prev;
struct list_head *next;
};
static inline void list_inithead(struct list_head *item)
{
item->prev = item;
item->next = item;
}
static inline void list_add(struct list_head *item, struct list_head *list)
{
item->prev = list;
item->next = list->next;
list->next->prev = item;
list->next = item;
}
static inline void list_addtail(struct list_head *item, struct list_head *list)
{
item->next = list;
item->prev = list->prev;
list->prev->next = item;
list->prev = item;
}
static inline bool list_is_empty(const struct list_head *list);
static inline void list_replace(struct list_head *from, struct list_head *to)
{
if (list_is_empty(from)) {
list_inithead(to);
} else {
to->prev = from->prev;
to->next = from->next;
from->next->prev = to;
from->prev->next = to;
}
}
static inline void list_del(struct list_head *item)
{
item->prev->next = item->next;
item->next->prev = item->prev;
item->prev = item->next = NULL;
}
static inline void list_delinit(struct list_head *item)
{
item->prev->next = item->next;
item->next->prev = item->prev;
item->next = item;
item->prev = item;
}
static inline bool list_is_empty(const struct list_head *list)
{
return list->next == list;
}
/**
* Returns whether the list has exactly one element.
*/
static inline bool list_is_singular(const struct list_head *list)
{
return list->next != NULL && list->next != list && list->next->next == list;
}
static inline unsigned list_length(const struct list_head *list)
{
struct list_head *node;
unsigned length = 0;
for (node = list->next; node != list; node = node->next)
length++;
return length;
}
static inline void list_splice(struct list_head *src, struct list_head *dst)
{
if (list_is_empty(src))
return;
src->next->prev = dst;
src->prev->next = dst->next;
dst->next->prev = src->prev;
dst->next = src->next;
}
static inline void list_splicetail(struct list_head *src, struct list_head *dst)
{
if (list_is_empty(src))
return;
src->prev->next = dst;
src->next->prev = dst->prev;
dst->prev->next = src->next;
dst->prev = src->prev;
}
static inline void list_validate(const struct list_head *list)
{
struct list_head *node;
assert(list->next->prev == list && list->prev->next == list);
for (node = list->next; node != list; node = node->next)
assert(node->next->prev == node && node->prev->next == node);
}
#define LIST_ENTRY(__type, __item, __field) \
((__type *)(((char *)(__item)) - offsetof(__type, __field)))
/**
* Cast from a pointer to a member of a struct back to the containing struct.
*
* 'sample' MUST be initialized, or else the result is undefined!
*/
#ifndef container_of
#define container_of(ptr, sample, member) \
(void *)((char *)(ptr) \
- ((char *)&(sample)->member - (char *)(sample)))
#endif
#define list_first_entry(ptr, type, member) \
LIST_ENTRY(type, (ptr)->next, member)
#define list_last_entry(ptr, type, member) \
LIST_ENTRY(type, (ptr)->prev, member)
#define LIST_FOR_EACH_ENTRY(pos, head, member) \
for (pos = NULL, pos = container_of((head)->next, pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.next, pos, member))
#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member) \
for (pos = NULL, pos = container_of((head)->next, pos, member), \
storage = container_of(pos->member.next, pos, member); \
&pos->member != (head); \
pos = storage, storage = container_of(storage->member.next, storage, member))
#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member) \
for (pos = NULL, pos = container_of((head)->prev, pos, member), \
storage = container_of(pos->member.prev, pos, member); \
&pos->member != (head); \
pos = storage, storage = container_of(storage->member.prev, storage, member))
#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member) \
for (pos = NULL, pos = container_of((start), pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.next, pos, member))
#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member) \
for (pos = NULL, pos = container_of((start), pos, member); \
&pos->member != (head); \
pos = container_of(pos->member.prev, pos, member))
#define list_for_each_entry(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->next, member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.next, member), \
list_assert(pos == __next, "use _safe iterator"), \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_safe(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->next, member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = __next, \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_rev(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->prev, member), \
*__prev = LIST_ENTRY(type, pos->member.prev, member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.prev, member), \
list_assert(pos == __prev, "use _safe iterator"), \
__prev = LIST_ENTRY(type, __prev->member.prev, member))
#define list_for_each_entry_safe_rev(type, pos, head, member) \
for (type *pos = LIST_ENTRY(type, (head)->prev, member), \
*__prev = LIST_ENTRY(type, pos->member.prev, member); \
&pos->member != (head); \
pos = __prev, \
__prev = LIST_ENTRY(type, __prev->member.prev, member))
#define list_for_each_entry_from(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.next, member))
#define list_for_each_entry_from_safe(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member), \
*__next = LIST_ENTRY(type, pos->member.next, member); \
&pos->member != (head); \
pos = __next, \
__next = LIST_ENTRY(type, __next->member.next, member))
#define list_for_each_entry_from_rev(type, pos, start, head, member) \
for (type *pos = LIST_ENTRY(type, (start), member); \
&pos->member != (head); \
pos = LIST_ENTRY(type, pos->member.prev, member))
#define list_pair_for_each_entry(type, pos1, pos2, head1, head2, member) \
for (type *pos1 = LIST_ENTRY(type, (head1)->next, member), \
*pos2 = LIST_ENTRY(type, (head2)->next, member); \
&pos1->member != (head1) && &pos2->member != (head2); \
pos1 = LIST_ENTRY(type, pos1->member.next, member), \
pos2 = LIST_ENTRY(type, pos2->member.next, member))
#endif /*_UTIL_LIST_H_*/
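
A usage sketch, not part of the commit: an intrusive list of hypothetical `struct job` nodes, using only the macros defined above.

#include <stdio.h>

struct job {
  int id;
  struct list_head link; /* embedded list node */
};

static void job_demo(void)
{
  struct list_head queue;
  struct job a = { .id = 1 }, b = { .id = 2 };

  list_inithead(&queue);
  list_addtail(&a.link, &queue); /* queue: a */
  list_addtail(&b.link, &queue); /* queue: a, b */

  list_for_each_entry(struct job, j, &queue, link)
    printf("job %d\n", j->id); /* prints 1, then 2 */
}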


@@ -0,0 +1,346 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef UTIL_MACROS_H
#define UTIL_MACROS_H
#include <assert.h>
/* Compute the size of an array */
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
/* For compatibility with Clang's __has_builtin() */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
/**
* __builtin_expect macros
*/
#if !defined(HAVE___BUILTIN_EXPECT)
# define __builtin_expect(x, y) (x)
#endif
#ifndef likely
# ifdef HAVE___BUILTIN_EXPECT
# define likely(x) __builtin_expect(!!(x), 1)
# define unlikely(x) __builtin_expect(!!(x), 0)
# else
# define likely(x) (x)
# define unlikely(x) (x)
# endif
#endif
/**
* Static (compile-time) assertion.
* Basically, use COND to dimension an array. If COND is false/zero the
* array size will be -1 and we'll get a compilation error.
*/
#define STATIC_ASSERT(COND) \
do { \
(void) sizeof(char [1 - 2*!(COND)]); \
} while (0)
/**
* Unreachable macro. Useful for suppressing "control reaches end of non-void
* function" warnings.
*/
#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
#define unreachable(str) \
do { \
assert(!str); \
__builtin_unreachable(); \
} while (0)
#elif defined (_MSC_VER)
#define unreachable(str) \
do { \
assert(!str); \
__assume(0); \
} while (0)
#else
#define unreachable(str) assert(!str)
#endif
/**
* Assume macro. Useful for expressing our assumptions to the compiler,
* typically for purposes of silencing warnings.
*/
#if __has_builtin(__builtin_assume)
#define assume(expr) \
do { \
assert(expr); \
__builtin_assume(expr); \
} while (0)
#elif defined HAVE___BUILTIN_UNREACHABLE
#define assume(expr) ((expr) ? ((void) 0) \
: (assert(!"assumption failed"), \
__builtin_unreachable()))
#elif defined (_MSC_VER)
#define assume(expr) __assume(expr)
#else
#define assume(expr) assert(expr)
#endif
/* Attribute const is used for functions that have no effects other than their
* return value, and only rely on the argument values to compute the return
* value. As a result, calls to it can be CSEed. Note that using memory
* pointed to by the arguments is not allowed for const functions.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_CONST
#define ATTRIBUTE_CONST __attribute__((__const__))
#else
#define ATTRIBUTE_CONST
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
#define FLATTEN __attribute__((__flatten__))
#else
#define FLATTEN
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT
#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
#else
#define PRINTFLIKE(f, a)
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC
#define MALLOCLIKE __attribute__((__malloc__))
#else
#define MALLOCLIKE
#endif
/* Forced function inlining */
/* Note: Clang also sets __GNUC__ (see other cases below) */
#ifndef ALWAYS_INLINE
# if defined(__GNUC__)
# define ALWAYS_INLINE inline __attribute__((always_inline))
# elif defined(_MSC_VER)
# define ALWAYS_INLINE __forceinline
# else
# define ALWAYS_INLINE inline
# endif
#endif
/* Used to optionally mark structures with misaligned elements or size as
* packed, to trade off performance for space.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_PACKED
#define PACKED __attribute__((__packed__))
#else
#define PACKED
#endif
/* Attribute pure is used for functions that have no effects other than their
* return value. As a result, calls to it can be dead code eliminated.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_PURE
#define ATTRIBUTE_PURE __attribute__((__pure__))
#else
#define ATTRIBUTE_PURE
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL
#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__))
#else
#define ATTRIBUTE_RETURNS_NONNULL
#endif
#ifndef NORETURN
# ifdef _MSC_VER
# define NORETURN __declspec(noreturn)
# elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
# define NORETURN __attribute__((__noreturn__))
# else
# define NORETURN
# endif
#endif
#ifdef __cplusplus
/**
* Macro function that evaluates to true if T is a trivially
* destructible type -- that is, if its (non-virtual) destructor
* performs no action and all member variables and base classes are
* trivially destructible themselves.
*/
# if (defined(__clang__) && defined(__has_feature))
# if __has_feature(has_trivial_destructor)
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# elif defined(__GNUC__)
# if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)))
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T)
# endif
# ifndef HAS_TRIVIAL_DESTRUCTOR
/* It's always safe (if inefficient) to assume that a
* destructor is non-trivial.
*/
# define HAS_TRIVIAL_DESTRUCTOR(T) (false)
# endif
#endif
/**
* PUBLIC/USED macros
*
* If we build the library with gcc's -fvisibility=hidden flag, we'll
* use the PUBLIC macro to mark functions that are to be exported.
*
* We also need to define a USED attribute, so the optimizer doesn't
* inline a static function that we later use in an alias. - ajax
*/
#ifndef PUBLIC
# if defined(__GNUC__)
# define PUBLIC __attribute__((visibility("default")))
# define USED __attribute__((used))
# elif defined(_MSC_VER)
# define PUBLIC __declspec(dllexport)
# define USED
# else
# define PUBLIC
# define USED
# endif
#endif
/**
* UNUSED marks variables (or sometimes functions) that have to be defined,
* but are sometimes (or always) unused beyond that. A common case is for
* a function parameter to be used in some build configurations but not others.
* Another case is fallback vfuncs that don't do anything with their params.
*
* Note that this should not be used for identifiers used in `assert()`;
* see ASSERTED below.
*/
#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED
#define UNUSED __attribute__((unused))
#else
#define UNUSED
#endif
/**
* Use ASSERTED to indicate that an identifier is unused outside of an `assert()`,
* so that assert-free builds don't get "unused variable" warnings.
*/
#ifdef NDEBUG
#define ASSERTED UNUSED
#else
#define ASSERTED
#endif
#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
#define MUST_CHECK __attribute__((warn_unused_result))
#else
#define MUST_CHECK
#endif
#if defined(__GNUC__)
#define ATTRIBUTE_NOINLINE __attribute__((noinline))
#else
#define ATTRIBUTE_NOINLINE
#endif
/**
* Check that STRUCT::FIELD can hold MAXVAL. We use a lot of bitfields
* in Mesa/gallium. We have to be sure they're of sufficient size to
* hold the largest expected value.
* Note that with MSVC, enums are signed and enum bitfields need one extra
* high bit (always zero) to ensure the max value is handled correctly.
* This macro will detect that with MSVC, but not GCC.
*/
#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \
do { \
ASSERTED STRUCT s; \
s.FIELD = (MAXVAL); \
assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \
} while (0)
/** Compute ceiling of integer quotient of A divided by B. */
#define DIV_ROUND_UP( A, B ) ( ((A) + (B) - 1) / (B) )
/** Clamp X to [MIN,MAX]. Turn NaN into MIN, arbitrarily. */
#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
/** Minimum of two values: */
#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) )
/** Maximum of two values: */
#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) )
/** Minimum and maximum of three values: */
#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
/** Align a value to a power of two */
#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
/**
* Macro for declaring an explicit conversion operator. Defaults to an
* implicit conversion if C++11 is not supported.
*/
#if __cplusplus >= 201103L
#define EXPLICIT_CONVERSION explicit
#elif defined(__cplusplus)
#define EXPLICIT_CONVERSION
#endif
/** Set a single bit */
#define BITFIELD_BIT(b) (1u << (b))
/** Set all bits up to excluding bit b */
#define BITFIELD_MASK(b) \
((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1)
/** Set count bits starting from bit b */
#define BITFIELD_RANGE(b, count) \
(BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b))
/** Set a single bit */
#define BITFIELD64_BIT(b) (1ull << (b))
/** Set all bits up to excluding bit b */
#define BITFIELD64_MASK(b) \
((b) == 64 ? (~0ull) : BITFIELD64_BIT(b) - 1)
/** Set count bits starting from bit b */
#define BITFIELD64_RANGE(b, count) \
(BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
/* TODO: In future we should try to move this to u_debug.h once header
* dependencies are reorganised to allow this.
*/
enum pipe_debug_type
{
PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
PIPE_DEBUG_TYPE_ERROR,
PIPE_DEBUG_TYPE_SHADER_INFO,
PIPE_DEBUG_TYPE_PERF_INFO,
PIPE_DEBUG_TYPE_INFO,
PIPE_DEBUG_TYPE_FALLBACK,
PIPE_DEBUG_TYPE_CONFORMANCE,
};
#endif /* UTIL_MACROS_H */
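
A worked example, not part of the commit, of the bitfield helpers above: BITFIELD_MASK(b) sets bits [0, b) and BITFIELD_RANGE(b, count) sets [b, b + count).

#include <assert.h>

static void bitfield_demo(void)
{
  assert(BITFIELD_BIT(3) == 0x8u);
  assert(BITFIELD_MASK(8) == 0xffu);
  assert(BITFIELD_RANGE(4, 4) == 0xf0u); /* bits 4..7 */
  assert(BITFIELD64_RANGE(32, 32) == 0xffffffff00000000ull);
}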


@@ -0,0 +1,132 @@
#include "debug/include/adreno_pm4types.h"
#define REG_A5XX_TPL1_CS_TEX_CONST_LO 0x0000e760
#define REG_A5XX_TPL1_CS_TEX_SAMP_LO 0x0000e75c
#define REG_A5XX_SP_CS_CTRL_REG0 0x0000e5f0
std::map<int, std::string> regs = {
{0x0000e760, "REG_A5XX_TPL1_CS_TEX_CONST_LO"},
{0x0000e75c, "REG_A5XX_TPL1_CS_TEX_SAMP_LO"},
{0x00000e06, "REG_A5XX_HLSQ_MODE_CNTL"},
{0x00000e91, "REG_A5XX_UCHE_CACHE_INVALIDATE_MIN_LO"},
{0x00000ec2, "REG_A5XX_SP_MODE_CNTL"},
{0x0000e580, "REG_A5XX_SP_SP_CNTL"},
{0x0000e5f0, "REG_A5XX_SP_CS_CTRL_REG0"},
{0x0000e796, "REG_A5XX_HLSQ_CS_CNTL"},
{0x0000e784, "REG_A5XX_HLSQ_CONTROL_0_REG"},
{0x0000e7b0, "REG_A5XX_HLSQ_CS_NDRANGE_0"},
{0x0000e7b9, "REG_A5XX_HLSQ_CS_KERNEL_GROUP_X"},
{0x00000cdd, "REG_A5XX_VSC_RESOLVE_CNTL"},
};
std::map<int, std::string> ops = {
{33, "CP_REG_RMW"},
{62, "CP_REG_TO_MEM"},
{49, "CP_RUN_OPENCL"},
{16, "CP_NOP"},
{38, "CP_WAIT_FOR_IDLE"},
{110, "CP_COMPUTE_CHECKPOINT"},
{48, "CP_LOAD_STATE"},
};
void CachedCommand::disassemble() {
uint32_t *src = (uint32_t *)cmds[1].gpuaddr;
int len = cmds[1].size/4;
printf("disassemble %p %d\n", src, len);
int i = 0;
while (i < len) {
int pktsize = 0; // stays 0 (and we bail out below) when the packet type is unknown
int pkttype = -1;
if (pkt_is_type0(src[i])) {
pkttype = 0;
pktsize = type0_pkt_size(src[i]);
} else if (pkt_is_type3(src[i])) {
pkttype = 3;
pktsize = type3_pkt_size(src[i]);
} else if (pkt_is_type4(src[i])) {
pkttype = 4;
pktsize = type4_pkt_size(src[i]);
} else if (pkt_is_type7(src[i])) {
pkttype = 7;
pktsize = type7_pkt_size(src[i]);
}
printf("%3d: type:%d size:%d ", i, pkttype, pktsize);
if (pkttype == 7) {
int op = cp_type7_opcode(src[i]);
if (ops.find(op) != ops.end()) {
printf("%-40s ", ops[op].c_str());
} else {
printf("op: %4d ", op);
}
}
if (pkttype == 4) {
int reg = cp_type4_base_index_one_reg_wr(src[i]);
if (regs.find(reg) != regs.end()) {
printf("%-40s ", regs[reg].c_str());
} else {
printf("reg: %4x ", reg);
}
}
for (int j = 0; j < pktsize+1; j++) {
printf("%8.8X ", src[i+j]);
}
printf("\n");
uint64_t addr;
if (pkttype == 7) {
switch (cp_type7_opcode(src[i])) {
case CP_LOAD_STATE: {
int dst_off = src[i+1] & 0x1FFF;
int state_src = (src[i+1] >> 16) & 3;
int state_block = (src[i+1] >> 18) & 7;
int state_type = src[i+2] & 3;
int num_unit = (src[i+1] & 0xffc00000) >> 22;
printf(" dst_off: %x state_src: %d state_block: %d state_type: %d num_unit: %d\n",
dst_off, state_src, state_block, state_type, num_unit);
addr = (uint64_t)(src[i+2] & 0xfffffffc) | ((uint64_t)(src[i+3]) << 32);
if (state_block == 5 && state_type == 0) {
if (!(addr&0xFFF)) {
int dump_len = 0x1000; // renamed from len: don't shadow the packet-stream length
if (num_unit >= 32) dump_len += 0x1000;
//hexdump((uint32_t *)addr, dump_len);
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/0x%lx.shader", addr);
printf("dumping %s\n", fn);
FILE *f = fopen(fn, "wb");
// groups of 16 instructions
fwrite((void*)addr, 1, dump_len, f);
fclose(f);
}
}
break;
}
}
}
/*if (pkttype == 4) {
switch (cp_type4_base_index_one_reg_wr(src[i])) {
case REG_A5XX_SP_CS_CTRL_REG0:
addr = (uint64_t)(src[i+4] & 0xfffffffc) | ((uint64_t)(src[i+5]) << 32);
hexdump((uint32_t *)addr, 0x1000);
break;
}
}*/
/*if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_CONST_LO) {
uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32);
hexdump((uint32_t *)addr, 0x40);
}
if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_SAMP_LO) {
uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32);
hexdump((uint32_t *)addr, 0x40);
}*/
if (pkttype == -1) break;
i += (1+pktsize);
}
assert(i == len);
}
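
The commented-out debug paths above call a hexdump() helper that is not part of this diff. A minimal stand-in with the assumed (uint32_t *, byte count) shape:

// Assumed helper, not from the commit: print len bytes as 32-bit words,
// eight per line, matching the %8.8X style used in disassemble().
static void hexdump(uint32_t *d, int len) {
  for (int i = 0; i < len/4; i++) {
    if (i != 0 && (i % 8) == 0) printf("\n");
    printf("%8.8X ", d[i]);
  }
  printf("\n");
}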


@@ -0,0 +1,51 @@
// https://github.com/moskewcz/boda/issues/13
#define USE_FP16
#ifdef USE_FP16
#define up(x) x
#define down(x) x
#define xtype half8
#define skip 128
#else
#define up(x) convert_float8(x)
#define down(x) convert_half8(x)
#define xtype float8
#define skip 128
#endif
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void gemm(const int M, const int N, const int K,
global const half8* a, global const half8* b, global half8* c )
{
xtype c_r[8] = {0,0,0,0,0,0,0,0};
int const a_off_thr = get_global_id(0);
int const b_off_thr = get_global_id(1);
int a_off = a_off_thr;
int b_off = b_off_thr;
for( int k = 0; k < 1024; k += 1 ) {
xtype a_r = up(a[a_off]);
xtype b_r = up(b[b_off]);
c_r[0] += a_r.s0*b_r;
c_r[1] += a_r.s1*b_r;
c_r[2] += a_r.s2*b_r;
c_r[3] += a_r.s3*b_r;
c_r[4] += a_r.s4*b_r;
c_r[5] += a_r.s5*b_r;
c_r[6] += a_r.s6*b_r;
c_r[7] += a_r.s7*b_r;
a_off += skip;
b_off += skip;
}
int c_off = a_off_thr*1024 + b_off_thr;
for (int i = 0; i < 8; i++) {
c[c_off] = down(c_r[i]);
c_off += skip;
}
}
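/* Usage sketch (an assumption, matching the benchmark harness in this commit):
 * with skip = 128 the kernel hard-codes rows of 128 half8 (K = 1024 halves).
 * Each work-item accumulates an 8x8 tile of C from one half8 column of A and
 * one of B per k step, so the expected dispatch is global = {M/8, N/8} =
 * {128, 128} work-items. */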

View File

@ -0,0 +1,75 @@
// https://github.com/moskewcz/boda/issues/13
//#define USE_FP16
#ifdef USE_FP16
#define xtype half4
#define read_imagep read_imageh
#define write_imagep write_imageh
#else
#define xtype float4
#define read_imagep read_imagef
#define write_imagep write_imagef
#endif
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void gemm(const int M, const int N, const int K,
read_only image2d_t A,
read_only image2d_t B,
write_only image2d_t C)
{
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
xtype c_r[4] = {0,0,0,0};
xtype a_r[4], b_r[4];
int const a_off_thr = get_global_id(0);
int const b_off_thr = get_global_id(1);
int2 a_samp = {0, a_off_thr};
int2 b_samp = {0, b_off_thr};
for (short k = 0; k < K/4; k++) {
for (short i = 0; i < 4; ++i) {
a_r[i] = read_imagep(A, smp, a_samp);
b_r[i] = read_imagep(B, smp, b_samp);
++a_samp.x;
++b_samp.x;
}
for (short i = 0; i < 4; ++i) {
xtype ov = c_r[i];  // use xtype so the USE_FP16 path also compiles (OpenCL has no implicit half4->float4)
ov.x += a_r[i].x * b_r[0].x;
ov.x += a_r[i].y * b_r[0].y;
ov.x += a_r[i].z * b_r[0].z;
ov.x += a_r[i].w * b_r[0].w;
ov.y += a_r[i].x * b_r[1].x;
ov.y += a_r[i].y * b_r[1].y;
ov.y += a_r[i].z * b_r[1].z;
ov.y += a_r[i].w * b_r[1].w;
ov.z += a_r[i].x * b_r[2].x;
ov.z += a_r[i].y * b_r[2].y;
ov.z += a_r[i].z * b_r[2].z;
ov.z += a_r[i].w * b_r[2].w;
ov.w += a_r[i].x * b_r[3].x;
ov.w += a_r[i].y * b_r[3].y;
ov.w += a_r[i].z * b_r[3].z;
ov.w += a_r[i].w * b_r[3].w;
c_r[i] = ov;
}
}
int2 c_samp = {a_off_thr, b_off_thr*4};
for (short i = 0; i < 4; i++) {
write_imagep(C, c_samp, c_r[i]);
++c_samp.y;
}
}
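/* Usage sketch (an assumption, matching the benchmark harness in this commit):
 * A and B are CL_RGBA / CL_HALF_FLOAT images of size K x M/4 and K x N/4, with
 * four matrix rows packed into the texel channels, and each work-item produces
 * a 4x4 tile of C, so the dispatch is global = {M/4, N/4}. */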

View File

@ -0,0 +1,314 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>   // memset, strlen
#include <stdint.h>   // uint64_t
#include <CL/cl.h>
#include <assert.h>
#include <time.h>
/*
block7b_project_conv (Conv2D) (None, 8, 16, 352) 743424 block7b_activation[0][0]
8448*8*4 = 8*16*2112 = 270336 = input = 128*2112
2112*88*4 = 743424 = weights = 2112*352
1408*8*4 = 8*16*352 = 45056 = output = 128*352
FLOPS = 128*2112*352 = 95158272 = 95 MFLOPS
RAM = 128*2112 + 2112*352 + 128*352 = 1058816 = 1 M accesses
# 22 groups
128*2112 + 2112*16 + 128*16 = 306176
306176*22 = 6735872 real accesses
This is a 128x2112 by 2112x352 matrix multiply
work_size = {88, 4, 8}
Each kernel run computes 16 outputs
0x7f7e8a6380 convolution_horizontal_reduced_reads_1x1 -- 88 4 8 -- 4 4 8
image2d_t input = 0x7f7f490b00 image 8448 x 8 rp 67840
short startPackedInputChannel = 0
short numPackedInputChannelsForGroup = 528
short totalNumPackedInputChannels = 528
short packedOuputChannelOffset = 0
short totalNumPackedOutputChannels = 88
image2d_t weights = 0x7f7f52fb80 image 2112 x 88 rp 16896
float* biases = 0x7f7f564d80 buffer 1408
short filterSizeX = 1
short filterSizeY = 1
image2d_t output = 0x7f7f490e80 image 1408 x 8 rp 11264
short paddingX = 0
short paddingY = 0
short strideX = 1
short strideY = 1
short neuron = 0
float a = 1.000000
float b = 1.000000
float min_clamp = 0.000000
float max_clamp = 0.000000
float* parameters = 0x0
float* batchNormBiases = 0x0
short numOutputColumns = 16
*/
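// Sanity checks for the arithmetic in the comment above (added sketch; needs
// C11 _Static_assert, which gcc provides):
_Static_assert(128*2112*352 == 95158272, "MACs for the 128x2112 by 2112x352 GEMM");
_Static_assert(128*2112 + 2112*352 + 128*352 == 1058816, "ideal memory accesses");
_Static_assert(22*(128*2112 + 2112*16 + 128*16) == 6735872, "real accesses with 22 groups of 16 outputs");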
#define GEMM
#define IMAGE
void dump_maps() {
FILE *f = fopen("/proc/self/maps", "rb");
char maps[0x100000];
int len = fread(maps, 1, sizeof(maps)-1, f);  // leave room for the terminator
maps[len] = '\0';
maps[0x800] = '\0';  // only print the first 2 KB
fclose(f);
printf("%s\n", maps);
}
static inline uint64_t nanos_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000000000ULL + t.tv_nsec;
}
int main(int argc, char *argv[]) {
cl_int err;
// cl init
cl_device_id device_id;
cl_context context;
cl_command_queue q;
{
cl_platform_id platform_id[2];
cl_uint num_devices;
cl_uint num_platforms;
err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms);
assert(err == 0);
err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices);
assert(err == 0);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
assert(err == 0);
q = clCreateCommandQueue(context, device_id, 0, &err);
assert(err == 0);
}
printf("cl ready\n");
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
FILE *f = fopen(argv[1], "rb");
assert(f != NULL);
fread(tmp, 1, sizeof(tmp)-1, f);  // keep a NUL terminator for strlen below
fclose(f);
const char *strings[1];
size_t lengths[1];
strings[0] = tmp;
lengths[0] = strlen(tmp);
cl_program prog = clCreateProgramWithSource(context, 1, strings, lengths, &err);
assert(err == 0);
printf("creating program\n");
err = clBuildProgram(prog, 1, &device_id, "-D AVANTE_IS_GPU_A530_64", NULL, NULL);
if (err != 0) {
printf("got err %d\n", err);
size_t length = 0;
char buffer[2048] = {0};
clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer)-1, buffer, &length);
buffer[length] = '\0';
printf("%s\n", buffer);
}
assert(err == 0);
printf("built program\n");
#ifdef GEMM
// 128x2112 by 2112x352
int M,N,K;
M = N = K = 1024;
//M = 128; K = 2112; N = 352;
cl_kernel kern = clCreateKernel(prog, "gemm", &err);
assert(err == 0);
printf("creating kernel %p\n", kern);
cl_mem A,B,C;
A = clCreateBuffer(context, CL_MEM_READ_WRITE, M*K*2, NULL, &err);
assert(err == 0);
B = clCreateBuffer(context, CL_MEM_READ_WRITE, K*N*2, NULL, &err);
assert(err == 0);
C = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*2, NULL, &err);
assert(err == 0);
printf("created buffers\n");
#ifdef IMAGE
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));  // also zeroes fields not set below (image_array_size, etc.)
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = K; desc.image_height = M/4;
desc.buffer = A;
desc.image_row_pitch = desc.image_width*8;
A = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = K; desc.image_height = N/4;
desc.buffer = B; desc.image_row_pitch = desc.image_width*8;
B = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = M/4; desc.image_height = N;
desc.buffer = C; desc.image_row_pitch = desc.image_width*8;
C = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
printf("created images\n");
#endif
clSetKernelArg(kern, 0, sizeof(int), &M);
clSetKernelArg(kern, 1, sizeof(int), &N);
clSetKernelArg(kern, 2, sizeof(int), &K);
clSetKernelArg(kern, 3, sizeof(cl_mem), &A);
clSetKernelArg(kern, 4, sizeof(cl_mem), &B);
clSetKernelArg(kern, 5, sizeof(cl_mem), &C);
printf("set args\n");
#ifdef IMAGE
size_t global_work_size[3] = {M/4, N/4, 1};
size_t local_work_size[3] = {4, 64, 1};
#else
size_t global_work_size[3] = {128, 128, 1};
size_t local_work_size[3] = {2, 128, 1};
#endif
#else
cl_kernel kern = clCreateKernel(prog, "convolution_horizontal_reduced_reads_1x1", &err);
assert(err == 0);
printf("creating kernel\n");
cl_mem input;
cl_mem weights;
cl_mem weights_buffer;
cl_mem biases;
cl_mem outputs;
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));  // also zeroes fields not set below (image_array_size, etc.)
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.buffer = NULL;
biases = clCreateBuffer(context, CL_MEM_READ_WRITE, 1408, NULL, &err);
assert(err == 0);
desc.image_width = 8448; desc.image_height = 8; desc.image_row_pitch = 67840;
desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
input = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = 2112; desc.image_height = 88; desc.image_row_pitch = 16896;
weights_buffer = desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
weights = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = 1408; desc.image_height = 8; desc.image_row_pitch = 11264;
desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
outputs = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
void *n = NULL;
uint16_t v;
float fl;
clSetKernelArg(kern, 0, sizeof(cl_mem), &input);
v = 0; clSetKernelArg(kern, 1, sizeof(v), &v);
v = 528; clSetKernelArg(kern, 2, sizeof(v), &v);
v = 528; clSetKernelArg(kern, 3, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 4, sizeof(v), &v);
v = 88; clSetKernelArg(kern, 5, sizeof(v), &v);
clSetKernelArg(kern, 6, sizeof(cl_mem), &weights);
//clSetKernelArg(kern, 6, sizeof(cl_mem), &weights_buffer);
clSetKernelArg(kern, 7, sizeof(cl_mem), &biases);
v = 1; clSetKernelArg(kern, 8, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 9, sizeof(v), &v);
clSetKernelArg(kern, 10, sizeof(cl_mem), &outputs);
v = 0; clSetKernelArg(kern, 11, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 12, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 13, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 14, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 15, sizeof(v), &v);
fl = 1.0; clSetKernelArg(kern, 16, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 17, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 18, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 19, sizeof(fl), &fl);
clSetKernelArg(kern, 20, sizeof(n), &n);
clSetKernelArg(kern, 21, sizeof(n), &n);
v = 16; clSetKernelArg(kern, 22, sizeof(v), &v);
size_t global_work_size[3] = {88, 4, 8};
size_t local_work_size[3] = {4, 4, 8};
#endif
printf("ready to enqueue\n");
for (int i = 0; i < 20; i++) {
cl_event event;
err = clEnqueueNDRangeKernel(q, kern, 3, NULL, global_work_size, local_work_size, 0, NULL, &event);
assert(err == 0);
uint64_t tb = nanos_since_boot();
err = clWaitForEvents(1, &event);
assert(err == 0);
uint64_t te = nanos_since_boot();
uint64_t us = (te-tb)/1000;
float s = 1000000.0/us;
#ifdef GEMM
float flops = M*N*K*s;
float rams = (M*N + N*K + M*K)*s;
#else
float flops = 95158272.0*s;
float rams = 1058816.0*s;
//float rams = 6735872.0*s;
#endif
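// note: flops counts each multiply-accumulate as one FLOP; the common
// 2*M*N*K GEMM convention would double the GFLOPS figure printed below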
printf("%2d: wait %lu us -- %.2f GFLOPS -- %.2f GB/s\n", i, us, flops/1e9, rams*2/1e9);
}
size_t binary_size = 0;
err = clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
assert(err == 0);
assert(binary_size > 0);
uint8_t *binary_buf = (uint8_t *)malloc(binary_size);
assert(binary_buf);
uint8_t* bufs[1] = { binary_buf, };
err = clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(bufs), bufs, NULL);
assert(err == 0);
FILE *g = fopen("/tmp/bin.bin", "wb");
fwrite(binary_buf, 1, binary_size, g);
fclose(g);
/*dump_maps();
for (uint64_t i = 0x7ffbd2000; i < 0x800000000; i += 0x1000) {
uint64_t cmd = *((uint64_t*)i);
printf("%llx: %llx\n", i, cmd);
}*/
return 0;
}

View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
gcc go.c -I/data/openpilot/phonelibs/opencl/include -L/system/vendor/lib64 -lOpenCL -lCB -lgsl  # libraries after the source so the linker resolves its symbols
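# usage sketch: build, then pass an OpenCL kernel source file to the default
# a.out, e.g. ./a.out gemm.cl (kernel file name here is illustrative)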

View File

@ -8,8 +8,9 @@
void hexdump(uint32_t *d, int len);
int main(int argc, char* argv[]) {
float *output = (float*)calloc(0x10000, sizeof(float));
float *golden = (float*)calloc(0x10000, sizeof(float));
#define OUTPUT_SIZE 0x10000
float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float));
float *golden = (float*)calloc(OUTPUT_SIZE, sizeof(float));
SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME);
// cmd line test
@ -59,36 +60,39 @@ int main(int argc, char* argv[]) {
// first run
printf("************** execute 1 **************\n");
memset(output, 0, sizeof(output));
memset(output, 0, OUTPUT_SIZE * sizeof(float));
mdl.execute(input, 0);
hexdump((uint32_t *)output, 0x100);
memcpy(golden, output, sizeof(output));
memcpy(golden, output, OUTPUT_SIZE * sizeof(float));
// second run
printf("************** execute 2 **************\n");
memset(output, 0, sizeof(output));
memset(output, 0, OUTPUT_SIZE * sizeof(float));
Thneed *t = new Thneed();
t->record = 3; // debug print with record
t->record = 7; // debug print with record
mdl.execute(input, 0);
t->stop();
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, sizeof(output)) != 0) { printf("FAILURE\n"); return -1; }
if (memcmp(golden, output, OUTPUT_SIZE * sizeof(float)) != 0) { printf("FAILURE\n"); return -1; }
// third run
printf("************** execute 3 **************\n");
memset(output, 0, sizeof(output));
memset(output, 0, OUTPUT_SIZE * sizeof(float));
t->record = 2; // debug print w/o record
float *inputs[4] = {state, traffic_convention, desire, input};
t->execute(inputs, output);
t->execute(inputs, output, true);
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, sizeof(output)) != 0) { printf("FAILURE\n"); return -1; }
if (memcmp(golden, output, OUTPUT_SIZE * sizeof(float)) != 0) { printf("FAILURE\n"); return -1; }
printf("************** execute 4 **************\n");
memset(output, 0, sizeof(output));
//t->record = 2; // debug print w/o record
t->execute(inputs, output);
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, sizeof(output)) != 0) { printf("FAILURE\n"); return -1; }
while (1) {
memset(output, 0, OUTPUT_SIZE * sizeof(float));
//t->record = 2; // debug print w/o record
t->execute(inputs, output);
hexdump((uint32_t *)output, 0x100);
if (memcmp(golden, output, OUTPUT_SIZE * sizeof(float)) != 0) { printf("FAILURE\n"); return -1; }
break;
}
printf("************** execute done **************\n");
}

View File

@ -13,7 +13,8 @@ std::map<std::pair<cl_kernel, int>, std::string> g_args;
static inline uint64_t nanos_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000000000ULL + t.tv_nsec; }
return t.tv_sec * 1000000000ULL + t.tv_nsec;
}
void hexdump(uint32_t *d, int len) {
assert((len%4) == 0);
@ -43,10 +44,12 @@ int ioctl(int filedes, unsigned long request, void *argp) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
CachedCommand *ccmd = new CachedCommand(thneed, cmd);
//ccmd->disassemble();
thneed->cmds.push_back(ccmd);
}
if (thneed->record & 2) {
printf("IOCTL_KGSL_GPU_COMMAND: flags: 0x%lx context_id: %u timestamp: %u\n",
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp);
}
@ -179,7 +182,10 @@ void Thneed::stop() {
//#define SAVE_LOG
void Thneed::execute(float **finputs, float *foutput) {
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (record & 2) tb = nanos_since_boot();
#ifdef SAVE_LOG
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp);
@ -197,7 +203,7 @@ void Thneed::execute(float **finputs, float *foutput) {
#endif
if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]);
clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
//clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL);
}
// ****** set power constraint
@ -220,8 +226,9 @@ void Thneed::execute(float **finputs, float *foutput) {
// ****** run commands
int i = 0;
for (auto it = cmds.begin(); it != cmds.end(); ++it) {
++i;
if (record & 2) printf("run %2d: ", i);
(*it)->exec((++i) == cmds.size());
(*it)->exec((i == cmds.size()) || slow);
}
// ****** sync objects
@ -255,6 +262,11 @@ void Thneed::execute(float **finputs, float *foutput) {
ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop);
assert(ret == 0);
if (record & 2) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}
cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL;
@ -311,10 +323,19 @@ cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
}
}
}
if (thneed != NULL && thneed->record & 2) {
printf("%p %56s -- ", kernel, name);
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
}
if (thneed != NULL && thneed->record & 4) {
// extreme debug
printf("%s -- %p\n", name, kernel);
for (int i = 0; i < num_args; i++) {
char arg_type[0x100];
char arg_name[0x100];
@ -337,6 +358,29 @@ cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
if (strcmp("image2d_t", arg_type) == 0 || strcmp("image1d_t", arg_type) == 0) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_data_type == CL_HALF_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
printf(" image %zu x %zu rp %zu", width, height, row_pitch);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
@ -345,6 +389,53 @@ cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event);
/*uint64_t tb = nanos_since_boot();
clWaitForEvents(1, event);
uint64_t te = nanos_since_boot();
if (thneed != NULL && thneed->record & 2) {
printf(" wait %lu us\n", (te-tb)/1000);
}*/
return ret;
}
//#define SAVE_KERNELS
#ifdef SAVE_KERNELS
std::map<cl_program, std::string> program_source;
#endif
cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL;
cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) {
if (my_clCreateProgramWithSource == NULL) my_clCreateProgramWithSource = reinterpret_cast<decltype(my_clCreateProgramWithSource)>(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource"));
assert(count == 1);
size_t my_lengths[1];
my_lengths[0] = lengths[0];
#ifdef SAVE_KERNELS
char fn[0x100];
snprintf(fn, sizeof(fn), "/tmp/program_%zu.cl", strlen(strings[0]));
FILE *f = fopen(fn, "wb");
fprintf(f, "%s", strings[0]);
fclose(f);
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
snprintf(fn, sizeof(fn), "/tmp/patched_%zu.cl", strlen(strings[0]));
FILE *g = fopen(fn, "rb");
if (g != NULL) {
printf("LOADING PATCHED PROGRAM %s\n", fn);
fread(tmp, 1, sizeof(tmp)-1, g);  // keep a NUL terminator for the strlen below
fclose(g);
strings[0] = tmp;
my_lengths[0] = strlen(tmp);
}
#endif
cl_program ret = my_clCreateProgramWithSource(context, count, strings, my_lengths, errcode_ret);
#ifdef SAVE_KERNELS
program_source[ret] = strings[0];  // record after ret exists (the original referenced ret before declaring it)
#endif
return ret;
}
@ -356,6 +447,8 @@ void *dlsym(void *handle, const char *symbol) {
return (void*)clEnqueueNDRangeKernel;
} else if (strcmp("clSetKernelArg", symbol) == 0) {
return (void*)clSetKernelArg;
} else if (strcmp("clCreateProgramWithSource", symbol) == 0) {
return (void*)clCreateProgramWithSource;
} else {
return my_dlsym(handle, symbol);
}

View File

@ -20,6 +20,7 @@ class CachedCommand {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec(bool wait);
void disassemble();
private:
struct kgsl_gpu_command cache;
struct kgsl_command_object cmds[2];
@ -31,7 +32,7 @@ class Thneed {
public:
Thneed();
void stop();
void execute(float **finputs, float *foutput);
void execute(float **finputs, float *foutput, bool slow=false);
std::vector<cl_mem> inputs;
cl_mem output;