
refactor to remove extra kernel params (#2563)

* refactor to have compiled kernel

* bugfixes

* docs/beautiful.py

* revert that

* fix tests
Branch: pull/2565/head
George Hotz authored 2023-12-02 00:32:25 -08:00, committed by GitHub
parent 27481b9206
commit 5068e99d18
15 changed files with 143 additions and 37 deletions
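
Every file below applies the same refactor: the runtime Program classes drop their bufs/vars constructor parameters (and CompiledASTRunner its bufcount), keeping only the kernel name and the compiled library bytes, with buffer vs. integer arguments sorted out at call time instead. A condensed before/after sketch, paraphrased from the hunks that follow (the Old/New class names here are just labels for this note, not code from the commit):

# before: callers had to declare how many buffers (and symbolic variables) a kernel takes
class OldClangProgram:
  def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0): ...

# after: only the kernel name and the compiled library bytes are needed;
# integer arguments are told apart from buffers per call (see the OpenCL and LLVM hunks)
class NewClangProgram:
  def __init__(self, name:str, lib:bytes): ...

# CompiledASTRunner.build() changes accordingly:
#   runtime(self.name, self.lib, self.bufcount, len(self.vars))  ->  runtime(self.name, self.lib)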

View File

@@ -16,6 +16,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 20
# TODO: run the pre-commit hook to replace a lot of this
steps:
- name: Checkout Code
uses: actions/checkout@v3
@@ -47,7 +48,9 @@ jobs:
- name: Check <5000 lines
run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
- name: Test Docs
run: python docs/abstractions.py
run: |
python docs/abstractions.py
python docs/beautiful.py
- name: Test Quickstart
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python quickstart.py
- name: Fuzz Test symbolic

View File

@@ -21,7 +21,15 @@ repos:
pass_filenames: false
- id: docs
name: docs
entry: python3 docs/abstractions.py
entry: |
python3 docs/abstractions.py
python3 docs/beautiful.py
language: system
always_run: true
pass_filenames: false
- id: devicetests
name: select GPU tests
entry: env GPU=1 PYTHONPATH="." pytest test/test_uops.py test/test_custom_function.py
language: system
always_run: true
pass_filenames: false

View File

@@ -214,7 +214,7 @@ MallocAllocator.copyin(input_a, numpy_a.data.cast("B"))
MallocAllocator.copyin(input_b, numpy_b.data.cast("B"))
# compile the program, run it, and 2+3 does indeed equal 5
program = ClangProgram("add", compile_clang(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"), bufs=3)
program = ClangProgram("add", compile_clang(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"))
program(output, input_a, input_b)
numpy_out = np.empty(1, dtype=np.float32)
MallocAllocator.copyout(numpy_out.data.cast("B"), output)

docs/beautiful.py (new file, mode 100644, 94 lines added)
View File

@@ -0,0 +1,94 @@
# in tinygrad, things are easy
# because if things were hard, tinygrad would not be tiny
# come on a journey where we add 2+3
# ******** first, in the raw ***********
from tinygrad import dtypes
from tinygrad.device import MallocAllocator
from tinygrad.runtime.ops_clang import ClangProgram, compile_clang
# allocate some buffers
# TODO: remove dtypes from allocators
out = MallocAllocator.alloc(1, dtypes.int32)
a = MallocAllocator.alloc(1, dtypes.int32)
b = MallocAllocator.alloc(1, dtypes.int32)
# load in some values
MallocAllocator.copyin(a, bytearray([2,0,0,0]))
MallocAllocator.copyin(b, bytearray([3,0,0,0]))
# compile a program
fxn = ClangProgram("add", compile_clang("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }"))
# run the program
fxn(out, a, b)
outb = bytearray(MallocAllocator.as_buffer(out))
assert outb == bytearray([5,0,0,0])
print(outb)
# ******** second, one layer higher ***********
import numpy as np
from tinygrad.device import Buffer, Device
from tinygrad.ops import LazyOp, BufferOps, MemBuffer, BinaryOps
from tinygrad.shape.shapetracker import ShapeTracker
# allocate some buffers + load in values
out = Buffer("CLANG", 1, dtypes.int32)
a = Buffer("CLANG", 1, dtypes.int32).copyin(np.array([2], np.int32).data)
b = Buffer("CLANG", 1, dtypes.int32).copyin(np.array([3], np.int32).data)
# describe the computation
ld_1 = LazyOp(BufferOps.LOAD, (), MemBuffer(1, dtypes.int32, ShapeTracker.from_shape((1,))))
ld_2 = LazyOp(BufferOps.LOAD, (), MemBuffer(2, dtypes.int32, ShapeTracker.from_shape((1,))))
alu = LazyOp(BinaryOps.ADD, (ld_1, ld_2))
st_0 = LazyOp(BufferOps.STORE, (alu,), MemBuffer(0, dtypes.int32, ShapeTracker.from_shape((1,))))
# compile a program
fxn = Device["CLANG"].get_runner(st_0)
# run the program
fxn.exec([out, a, b])
assert out.toCPU().item() == 5
print(out.toCPU())
# ******** third, one layer higher ***********
from tinygrad.lazy import LazyBuffer
from tinygrad.graph import print_tree
from tinygrad.realize import run_schedule
# allocate some values + load in values
a = LazyBuffer.fromCPU(np.array([2], np.int32))
b = LazyBuffer.fromCPU(np.array([3], np.int32))
# describe the computation
out = a.e(BinaryOps.ADD, b)
# schedule the computation (print it)
sched = out.schedule()
print_tree(sched[0].ast)
# run that schedule
run_schedule(sched)
assert out.realized.toCPU().item() == 5
print(out.realized.toCPU())
# ******** fourth, the top layer ***********
from tinygrad import Tensor
a = Tensor([2], dtype=dtypes.int32)
b = Tensor([3], dtype=dtypes.int32)
out = (a+b).item()
assert out == 5
print(out)
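
As a small aside (a sketch, not part of the committed file): after this refactor the first-layer pattern above also covers integer "variables". Since nothing declares a bufs/vars count any more, a plain Python int can be passed next to the buffers and ctypes forwards it as a C int. The fill kernel and the v*2 computation below are made up for illustration.

# hypothetical extension of the first layer above: one buffer plus one plain int argument
from tinygrad import dtypes
from tinygrad.device import MallocAllocator
from tinygrad.runtime.ops_clang import ClangProgram, compile_clang
out = MallocAllocator.alloc(1, dtypes.int32)
fxn = ClangProgram("fill", compile_clang("void fill(int *out, int v) { out[0] = v*2; }"))
fxn(out, 21)  # 21 rides along as a C int, no buffer/variable count needed
assert bytearray(MallocAllocator.as_buffer(out)) == bytearray([42,0,0,0])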

View File

@@ -8,7 +8,7 @@ from tinygrad.device import Compiled, Allocator
from tinygrad.helpers import Profiling
class FakeProgram:
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0): pass
def __init__(self, name:str, prg:bytes): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class FakeAllocator(Allocator):

View File

@@ -22,7 +22,7 @@ def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
__kernel void atan2_gpu(global float *c, global float *a, global float *b) {
int idx = get_global_id(0);
c[idx] = atan2(a[idx], b[idx]);
}""", global_size=[ret.size], bufcount=3).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret, a, b])
}""", global_size=[ret.size]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret, a, b])
def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)

View File

@@ -7,11 +7,11 @@ from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
from tinygrad.device import CompiledASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, UOp
def _uops_to_prg(uops, bufcount):
def _uops_to_prg(uops):
src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
return CompiledASTRunner(None, "test", src,
[1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
runtime_args=runtime_args, bufcount=bufcount).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
uops.append(UOp(uop, dtype, tuple(vin), arg))
@@ -26,7 +26,7 @@ def _test_single_value(vals, op, dtype):
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Buffer(Device.DEFAULT, 1, dtype)
buf2 = [Buffer.fromCPU(Device.DEFAULT, np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops, 1+len(buf2))
prg = _uops_to_prg(uops)
prg.exec([buf]+buf2)
return buf.toCPU()[0]
@@ -37,7 +37,7 @@ def _test_single_value_const(vals, op, dtype):
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Buffer(Device.DEFAULT, 1, dtype)
prg = _uops_to_prg(uops, 1)
prg = _uops_to_prg(uops)
prg.exec([buf])
return buf.toCPU()[0]

View File

@@ -213,18 +213,16 @@ def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> Interpret
# **************** for Compiled Devices ****************
class CompiledASTRunner(JITRunner):
def __init__(self, ast:Optional[LazyOp], name:str, prg:str, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, runtime_args:Optional[dict]=None, bufcount:int=0):
def __init__(self, ast:Optional[LazyOp], name:str, prg:str, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, runtime_args:Optional[dict]=None):
super().__init__()
if DEBUG >= 4: print(prg)
if global_size is not None: global_size = global_size + [1]*(3-len(global_size))
if local_size is not None: local_size = local_size + [1]*(3-len(local_size))
self.name, self.display_name, self.prg, self.global_size, self.local_size, self.runtime_args = \
to_function_name(name), name, prg, global_size, local_size, runtime_args if runtime_args is not None else {}
self.bufcount = bufcount
self.vars: List[Variable] = []
if ast:
info = get_lazyop_info(ast)
self.bufcount = len(info.mem)
self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
from tinygrad.lazy import vars_from_ast
self.vars = vars_from_ast(ast)
@@ -232,7 +230,7 @@ class CompiledASTRunner(JITRunner):
def build(self, compiler, runtime):
self.lib = compiler.__wrapped__(self.prg) if getenv("DISABLE_COMPILER_CACHE") else compiler(self.prg)
self.clprg = runtime(self.name, self.lib, self.bufcount, len(self.vars))
self.clprg = runtime(self.name, self.lib)
return self
def launch_dims(self, var_vals):

View File

@@ -20,10 +20,11 @@ def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
return pathlib.Path(output_file.name).read_bytes()
class ClangProgram:
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0):
def __init__(self, name:str, lib:bytes):
self.name, self.lib = name, lib
# write to disk so we can load it
with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
pathlib.Path(cached_file_path.name).write_bytes(prg)
pathlib.Path(cached_file_path.name).write_bytes(lib)
self.fxn: Any = ctypes.CDLL(str(cached_file_path.name))[name]
def __call__(self, *args, wait=False): return cpu_time_execution(lambda: self.fxn(*args), enable=wait)

View File

@@ -23,20 +23,21 @@ def cu_time_execution(cb, enable=False) -> Optional[float]: return time_executio
def compile_cuda(prg) -> bytes: return compile_cuda_style(prg, [f'--gpu-architecture={CUDADevice.default_arch_name}', CUDA_INCLUDE_PATH], cuda.nvrtcProgram, cuda.nvrtcCreateProgram, cuda.nvrtcCompileProgram, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check)
class CUDAProgram:
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0):
if DEBUG >= 5: print(pretty_ptx(prg.decode('utf-8')))
def __init__(self, name:str, lib:bytes):
self.name, self.lib = name, lib
if DEBUG >= 5: print(pretty_ptx(lib.decode('utf-8')))
if DEBUG >= 6:
try:
fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(prg).hexdigest()}").as_posix()
with open(fn + ".ptx", "wb") as f: f.write(prg)
fn = (Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
with open(fn + ".ptx", "wb") as f: f.write(lib)
subprocess.run(["ptxas", f"-arch={CUDADevice.default_arch_name}", "-o", fn, fn+".ptx"], check=True)
print(subprocess.check_output(['nvdisasm', fn]).decode('utf-8'))
except Exception as e: print("failed to generate SASS", str(e))
if not CUDACPU:
self.module = init_c_var(cuda.CUmodule(), lambda x: check(cuda.cuModuleLoadData(ctypes.byref(x), prg)))
self.module = init_c_var(cuda.CUmodule(), lambda x: check(cuda.cuModuleLoadData(ctypes.byref(x), lib)))
check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
self.prg = prg
self.prg = prg if not CUDACPU else lib
def __del__(self):
if not CUDACPU: check(cuda.cuModuleUnload(self.module))

View File

@@ -34,15 +34,14 @@ def compile_cl(prg:str) -> bytes:
return bytes(binary)
class CLProgram:
def __init__(self, device:CLDevice, name:str, prg:bytes, bufs:int=0, vars:int=0):
self.device = device
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, ctypes.byref(device.device_id), (ctypes.c_size_t * 1)(len(prg)),
to_char_p_p([prg], ctypes.c_ubyte),
def __init__(self, device:CLDevice, name:str, lib:bytes):
self.device, self.name, self.lib = device, name, lib
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, ctypes.byref(device.device_id), (ctypes.c_size_t * 1)(len(lib)),
to_char_p_p([lib], ctypes.c_ubyte),
ctypes.byref(binary_status := ctypes.c_int32()), ctypes.byref(errcode_ret := ctypes.c_int32())), errcode_ret)
check(binary_status.value)
check(cl.clBuildProgram(self.program, 1, ctypes.byref(device.device_id), None, cl.clBuildProgram.argtypes[4](), None)) # NOTE: OSX requires this
self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), ctypes.byref(status := ctypes.c_int32())), status)
self.vars = vars
def __del__(self):
check(cl.clReleaseKernel(self.kernel))
@@ -50,7 +49,7 @@ class CLProgram:
def __call__(self, *bufs:Union[cl.cl_mem, int], global_size:Tuple[int,...], local_size:Optional[Tuple[int,...]]=None, wait=False) -> Optional[float]:
for i,b in enumerate(bufs):
bc = ctypes.c_int32(b) if i >= (len(bufs)-self.vars) else cast(cl.cl_mem, b)
bc = ctypes.c_int32(b) if isinstance(b, int) else cast(cl.cl_mem, b)
cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(bc), ctypes.byref(bc))
if local_size is not None: global_size = tuple(int(g*l) for g,l in zip(global_size, local_size))
event = cl.cl_event() if wait else None

View File

@@ -23,16 +23,16 @@ def hip_time_execution(cb, enable=False): return time_execution_cuda_style(cb, h
def compile_hip(prg) -> bytes: return compile_cuda_style(prg, [f'--offload-arch={HIPDevice.default_arch_name}'], hip.hiprtcProgram, hip.hiprtcCreateProgram, hip.hiprtcCompileProgram, hip.hiprtcGetCode, hip.hiprtcGetCodeSize, hip.hiprtcGetProgramLog, hip.hiprtcGetProgramLogSize, check)
class HIPProgram:
def __init__(self, device:int, name:str, prg:bytes, bufs:int, vars:int=0):
self.device = device
def __init__(self, device:int, name:str, lib:bytes):
self.device, self.name, self.lib = device, name, lib
if DEBUG >= 6:
asm = early_exec((["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], prg))
asm = early_exec((["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], lib))
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
if MOCKHIP: return
check(hip.hipSetDevice(self.device))
self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), prg)))
self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
def __del__(self):

View File

@@ -54,11 +54,13 @@ def compile_llvm(prg, llvmopt=LLVMOPT) -> bytes:
return LLVM.target_machine.emit_object(mod)
class LLVMProgram:
def __init__(self, name:str, lib:bytes, bufs:int, vars:int=0):
def __init__(self, name:str, lib:bytes):
self.name, self.lib = name, lib
LLVM().engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
self.fxn = LLVM.engine.get_function_address(name)
self.cfunc = CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*bufs), *([ctypes.c_int]*vars))(self.fxn)
def __call__(self, *bufs, wait=False): return cpu_time_execution(lambda: self.cfunc(*bufs), enable=wait)
def __call__(self, *bufs, wait=False):
self.cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_int32 if isinstance(b, int) else ctypes.c_void_p for b in bufs])(self.fxn)
return cpu_time_execution(lambda: self.cfunc(*bufs), enable=wait)
LLVMDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False, has_shared=False), uops_to_llvm_ir, compile_llvm, LLVMProgram)
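
The per-call CFUNCTYPE above is the crux of the refactor: the C prototype is built from the arguments actually passed, so plain ints (symbolic variable values) can ride along with buffer pointers without any declared counts. A standalone ctypes sketch of the same idea, using libc's strchr purely for illustration (assumes a Unix-like libc; this is not tinygrad code):

import ctypes, ctypes.util
libc = ctypes.CDLL(ctypes.util.find_library("c"))

def call_with_inferred_types(fn_addr, *args):
  # ints become c_int32, everything else is treated as a pointer, mirroring LLVMProgram.__call__
  argtypes = [ctypes.c_int32 if isinstance(a, int) else ctypes.c_void_p for a in args]
  return ctypes.CFUNCTYPE(ctypes.c_void_p, *argtypes)(fn_addr)(*args)

# strchr(const char *s, int c): one pointer argument followed by one int argument
s = ctypes.create_string_buffer(b"hello")
addr = ctypes.cast(libc.strchr, ctypes.c_void_p).value
found = call_with_inferred_types(addr, ctypes.cast(s, ctypes.c_void_p), ord("e"))
assert ctypes.cast(found, ctypes.c_char_p).value == b"ello"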

View File

@@ -19,8 +19,8 @@ def compile_metal(prg, use_xcode=bool(getenv("METAL_XCODE"))) -> bytes:
return library.libraryDataContents().bytes().tobytes()
class MetalProgram:
def __init__(self, device:MetalDevice, name:str, lib:bytes, bufs:int, vars:int=0):
self.device = device
def __init__(self, device:MetalDevice, name:str, lib:bytes):
self.device, self.name, self.lib = device, name, lib
data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
self.fxn = self.library.newFunctionWithName_(name)

View File

@@ -10,7 +10,7 @@ import wgpu
wgpu_device = get_default_device()
class WebGPUProgram:
def __init__(self, name: str, prg: str, bufs:int=0, vars:int=0): self.name,self.prg = name,wgpu_device.create_shader_module(code=prg)
def __init__(self, name:str, lib:bytes): self.name, self.lib, self.prg = name, lib, wgpu_device.create_shader_module(code=lib) # NOTE: this is the compiler
def __call__(self, *bufs, global_size, local_size, wait=False):
assert len(bufs) <= 8, "WEBGPU only supports 8 buffers"
binding_layouts = [{"binding": i, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.storage}} for i in range(len(bufs))]