diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8802742e8..3b5e0922d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -164,6 +164,7 @@ jobs: - name: Test openpilot model run: | ALLOWED_KERNEL_COUNT=197 FLOAT16=1 VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py + python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000' DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py diff --git a/extra/thneed.py b/extra/thneed.py index b9b896ca6..32b1b2700 100644 --- a/extra/thneed.py +++ b/extra/thneed.py @@ -1,4 +1,4 @@ -# this can be constructed from a cl_cache or loaded from a thneed file +# this can be constructed from a cl_cache or loaded from a thneed file import time import struct import json @@ -26,7 +26,7 @@ class Thneed: for a in args[3:]: nodes[a]['out_edges'].append(args[2]) nodes[args[2]]['in_edges'].append(a) - + # get buffers to save self.buffers_to_save = set() self.outputs = [] @@ -35,7 +35,7 @@ class Thneed: self.buffers_to_save.add(n) if len(nodes[n]['out_edges']) == 0: self.outputs.append(n) - + fake_inputs = [] for k,n in self.inputs.items(): if n in self.buffers_to_save: @@ -97,7 +97,7 @@ class Thneed: else: # zero out buffers buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size']) - + bufs[o['id']] = buf bufs_loaded[o['id']] = 'data' in o # if it's loaded, it's saved @@ -114,13 +114,13 @@ class Thneed: print("FAILED", k) traceback.print_exc() exit(0) - + # load binaries for o in jdat['binaries']: nptr = ptr + o['length'] prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], binary=True) ptr = nptr - + # populate the cl_cache for i,k in enumerate(jdat['kernels']): kernel = prgs[k['name']] @@ -166,7 +166,7 @@ class Thneed: jdat['binaries'].append({"name":prg.name, "length":len(binary[0])}) binaries.append(binary[0]) saved_binaries.add(prg.name) - + # get the args from the kernel, some need the data saved targs, args_size = [], [] argdtypes = prg.argdtypes if prg.argdtypes is not None else [None]*(len(args)-2) @@ -196,6 +196,7 @@ class Thneed: cl.enqueue_copy(CL.cl_queue, data, a, is_blocking=True) weights.append(data.tobytes()) elif isinstance(a, cl.Image): + assert a.format == cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT), "wrong type" needs_load = a in self.buffers_to_save row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64 size = row_pitch * a.shape[1] @@ -244,7 +245,7 @@ class Thneed: "local_work_size": [1 for _ in args[0]] if args[1] is None else args[1], "num_args": len(args)-2, "args": targs, - "args_size": args_size + "args_size": args_size }) jdat['outputs'] = [{ diff --git a/run_multibackend.sh b/run_multibackend.sh new file mode 100755 index 000000000..c20ef3a33 --- /dev/null +++ b/run_multibackend.sh @@ -0,0 +1,13 @@ +#!/bin/bash -e +echo "********* CPU *********" +CPU=1 python3 $@ +echo "********* GPU *********" +GPU=1 python3 $@ +echo "********* METAL *********" +METAL=1 python3 $@ +echo "********* CLANG *********" +CLANG=1 python3 $@ +echo "********* LLVM *********" +LLVM=1 python3 $@ +echo "********* TORCH *********" +TORCH=1 python3 $@ diff --git a/test/test_custom_function.py b/test/test_custom_function.py index 4cccf8286..3c283a590 100644 --- a/test/test_custom_function.py +++ b/test/test_custom_function.py @@ -4,7 +4,7 @@ import unittest import numpy as np from typing import Optional, Tuple -from 
tinygrad.helpers import prod +from tinygrad.helpers import prod, dtypes # *** first, we implement the atan2 op at the lowest level *** # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers @@ -16,6 +16,7 @@ from tinygrad.runtime.ops_cpu import CPUBuffer def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer: from tinygrad.runtime.ops_gpu import GPUBuffer assert type(a) == GPUBuffer and type(b) == GPUBuffer, "gpu function requires GPUBuffers" + assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32" ret = GPUBuffer(a.shape) ASTRunner("atan2", """ __kernel void atan2(global float *c, global float *a, global float *b) { @@ -40,7 +41,7 @@ class ATan2(Function): assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch" self.a, self.b = a, b ast = LazyOp(LoadOps.CUSTOM, (a, b), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device]) - return LazyBuffer(a.device, a.shape, LoadOps, ast) + return LazyBuffer(a.device, a.shape, LoadOps, ast, max(a.dtype, b.dtype)) def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]: denom = (self.a.binary_op(BinaryOps.MUL, self.a)).binary_op(BinaryOps.ADD, self.b.binary_op(BinaryOps.MUL, self.b)) return grad_output.binary_op(BinaryOps.MUL, self.b.binary_op(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \ diff --git a/test/test_dtype.py b/test/test_dtype.py new file mode 100644 index 000000000..e1119b32f --- /dev/null +++ b/test/test_dtype.py @@ -0,0 +1,41 @@ +import unittest +import numpy as np +from tinygrad.helpers import getenv +from tinygrad.lazy import Device +from tinygrad.tensor import Tensor, dtypes + +# for GPU, cl_khr_fp16 isn't supported +# for LLVM, it segfaults because it can't link to the casting function +@unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["GPU", "LLVM"], "float16 broken in some CI backends") +class TestDtype(unittest.TestCase): + def test_half_to_np(self): + a = Tensor([1,2,3,4], dtype=dtypes.float16) + print(a) + na = a.numpy() + print(na, na.dtype, a.lazydata.realized) + assert na.dtype == np.float16 + + def test_half_add(self): + a = Tensor([1,2,3,4], dtype=dtypes.float16) + b = Tensor([1,2,3,4], dtype=dtypes.float16) + c = a+b + print(c.numpy()) + assert c.dtype == dtypes.float16 + + def test_upcast_float(self): + # NOTE: there's no downcasting support + a = Tensor([1,2,3,4], dtype=dtypes.float16).float() + print(a) + na = a.numpy() + print(na, na.dtype) + assert na.dtype == np.float32 + + def test_half_add_upcast(self): + a = Tensor([1,2,3,4], dtype=dtypes.float16) + b = Tensor([1,2,3,4], dtype=dtypes.float32) + c = a+b + print(c.numpy()) + assert c.dtype == dtypes.float32 + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/unit/test_graph.py b/test/unit/test_graph.py index 4f235c901..ca480efbf 100644 --- a/test/unit/test_graph.py +++ b/test/unit/test_graph.py @@ -14,10 +14,10 @@ class TestGraph(unittest.TestCase): assert nx.is_isomorphic(G, RG, node_match=lambda x,y: x["label"] == y["label"], edge_match=lambda x,y: x["label"] == y["label"] if "label" in y else True) def test_add_graph(self): - a = CPUBuffer.fromCPU(np.ones((4,4))) - b = CPUBuffer.fromCPU(np.ones((4,4))) + a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) + b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) ast = LazyOp(BinaryOps.ADD, (a,b)) - ret = CPUBuffer(np.ones((4,4))) + ret = CPUBuffer(np.ones((4,4), dtype=np.float32)) RG = nx.DiGraph() RG.add_node(0, 
label="(4, 4)") @@ -30,12 +30,12 @@ class TestGraph(unittest.TestCase): self.helper_compare_graph(RG) def test_add_sum_graph(self): - a = CPUBuffer.fromCPU(np.ones((4,4))) - b = CPUBuffer.fromCPU(np.ones((1,1))) + a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) + b = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32)) op0 = LazyOp(MovementOps.RESHAPE, (b,), (4, 4)) op1 = LazyOp(BinaryOps.ADD, (a,op0)) ast = LazyOp(ReduceOps.SUM, (op1,), (1,1)) - ret = CPUBuffer(np.ones((1,1))) + ret = CPUBuffer(np.ones((1,1), dtype=np.float32)) RG = nx.DiGraph() RG.add_node(0, label="(4, 4)") @@ -48,14 +48,14 @@ class TestGraph(unittest.TestCase): self.helper_compare_graph(RG) def test_add_graph_prune(self): - a = CPUBuffer.fromCPU(np.ones((1,1))) + a = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32)) ast = LazyOp(MovementOps.RESHAPE, (a,), (4, 4)) - ret = CPUBuffer(np.ones((4,4))) + ret = CPUBuffer(np.ones((4,4), dtype=np.float32)) log_op(ret, ast, show_graph=True) - b = CPUBuffer.fromCPU(np.ones((4,4))) + b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) ast = LazyOp(BinaryOps.ADD, (ret,b)) - ret = CPUBuffer(np.ones((4,4))) + ret = CPUBuffer(np.ones((4,4), dtype=np.float32)) log_op(ret, ast, show_graph=True) prune_graph() diff --git a/tinygrad/codegen/ast.py b/tinygrad/codegen/ast.py index 16a67ffc6..3f2162a0d 100644 --- a/tinygrad/codegen/ast.py +++ b/tinygrad/codegen/ast.py @@ -1,7 +1,7 @@ import itertools from enum import Enum, auto from typing import List, Tuple -from tinygrad.helpers import prod, dedup, all_same, colored +from tinygrad.helpers import prod, dedup, all_same, colored, dtypes from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers from tinygrad.shape import ShapeTracker, View, strides_for_shape @@ -26,7 +26,7 @@ class Token: if len(self.axis) == 0: return [0] acc_strides = [x*(1-self.axis[::-1][i][2]) for i,x in enumerate(strides_for_shape(tuple(1 if r else s for s,_,r in self.axis[::-1])))] return [sum(t) for t in itertools.product(*[[y*acc_strides[i] for y in range(x[0])] for i,x in enumerate(self.axis[::-1])])] - def decltype(self): return ('float' if self.typ == Types.FLOAT else 'float4') + ('*' if self.ptr else str()) + def decltype(self, dtype=dtypes.float32): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str()) def __repr__(self): return f"<{self.typ}{'*' if self.ptr else str()} {self.tok}{f'[{self.axis}]' if len(self.axis) else str()}>" # ast kernel can contain one ReduceOp with arbitrary Binary/Unary ops @@ -55,8 +55,9 @@ class ASTKernel: break # create the buffer we are returning (as the same type as the input buffers) and add it as the first buffer - self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True) - self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret)] if output_shape else [self.ret]) + self.bufs + self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True, dtype=self.info.dtype) + assert self.ret.dtype == self.info.dtype, f"return dtype {self.ret.dtype} != {self.info.dtype}" + self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret, dtype=self.info.dtype)] if output_shape else [self.ret]) + self.bufs # key for lookup in cache (can change, str might not be right) # bufs are needed because kernels like f(x) = x + x and f(x, y) = x + y have the same str(ast), but are different 
kernels. @@ -121,7 +122,7 @@ class ASTKernel: if print_shapetrackers: for st in self.sts: print(st) for i in range(len(self.sts)): - print(prefix, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE") + print(prefix, self.bufs[i].dtype, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE") @property def shape_len(self) -> int: return len(self.sts[0].shape) diff --git a/tinygrad/codegen/gpu.py b/tinygrad/codegen/gpu.py index b3999f19d..9caaec2c9 100644 --- a/tinygrad/codegen/gpu.py +++ b/tinygrad/codegen/gpu.py @@ -25,6 +25,7 @@ class GPULanguage(NamedTuple): lid : List[str] = [] extra_args : List[str] = [] float4 : Optional[str] = None + half_prekernel : Optional[str] = None def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]: idy = (idxy//(4*base_shape[1])) @@ -266,6 +267,8 @@ class GPUCodegen(ASTKernel): self.prekernel : Set[str] = set() self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs if buf is not None) else [] + if self.lang.half_prekernel: self.prekernel.add(self.lang.half_prekernel+"\n") + if len(self.lang.gid) == 0: self.kernel += [f"for (int idx{i} = 0; idx{i} < {self.output_shape[i]}; idx{i}++) {{\n" for i in range(0, len(self.output_shape))] else: @@ -320,7 +323,7 @@ class GPUCodegen(ASTKernel): self.kernel.append("\n}") # concat kernel into prg - buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype()+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None] + buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype(self.bufs[i].dtype)+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None] prg = ' '.join(list(self.prekernel) + [f"{self.lang.kernel_prefix} void KERNEL_NAME_PLACEHOLDER(",] + [', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + self.lang.extra_args)] + [") {\n"] + self.kernel) diff --git a/tinygrad/codegen/llvm.py b/tinygrad/codegen/llvm.py index 4761e7a67..420c473d1 100644 --- a/tinygrad/codegen/llvm.py +++ b/tinygrad/codegen/llvm.py @@ -3,7 +3,7 @@ from typing import ClassVar, List from llvmlite import ir # type: ignore from tinygrad.codegen.ast import ASTKernel from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, ASTRunner -from tinygrad.helpers import DEBUG, prod +from tinygrad.helpers import DEBUG, prod, dtypes from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, GeNode, LtNode, SumNode, AndNode def int_const(x): return ir.Constant(ir.IntType(64), x) @@ -105,7 +105,8 @@ class LLVMCodegen(ASTKernel): # create llvm function module = ir.Module(name=__file__) - func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(self.bufs))), name='exec') + func_dtypes = [{dtypes.float16:ir.HalfType(), dtypes.float32:ir.FloatType()}[buf.dtype] for buf in self.bufs] + func = ir.Function(module, 
ir.FunctionType(ir.VoidType(), [x.as_pointer() for x in func_dtypes]), name='exec') # force llvmlite to allow us to add function attribute then add the attribute func.attributes._known = func.attributes._known.union(frozenset(['"no-nans-fp-math"="true"'])) @@ -143,9 +144,11 @@ class LLVMCodegen(ASTKernel): # this always does the load, so we have it load *0 if the arg won't be used # TODO: would control flow be faster? aug_idx = builder.select(valid, idx, int_const(0)) - element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(ir.FloatType(), 0)) + element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(func_dtypes[buf_index], 0)) else: element = builder.load(builder.gep(func.args[buf_index], [idx], inbounds=True)) + # upcast + if func_dtypes[buf_index] != ir.FloatType(): element = builder.fpext(element, ir.FloatType()) m = element if kernel_output_dim == 1 else builder.insert_element(m, element, int_const(i)) return m if isinstance(x.op, ReduceOps): @@ -194,6 +197,7 @@ class LLVMCodegen(ASTKernel): builder = loop_exit[store_loop] for i, idx in enumerate(get_idxs(builder, idx_level[0][store_loop], 0)): element = result if kernel_output_dim == 1 else builder.extract_element(result, int_const(i)) + if func_dtypes[0] != ir.FloatType(): element = builder.fptrunc(element, func_dtypes[0]) builder.store(element, builder.gep(func.args[0], [idx], inbounds=True)) # add the looping diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 78cda40e3..f7abc7f2e 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -1,5 +1,6 @@ import os, math, functools -from typing import Tuple, Union, List +import numpy as np +from typing import Tuple, Union, List, NamedTuple def dedup(x): return list(dict.fromkeys(x)) # retains list order def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x) @@ -16,3 +17,17 @@ def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}" def getenv(key, default=0): return type(default)(os.getenv(key, default)) DEBUG, IMAGE = getenv("DEBUG", 0), getenv("IMAGE", 0) + +# **** tinygrad now supports dtypes! 
***** + +class DType(NamedTuple): + itemsize : int + name : str + np : type # TODO: someday this will be removed with the "remove numpy" project + def __repr__(self): return f"dtypes.{self.name}" + +class dtypes: + float16 = half = DType(2, "half", np.float16) + float32 = float = DType(4, "float", np.float32) + @staticmethod + def from_np(x:np.ndarray) -> DType: return {np.dtype(np.float16): dtypes.float16, np.dtype(np.float32): dtypes.float32}[np.dtype(x.dtype)] \ No newline at end of file diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index d79a0aed9..c1c9fe0f4 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type import os, sys, weakref, importlib, inspect, functools from weakref import WeakValueDictionary -from tinygrad.helpers import prod, getenv +from tinygrad.helpers import prod, getenv, DType, dtypes from tinygrad.shape import ShapeTracker, get_contraction from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers from tinygrad.graph import log_op @@ -13,21 +13,12 @@ sys.setrecursionlimit(10000) OPT = getenv("OPT", 2) LAZY = getenv("LAZY", 1) -def get_buffer(name, base='tinygrad.runtime'): - try: - return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0] - except Exception as e: # NOTE: this can't be put on one line due to mypy issue - print(name, "backend not available", e, file=sys.stderr) - class _Device: def __init__(self) -> None: - # TODO: make this dynamic to when you try to access the _buffers - self._buffers : Dict[str, Type[DeviceBuffer]] = {x.upper():get_buffer(x) for x in - [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")] if x is not None} - self.DEFAULT : str = "CPU" - for name in self._buffers: - if getenv(name) == 1: self.DEFAULT = name # note: DEFAULT can be a Device that can't be imported. better than silent use of a different device - if self._buffers[name] is not None: self.__setattr__(name, name) + self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]} + self.DEFAULT : str = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, "CPU") + @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none + def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0] Device = _Device() # TODO: movement ops that only change shape are really nops.
treat them as such @@ -81,9 +72,9 @@ def replace_with_movement_op(y:Union[LazyOp, LazyBuffer], op:MovementOps, arg:Tu return elementwise_op(y.op, *[replace_with_movement_op(z, op, arg) for z in y.src]) # type: ignore class LazyNumpyArray: - def __init__(self, fxn, shape): self.fxn, self.shape = fxn, shape - def __call__(self): return self.fxn(self.shape) - def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape) + def __init__(self, fxn, shape, dtype): self.fxn, self.shape, self.dtype = fxn, shape, dtype + def __call__(self): return self.fxn(self.shape, self.dtype) + def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype) def copy(self): return self def astype(self, typ): return self @@ -91,32 +82,32 @@ def support_weakref(x): return x @support_weakref # needed for mypyc, this prevents LazyBuffer from becoming a native class class LazyBuffer: __deletable__ = ('op',) - lazycache : ClassVar[WeakValueDictionary[Tuple[str, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary() - def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp): + lazycache : ClassVar[WeakValueDictionary[Tuple[str, DType, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary() + def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType): # fromcpu aren't cached if optype == LoadOps and op.op == LoadOps.FROMCPU: return super().__new__(cls) - wop = (device, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker + wop = (device, dtype, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker # NOTE: we need "ret" to prevent the new buffer from being immediately deleted if wop not in LazyBuffer.lazycache: LazyBuffer.lazycache[wop] = ret = super().__new__(cls) else: ret = LazyBuffer.lazycache[wop] return ret - def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp): + def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType): if hasattr(self, 'device'): return # cache hit, we return and don't reinit self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape)) - self.shape, self.optype, self.op = self.st.shape, optype, op + self.shape, self.optype, self.op, self.dtype = self.st.shape, optype, op, dtype self.realized : Optional[DeviceBuffer] = None self.output_buffer : Optional[DeviceBuffer] = None - self.device, self.dbuffer = device, Device._buffers[device] + self.device, self.dbuffer = device, Device[device] # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child? 
self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet() # NOTE: op should be read only after construction of LazyBuffer for x in get_buffers(op): x.children.add(self) if not LAZY: self.realize() - def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>" + def __repr__(self): return f"<LB {self.shape} {self.dtype} op:{self.op.op if self.realized is None else 'realized'}>" # this produces a device buffer def realize(self:LazyBuffer, required_device=None) -> DeviceBuffer: @@ -124,7 +115,7 @@ class LazyBuffer: if self.realized is None: # get real ops first if self.op.op == LoadOps.FROMCPU: - self.realized = Device._buffers[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg) + self.realized = Device[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg) ast = LazyOp(self.op.op, tuple()) elif self.op.op == LoadOps.CONTIGUOUS: real_src = self.op.src[0].realize(self.device) @@ -160,12 +151,13 @@ class LazyBuffer: log_op(self.realized, ast) assert self.realized.shape == self.shape, f"shape mismatch on realize got {self.realized.shape} expected {self.shape}" - assert isinstance(self.realized, Device._buffers[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}" + assert isinstance(self.realized, Device[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}" + assert self.realized.dtype == self.dtype, f"dtype mismatch on realize got {self.realized.dtype} expected {self.dtype}" return self.realized # NOTE: we have to make a copy of the numpy array here in case the user changes it. expose this? @staticmethod - def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy())) + def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()), dtypes.from_np(x)) def toCPU(self): ret = self.realize().toCPU() log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None)) @@ -173,11 +165,11 @@ class LazyBuffer: def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self) def binary_op(self:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, self, y) - def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,))) + def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)), self.dtype) def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer: if self.shape == tuple(new_shape): return self - return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape)) + return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape), self.dtype) def movement_op(self:LazyBuffer, op:MovementOps, arg:Tuple[Any, ...]) -> LazyBuffer: # very instant nop @@ -228,7 +220,7 @@ class LazyBuffer: return replace_with_movement_op(self.op, op, arg) # create the buffer - ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg)) + ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg), self.dtype) # if the ShapeTracker becomes contiguous, replace the whole thing with a reshape (or nothing if shapes match) # NOTE: if ret is in the cache, it can already be realized @@ -241,7 +233,7 @@ class LazyBuffer: return ret def elementwise_op(op:Union[UnaryOps, BinaryOps], 
*srcs:LazyBuffer) -> LazyBuffer: - out_device, out_shape = srcs[0].device, srcs[0].shape + out_device, out_shape, out_dtype = srcs[0].device, srcs[0].shape, max(x.dtype for x in srcs) # push all contiguous to the end of BinaryOps. kernels 198 -> 196 if PUSH_CONTIGUOUS and any(x.realized is None and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1 for x in srcs): @@ -258,4 +250,4 @@ def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffe # remove the buffers from any (childless) BinaryOps that feed into this srcs = tuple(x.op if x.optype == BinaryOps and len(x.children) == 0 and x.realized is None else x for x in srcs) # type: ignore - return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs)) + return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs), out_dtype) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index b805c184b..34a84bdb0 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -3,7 +3,7 @@ import functools, itertools, operator, random import numpy as np from enum import Enum, auto from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict, TypeVar, Set -from tinygrad.helpers import prod, DEBUG, getenv +from tinygrad.helpers import prod, DEBUG, getenv, DType, dtypes from tinygrad.shape import ShapeTracker # these are the llops your accelerator must implement, along with toCpu @@ -39,17 +39,19 @@ class Copyable: def toCPU(self:Copyable) -> np.ndarray: raise NotImplementedError("must be implemented") class RawBuffer(Copyable): # pylint: disable=abstract-method - def __init__(self, size:int): + def __init__(self, size:int, dtype:DType): self.size : int = size - GlobalCounters.mem_used += self.size - def __del__(self): GlobalCounters.mem_used -= self.size + self.dtype : DType = dtype + self._memsz : int = size*dtype.itemsize + GlobalCounters.mem_used += self._memsz + def __del__(self): GlobalCounters.mem_used -= self._memsz class RawBufferCopyIn(RawBuffer): def copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented") @classmethod def fromCPU(cls, x:np.ndarray): - ret = cls(4*prod(x.shape)) + ret = cls(prod(x.shape), dtypes.from_np(x)) ret.copyin(x) return ret @@ -57,7 +59,7 @@ class RawBufferCopyInOut(RawBufferCopyIn): def copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented") def toCPU(self) -> np.ndarray: - x = np.empty((self.size//4), dtype=np.float32) + x: np.ndarray = np.empty(self.size, dtype=self.dtype.np) self.copyout(x) return x @@ -65,26 +67,27 @@ class RawBufferCopyInOut(RawBufferCopyIn): class DeviceBuffer(Copyable): _buf: Any # underlying buffer shape: Tuple[int, ...] 
+ dtype: DType @classmethod def exec_ast(cls, ast:LazyOp, output_buffer=None): raise NotImplementedError("must be implemented") # this is a quick "buffer" class for flop tracking and getting the output shape class GenericShape: - def __init__(self, shape:Tuple[int, ...], flops:int=0): self.shape, self.flops = shape, flops + def __init__(self, shape:Tuple[int, ...], dtype:DType=dtypes.float32, flops:int=0): self.shape, self.dtype, self.flops = shape, dtype, flops def consume_flops(self): self.flops, ret = 0, self.flops return ret shape_fxn_for_op : Dict[Op, Callable] = { - **{op:lambda self: GenericShape(self.shape, self.consume_flops() + prod(self.shape)) for op in UnaryOps}, - **{op:lambda self,y: GenericShape(self.shape, self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps}, - **{op:lambda self,new_shape: GenericShape(new_shape, self.consume_flops() + prod(self.shape)) for op in ReduceOps}, - **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.consume_flops()), op) for op in MovementOps}} + **{op:lambda self: GenericShape(self.shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in UnaryOps}, + **{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps}, + **{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps}, + **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}} +def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf # used in CPUBuffer and TorchBuffer class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method fxn_for_op : ClassVar = shape_fxn_for_op - # TODO: use generic types here to remove __init__ in specialized classes - def __init__(self, lbuf:Any): self._buf, self.shape = lbuf, tuple(lbuf.shape) + def __init__(self, lbuf:Any): self._buf, self.shape, self.dtype = lbuf, tuple(lbuf.shape), self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype def contiguous(self): return type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,))) def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self._buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self._buf, op.name.lower())(arg)) @classmethod @@ -101,12 +104,11 @@ class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method else: ret = cls(cls.fxn_for_op[ast.op](*([x._buf for x in srcs] + ([ast.arg] if ast.arg else [])))) context[ast] = ret if output_buffer is not None: - assert output_buffer.shape == ret.shape + assert output_buffer.shape == ret.shape and output_buffer.dtype == ret.dtype output_buffer._buf = ret._buf return output_buffer else: return ret -def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape)) for x in get_buffers(ast)}, ast))._buf class ASTRunner: def __init__(self, name, prg, bufs_to_delete:Optional[Set[int]]=None, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0): @@ -139,7 +141,7 @@ class ASTRunner: def optimize_local_size(self, rawbufs:List[RawBuffer]) -> List[int]: assert self.global_size is not None, 
"needs a global size to optimize local size" if any(x == rawbufs[0] for x in rawbufs[1:]): # this is an assignment, replace the output buffer - output_replacement = type(rawbufs[0])(rawbufs[0].size) + output_replacement = type(rawbufs[0])(rawbufs[0].size, rawbufs[0].dtype) rawbufs = [output_replacement if x == rawbufs[0] else x for x in rawbufs] MAX_WORKGROUP = self.clprg.max_work_group_size() if hasattr(self.clprg, 'max_work_group_size') else 1024 local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in self.global_size] @@ -149,32 +151,36 @@ class ASTRunner: # assumes you are using ShapeTracker # used in GPUBuffer and LLVMBuffer class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method - def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False): + def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32): self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape)) self.shape = self.st.shape + self.dtype = dtype + assert hostbuf is None or hostbuf.dtype == dtype, f"hostbuf dtype {hostbuf.dtype} != {dtype}" self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape self._buf = hostbuf._buf if hostbuf is not None else None self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing if (self._backing is not None and self._backing.shape != (1,)) or force_create: self.raw() # TODO: not GPUBuffer, get name of class - def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))") + # TODO: needs dtype + def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__})))" if self._backing is not None else ", force_create=True))") raw_buffer_type : Type[RawBuffer] @classmethod - def create_raw_buffer(cls, shape, backing) -> RawBuffer: + def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer: assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape" assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching" - return cls.raw_buffer_type(4*prod(shape)) if backing is None else cls.raw_buffer_type.fromCPU(backing) + if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}") + return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing) def raw(self) -> RawBuffer: if self._buf is None: if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}") - self._buf = self.create_raw_buffer(self._base_shape, self._backing) + self._buf = self.create_raw_buffer(self._base_shape, self._backing, self.dtype) self._backing = None return self._buf @classmethod - def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel()) + def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.ravel(), dtype=dtypes.from_np(x)) def toCPU(self) -> np.ndarray: 
assert GlobalCounters.cache is None, f"can't copy out {self} while caching" if DEBUG >= 3: print(f"**** copy out {self.shape}") @@ -192,7 +198,7 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method prg = cls.method_cache[k.key] else: prg = k.codegen().build(cls.runtime_type) - if getenv("PRINT_AST", "") == prg.name: + if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1": k.print() print(prg.prg) prg.exec(k.bufs) diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index 23694d2a0..0fc6f287e 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -2,21 +2,22 @@ import os, time, ctypes, hashlib, subprocess, platform import numpy as np from collections import defaultdict from typing import Final, Dict +from tinygrad.helpers import dtypes, DType from tinygrad.ops import CompiledBuffer, RawBufferCopyIn from tinygrad.codegen.gpu import GPUCodegen, GPULanguage class RawMallocBuffer(RawBufferCopyIn): - def __init__(self, size): - super().__init__(size) - self._buf = (ctypes.c_float * (size//4))() + def __init__(self, size, dtype : DType): + super().__init__(size, dtype) + self._buf = ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16}[dtype] * size)() def _buffer(self): return self._buf - def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*4) - def toCPU(self): return np.ctypeslib.as_array(self._buf) + def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*np.dtype(x.dtype).itemsize) + def toCPU(self): return np.frombuffer(self._buf, dtype=self.dtype.np) class ClangProgram: kernel_cnt : Final[Dict[str, int]] = defaultdict(int) def __init__(self, name:str, prg:str): - prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n" + prg + prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n" + prg # TODO: is there a way to not write this to disk? 
fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if platform.system() == 'Darwin' else 'so'}" if not os.path.exists(fn): diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 0cec65a79..921b87140 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -1,6 +1,7 @@ import numpy as np import operator from typing import ClassVar, Callable, Dict, Tuple +from tinygrad.helpers import dtypes from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, InterpretedBuffer, Op def shape_to_axis(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Tuple[int, ...]: @@ -35,6 +36,7 @@ numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ class CPUBuffer(InterpretedBuffer): fxn_for_op : ClassVar = numpy_fxn_for_op + to_tinygrad_dtype = staticmethod(dtypes.from_np) @staticmethod def fromCPU(x): return CPUBuffer(x) diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index ef9d29e25..b7ba4c56b 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -8,9 +8,9 @@ from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut from tinygrad.codegen.gpu import GPUCodegen, GPULanguage class RawCUDABuffer(RawBufferCopyInOut): - def __init__(self, size): - super().__init__(size) - self._cl = cuda.mem_alloc(size) + def __init__(self, size, dtype): + super().__init__(size, dtype) + self._cl = cuda.mem_alloc(self._memsz) def copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, x, stream) def copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._cl) @@ -42,6 +42,7 @@ class CUDAProgram: class CUDACodegen(GPUCodegen): lang = GPULanguage( kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4", + half_prekernel = "#include ", gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)], lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index ae1821578..ec9f45829 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -2,15 +2,13 @@ from __future__ import annotations import platform, functools import numpy as np import pyopencl as cl # type: ignore -from typing import Dict, Optional, List, ClassVar, Final -from collections import defaultdict -from tinygrad.helpers import IMAGE, DEBUG, getenv +from typing import Optional, List, Final +from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer from tinygrad.codegen.gpu import GPUCodegen, GPULanguage OSX = platform.system() == "Darwin" OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. 
it's in like GPU clocks or something -CLCACHE = getenv("CLCACHE", 1) FLOAT16 = getenv("FLOAT16", 0) class _CL: @@ -27,33 +25,18 @@ class _CL: CL = _CL() class CLBuffer(RawBufferCopyInOut): - # TODO: this can be in RawBuffer generically - BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list) - - def __init__(self, size): # pylint: disable=super-init-not-called - self.size = size - if len(CLBuffer.BUFFER_CACHE[size]) > 0: - self._cl = CLBuffer.BUFFER_CACHE[size].pop() - else: - # TODO: on GPU OOM, clear the cache - self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size) - GlobalCounters.mem_used += self._cl.size - - def __del__(self): - if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl) - else: GlobalCounters.mem_used -= self._cl.size - + def __init__(self, size, dtype): + super().__init__(size, dtype) + self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, self._memsz) def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False) def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True) class CLImage(RawBuffer): # pylint: disable=abstract-method - fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT) IMAGE : Final = True - - def __init__(self, shape): # pylint: disable=super-init-not-called - self.size, self._cl = shape, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0])) + def __init__(self, shape, dtype=dtypes.float16 if getenv("FLOAT16") else dtypes.float32): # pylint: disable=super-init-not-called + fmt = cl.ImageFormat(cl.channel_order.RGBA, {dtypes.float16: cl.channel_type.HALF_FLOAT, dtypes.float32: cl.channel_type.FLOAT}[dtype]) + self.size, self.dtype, self._cl = shape, dtype, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=(shape[1], shape[0])) GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height - def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height @functools.lru_cache(maxsize=None) @@ -89,6 +72,7 @@ class CLProgram: class CLCodegen(GPUCodegen): lang = GPULanguage( kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ", + half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable", barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)", gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)]) @@ -96,8 +80,8 @@ class GPUBuffer(CompiledBuffer): raw_buffer_type = CLBuffer # override this method for image @classmethod - def create_raw_buffer(cls, shape, backing) -> RawBuffer: - if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) - else: return super().create_raw_buffer(shape, backing) + def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer: + if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) # NOTE: this is a hack. 
we don't pass in the dtype here, it's controlled by the FLOAT16 env var + else: return super().create_raw_buffer(shape, backing, dtype) codegen_type = CLCodegen runtime_type = CLProgram diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index 2b156c9c7..b2e778ab4 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -57,7 +57,7 @@ class LLVMProgram: def __del__(self): LLVM.engine.remove_module(self.mod) def __call__(self, unused_global_size, unused_local_size, *bufs, wait=False): - cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for _ in bufs])(self.fxn) + cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_void_p for _ in bufs])(self.fxn) if wait: st = time.monotonic() cfunc(*[x._buf for x in bufs]) if wait: return time.monotonic()-st diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 5f1bdb4ae..f8fa1563a 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -4,7 +4,7 @@ import Metal, Cocoa, libdispatch # type: ignore import numpy as np from typing import List, Any from tinygrad.codegen.gpu import GPUCodegen, GPULanguage -from tinygrad.helpers import prod, getenv, DEBUG +from tinygrad.helpers import prod, getenv, DEBUG, DType from tinygrad.ops import CompiledBuffer, RawBufferCopyIn METAL_XCODE = getenv("METAL_XCODE") @@ -20,14 +20,14 @@ class _METAL: METAL = _METAL() class RawMetalBuffer(RawBufferCopyIn): - def __init__(self, size): - super().__init__(size) - self._cl = METAL.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared) + def __init__(self, size:int, dtype:DType): + super().__init__(size, dtype) + self._cl = METAL.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared) def __del__(self): self._cl.release() super().__del__() def _buffer(self): return self._cl.contents().as_buffer(self._cl.length()) - def _as_np(self, dtype=np.float32): return np.frombuffer(self._buffer(), dtype=dtype) + def _as_np(self): return np.frombuffer(self._buffer(), dtype=self.dtype.np) def copyin(self, x:np.ndarray): np.copyto(self._as_np(), x.reshape(-1).data) def toCPU(self) -> np.ndarray: for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted() diff --git a/tinygrad/runtime/ops_torch.py b/tinygrad/runtime/ops_torch.py index cd820152e..fdc2dd084 100644 --- a/tinygrad/runtime/ops_torch.py +++ b/tinygrad/runtime/ops_torch.py @@ -1,7 +1,7 @@ import torch from typing import ClassVar, Dict, Callable from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, InterpretedBuffer, Op -from tinygrad.helpers import getenv +from tinygrad.helpers import getenv, dtypes from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ @@ -14,6 +14,7 @@ torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu")) class TorchBuffer(InterpretedBuffer): fxn_for_op : ClassVar = torch_fxn_for_op + to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype]) @staticmethod def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device)) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 9c91fa80b..a3aa08b55 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -3,7 +3,7 @@ from __future__ import annotations import math, functools, itertools import numpy as np from 
typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence -from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten +from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten, DType, dtypes from tinygrad.lazy import Device, LazyBuffer, LazyNumpyArray from tinygrad.image import image_conv2d_decorator, image_dot_decorator @@ -32,18 +32,20 @@ class Tensor: __deletable__ = ('_ctx',) training : ClassVar[bool] = False no_grad : ClassVar[bool] = False + default_type : DType = dtypes.float32 - def __init__(self, data, device=Device.DEFAULT, requires_grad:Optional[bool]=None): + def __init__(self, data, device=Device.DEFAULT, dtype:Optional[DType]=None, requires_grad:Optional[bool]=None): if isinstance(data, list): - data = np.array(data, dtype=np.float32) + data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np) elif isinstance(data, LazyBuffer) and data.device != device: # TODO: this has to realize, it shouldn't have to data = data.realize().toCPU() if isinstance(data, (np.ndarray, LazyNumpyArray)): data = data if data.shape else data.reshape((1,)) - self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device) + self.lazydata = LazyBuffer.fromCPU(data.astype(dtype.np) if dtype is not None else data, device) elif isinstance(data, LazyBuffer): + assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported" self.lazydata = data else: raise RuntimeError(f"can't create Tensor from {data}") @@ -64,13 +66,12 @@ class Tensor: @property def shape(self) -> Tuple[int, ...]: return self.lazydata.shape - # dtype handling was very broken. it's always float32 now - @property - def dtype(self) -> type: return np.float32 - @property def device(self) -> str: return self.lazydata.device + @property + def dtype(self) -> DType: return self.lazydata.dtype + # ***** data handlers **** def realize(self) -> Tensor: @@ -132,11 +133,11 @@ class Tensor: def manual_seed(seed=None): Tensor._rng = np.random.default_rng(seed=seed) @staticmethod - def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.random(size=shape, dtype=np.float32), shape), **kwargs) + def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.random(size=shape, dtype=dtype), shape, np.float32), **kwargs) # TODO: replace with a transformation from uniform -> gaussian @staticmethod - def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.standard_normal(size=shape, dtype=np.float32), shape), **kwargs) + def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.standard_normal(size=shape, dtype=dtype), shape, np.float32), **kwargs) # ***** rng hlops ***** @@ -442,10 +443,16 @@ class Tensor: def dropout(self, p=0.5) -> Tensor: if not Tensor.training: return self - _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype) + # TODO: why is this going through numpy? + _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=np.float32) return self * Tensor(_mask, requires_grad=False, device=self.device) * (1/(1.0 - p)) + # ***** cast ops ***** + + # TODO: this is a hack, but if we add float(0), it will become a float. 
need real casting support + def float(self) -> Tensor: return self.add(Tensor([0], device=self.device, dtype=dtypes.float32, requires_grad=self.requires_grad)) + # register functions to move between devices -for device in [device for device in Device._buffers.keys() if device[0] != "_"]: +for device in Device._buffers: setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, device)) setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, device))
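
A minimal usage sketch of the DType/dtypes helpers added in tinygrad/helpers.py above; it only exercises definitions introduced by this patch (plus numpy) and assumes the patched tree is on the path.

import numpy as np
from tinygrad.helpers import dtypes

print(dtypes.float16)                                    # dtypes.half (DType.__repr__ prints the name field)
print(dtypes.float16.itemsize, dtypes.float32.itemsize)  # 2 4
print(dtypes.from_np(np.zeros(4, dtype=np.float32)))     # dtypes.float

# DType is a NamedTuple ordered (itemsize, name, np), so max() picks the wider type
# by itemsize first; elementwise_op and the BinaryOps shape function rely on this to upcast.
print(max(dtypes.float16, dtypes.float32))               # dtypes.float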
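
A Tensor-level sketch mirroring what test/test_dtype.py checks; float16 support varies by backend (the test skips GPU and LLVM in CI), so this assumes a backend with working half support.

import numpy as np
from tinygrad.tensor import Tensor, dtypes

a = Tensor([1, 2, 3, 4], dtype=dtypes.float16)  # explicit dtype via the new keyword argument
b = Tensor([1, 2, 3, 4])                        # defaults to Tensor.default_type (float32)

c = a + b                                       # binary ops promote to the widest input dtype
assert c.dtype == dtypes.float32

d = a.float()                                   # the add-zero upcast hack from tensor.py above
assert d.numpy().dtype == np.float32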
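
The Device singleton in tinygrad/lazy.py now resolves backends lazily; a short sketch of the new lookup path, assuming a checkout with this diff applied and no backend env vars set.

from tinygrad.lazy import Device

print(Device.DEFAULT)        # "CPU" unless a backend env var like GPU=1 or TORCH=1 is set
CPUBuffer = Device["CPU"]    # first access imports tinygrad.runtime.ops_cpu and caches the class
print(CPUBuffer.__name__)    # CPUBuffer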