
dtypes nice and clean (#673)

* add dtype class

* dtypes

* buffers are lazy

* dtype is tracked by lazybuffer and GenericShape

* fix types in llvm

* llvm store

* dtype tests

* fix tests maybe

* fix flop counter

* fix CI

* CI fix and check format

* fix dtype and dtype check

* fix custom test

* fix test graph
pull/676/head
George Hotz 2023-03-10 16:56:07 -08:00 committed by GitHub
parent d26345595d
commit 1826ff6b89
20 changed files with 215 additions and 141 deletions
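For orientation, here is a minimal sketch of the user-facing API this commit adds, mirroring test/test_dtype.py further down: tensors carry a DType, binary ops upcast to the wider operand, and .float() upcasts half to float32 (there is no real downcasting yet). Illustrative only, not part of the diff.

import numpy as np
from tinygrad.tensor import Tensor, dtypes

a = Tensor([1, 2, 3, 4], dtype=dtypes.float16)
b = Tensor([1, 2, 3, 4], dtype=dtypes.float32)
c = a + b
assert c.dtype == dtypes.float32               # binary ops upcast to the wider operand
assert a.float().numpy().dtype == np.float32   # .float() upcasts half by adding a float32 zero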


@ -164,6 +164,7 @@ jobs:
- name: Test openpilot model
run: |
ALLOWED_KERNEL_COUNT=197 FLOAT16=1 VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py


@ -1,4 +1,4 @@
# this can be constructed from a cl_cache or loaded from a thneed file
import time
import struct
import json
@ -26,7 +26,7 @@ class Thneed:
for a in args[3:]:
nodes[a]['out_edges'].append(args[2])
nodes[args[2]]['in_edges'].append(a)
# get buffers to save
self.buffers_to_save = set()
self.outputs = []
@ -35,7 +35,7 @@ class Thneed:
self.buffers_to_save.add(n)
if len(nodes[n]['out_edges']) == 0:
self.outputs.append(n)
fake_inputs = []
for k,n in self.inputs.items():
if n in self.buffers_to_save:
@ -97,7 +97,7 @@ class Thneed:
else:
# zero out buffers
buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size'])
bufs[o['id']] = buf
bufs_loaded[o['id']] = 'data' in o
# if it's loaded, it's saved
@ -114,13 +114,13 @@ class Thneed:
print("FAILED", k)
traceback.print_exc()
exit(0)
# load binaries
for o in jdat['binaries']:
nptr = ptr + o['length']
prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], binary=True)
ptr = nptr
# populate the cl_cache
for i,k in enumerate(jdat['kernels']):
kernel = prgs[k['name']]
@ -166,7 +166,7 @@ class Thneed:
jdat['binaries'].append({"name":prg.name, "length":len(binary[0])})
binaries.append(binary[0])
saved_binaries.add(prg.name)
# get the args from the kernel, some need the data saved
targs, args_size = [], []
argdtypes = prg.argdtypes if prg.argdtypes is not None else [None]*(len(args)-2)
@ -196,6 +196,7 @@ class Thneed:
cl.enqueue_copy(CL.cl_queue, data, a, is_blocking=True)
weights.append(data.tobytes())
elif isinstance(a, cl.Image):
assert a.format == cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT), "wrong type"
needs_load = a in self.buffers_to_save
row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64
size = row_pitch * a.shape[1]
@ -244,7 +245,7 @@ class Thneed:
"local_work_size": [1 for _ in args[0]] if args[1] is None else args[1],
"num_args": len(args)-2,
"args": targs,
"args_size": args_size
"args_size": args_size
})
jdat['outputs'] = [{


@ -0,0 +1,13 @@
#!/bin/bash -e
echo "********* CPU *********"
CPU=1 python3 $@
echo "********* GPU *********"
GPU=1 python3 $@
echo "********* METAL *********"
METAL=1 python3 $@
echo "********* CLANG *********"
CLANG=1 python3 $@
echo "********* LLVM *********"
LLVM=1 python3 $@
echo "********* TORCH *********"
TORCH=1 python3 $@


@ -4,7 +4,7 @@
import unittest
import numpy as np
from typing import Optional, Tuple
from tinygrad.helpers import prod
from tinygrad.helpers import prod, dtypes
# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
@ -16,6 +16,7 @@ from tinygrad.runtime.ops_cpu import CPUBuffer
def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer:
from tinygrad.runtime.ops_gpu import GPUBuffer
assert type(a) == GPUBuffer and type(b) == GPUBuffer, "gpu function requires GPUBuffers"
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
ret = GPUBuffer(a.shape)
ASTRunner("atan2", """
__kernel void atan2(global float *c, global float *a, global float *b) {
@ -40,7 +41,7 @@ class ATan2(Function):
assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
self.a, self.b = a, b
ast = LazyOp(LoadOps.CUSTOM, (a, b), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device])
return LazyBuffer(a.device, a.shape, LoadOps, ast)
return LazyBuffer(a.device, a.shape, LoadOps, ast, max(a.dtype, b.dtype))
def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
denom = (self.a.binary_op(BinaryOps.MUL, self.a)).binary_op(BinaryOps.ADD, self.b.binary_op(BinaryOps.MUL, self.b))
return grad_output.binary_op(BinaryOps.MUL, self.b.binary_op(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \

test/test_dtype.py (new file, mode 100644, 41 lines)

@ -0,0 +1,41 @@
import unittest
import numpy as np
from tinygrad.helpers import getenv
from tinygrad.lazy import Device
from tinygrad.tensor import Tensor, dtypes
# for GPU, cl_khr_fp16 isn't supported
# for LLVM, it segfaults because it can't link to the casting function
@unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["GPU", "LLVM"], "float16 broken in some CI backends")
class TestDtype(unittest.TestCase):
def test_half_to_np(self):
a = Tensor([1,2,3,4], dtype=dtypes.float16)
print(a)
na = a.numpy()
print(na, na.dtype, a.lazydata.realized)
assert na.dtype == np.float16
def test_half_add(self):
a = Tensor([1,2,3,4], dtype=dtypes.float16)
b = Tensor([1,2,3,4], dtype=dtypes.float16)
c = a+b
print(c.numpy())
assert c.dtype == dtypes.float16
def test_upcast_float(self):
# NOTE: there's no downcasting support
a = Tensor([1,2,3,4], dtype=dtypes.float16).float()
print(a)
na = a.numpy()
print(na, na.dtype)
assert na.dtype == np.float32
def test_half_add_upcast(self):
a = Tensor([1,2,3,4], dtype=dtypes.float16)
b = Tensor([1,2,3,4], dtype=dtypes.float32)
c = a+b
print(c.numpy())
assert c.dtype == dtypes.float32
if __name__ == '__main__':
unittest.main()


@ -14,10 +14,10 @@ class TestGraph(unittest.TestCase):
assert nx.is_isomorphic(G, RG, node_match=lambda x,y: x["label"] == y["label"], edge_match=lambda x,y: x["label"] == y["label"] if "label" in y else True)
def test_add_graph(self):
a = CPUBuffer.fromCPU(np.ones((4,4)))
b = CPUBuffer.fromCPU(np.ones((4,4)))
a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
ast = LazyOp(BinaryOps.ADD, (a,b))
ret = CPUBuffer(np.ones((4,4)))
ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
RG = nx.DiGraph()
RG.add_node(0, label="(4, 4)")
@ -30,12 +30,12 @@ class TestGraph(unittest.TestCase):
self.helper_compare_graph(RG)
def test_add_sum_graph(self):
a = CPUBuffer.fromCPU(np.ones((4,4)))
b = CPUBuffer.fromCPU(np.ones((1,1)))
a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
b = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32))
op0 = LazyOp(MovementOps.RESHAPE, (b,), (4, 4))
op1 = LazyOp(BinaryOps.ADD, (a,op0))
ast = LazyOp(ReduceOps.SUM, (op1,), (1,1))
ret = CPUBuffer(np.ones((1,1)))
ret = CPUBuffer(np.ones((1,1), dtype=np.float32))
RG = nx.DiGraph()
RG.add_node(0, label="(4, 4)")
@ -48,14 +48,14 @@ class TestGraph(unittest.TestCase):
self.helper_compare_graph(RG)
def test_add_graph_prune(self):
a = CPUBuffer.fromCPU(np.ones((1,1)))
a = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32))
ast = LazyOp(MovementOps.RESHAPE, (a,), (4, 4))
ret = CPUBuffer(np.ones((4,4)))
ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
log_op(ret, ast, show_graph=True)
b = CPUBuffer.fromCPU(np.ones((4,4)))
b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
ast = LazyOp(BinaryOps.ADD, (ret,b))
ret = CPUBuffer(np.ones((4,4)))
ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
log_op(ret, ast, show_graph=True)
prune_graph()


@ -1,7 +1,7 @@
import itertools
from enum import Enum, auto
from typing import List, Tuple
from tinygrad.helpers import prod, dedup, all_same, colored
from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers
from tinygrad.shape import ShapeTracker, View, strides_for_shape
@ -26,7 +26,7 @@ class Token:
if len(self.axis) == 0: return [0]
acc_strides = [x*(1-self.axis[::-1][i][2]) for i,x in enumerate(strides_for_shape(tuple(1 if r else s for s,_,r in self.axis[::-1])))]
return [sum(t) for t in itertools.product(*[[y*acc_strides[i] for y in range(x[0])] for i,x in enumerate(self.axis[::-1])])]
def decltype(self): return ('float' if self.typ == Types.FLOAT else 'float4') + ('*' if self.ptr else str())
def decltype(self, dtype=dtypes.float32): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str())
def __repr__(self): return f"<{self.typ}{'*' if self.ptr else str()} {self.tok}{f'[{self.axis}]' if len(self.axis) else str()}>"
# ast kernel can contain one ReduceOp with arbitrary Binary/Unary ops
@ -55,8 +55,9 @@ class ASTKernel:
break
# create the buffer we are returning (as the same type as the input buffers) and add it as the first buffer
self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True)
self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret)] if output_shape else [self.ret]) + self.bufs
self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True, dtype=self.info.dtype)
assert self.ret.dtype == self.info.dtype, f"return dtype {self.ret.dtype} != {self.info.dtype}"
self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret, dtype=self.info.dtype)] if output_shape else [self.ret]) + self.bufs
# key for lookup in cache (can change, str might not be right)
# bufs are needed because kernels like f(x) = x + x and f(x, y) = x + y have the same str(ast), but are different kernels.
@ -121,7 +122,7 @@ class ASTKernel:
if print_shapetrackers:
for st in self.sts: print(st)
for i in range(len(self.sts)):
print(prefix, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
print(prefix, self.bufs[i].dtype, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
@property
def shape_len(self) -> int: return len(self.sts[0].shape)


@ -25,6 +25,7 @@ class GPULanguage(NamedTuple):
lid : List[str] = []
extra_args : List[str] = []
float4 : Optional[str] = None
half_prekernel : Optional[str] = None
def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]:
idy = (idxy//(4*base_shape[1]))
@ -266,6 +267,8 @@ class GPUCodegen(ASTKernel):
self.prekernel : Set[str] = set()
self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs if buf is not None) else []
if self.lang.half_prekernel: self.prekernel.add(self.lang.half_prekernel+"\n")
if len(self.lang.gid) == 0:
self.kernel += [f"for (int idx{i} = 0; idx{i} < {self.output_shape[i]}; idx{i}++) {{\n" for i in range(0, len(self.output_shape))]
else:
@ -320,7 +323,7 @@ class GPUCodegen(ASTKernel):
self.kernel.append("\n}")
# concat kernel into prg
buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype()+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None]
buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype(self.bufs[i].dtype)+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None]
prg = ' '.join(list(self.prekernel) + [f"{self.lang.kernel_prefix} void KERNEL_NAME_PLACEHOLDER(",] +
[', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + self.lang.extra_args)] +
[") {\n"] + self.kernel)


@ -3,7 +3,7 @@ from typing import ClassVar, List
from llvmlite import ir # type: ignore
from tinygrad.codegen.ast import ASTKernel
from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, ASTRunner
from tinygrad.helpers import DEBUG, prod
from tinygrad.helpers import DEBUG, prod, dtypes
from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, GeNode, LtNode, SumNode, AndNode
def int_const(x): return ir.Constant(ir.IntType(64), x)
@ -105,7 +105,8 @@ class LLVMCodegen(ASTKernel):
# create llvm function
module = ir.Module(name=__file__)
func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(self.bufs))), name='exec')
func_dtypes = [{dtypes.float16:ir.HalfType(), dtypes.float32:ir.FloatType()}[buf.dtype] for buf in self.bufs]
func = ir.Function(module, ir.FunctionType(ir.VoidType(), [x.as_pointer() for x in func_dtypes]), name='exec')
# force llvmlite to allow us to add function attribute then add the attribute
func.attributes._known = func.attributes._known.union(frozenset(['"no-nans-fp-math"="true"']))
@ -143,9 +144,11 @@ class LLVMCodegen(ASTKernel):
# this always does the load, so we have it load *0 if the arg won't be used
# TODO: would control flow be faster?
aug_idx = builder.select(valid, idx, int_const(0))
element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(ir.FloatType(), 0))
element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(func_dtypes[buf_index], 0))
else:
element = builder.load(builder.gep(func.args[buf_index], [idx], inbounds=True))
# upcast
if func_dtypes[buf_index] != ir.FloatType(): element = builder.fpext(element, ir.FloatType())
m = element if kernel_output_dim == 1 else builder.insert_element(m, element, int_const(i))
return m
if isinstance(x.op, ReduceOps):
@ -194,6 +197,7 @@ class LLVMCodegen(ASTKernel):
builder = loop_exit[store_loop]
for i, idx in enumerate(get_idxs(builder, idx_level[0][store_loop], 0)):
element = result if kernel_output_dim == 1 else builder.extract_element(result, int_const(i))
if func_dtypes[0] != ir.FloatType(): element = builder.fptrunc(element, func_dtypes[0])
builder.store(element, builder.gep(func.args[0], [idx], inbounds=True))
# add the looping
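The two lines above are the whole half-precision story on the LLVM backend: loads are widened with fpext so arithmetic stays in float32, and stores to a half output are narrowed with fptrunc. A self-contained llvmlite sketch of that pattern (a toy with made-up function and value names, not the tinygrad codegen):

from llvmlite import ir

module = ir.Module(name="half_demo")
fnty = ir.FunctionType(ir.VoidType(), [ir.HalfType().as_pointer(), ir.HalfType().as_pointer()])
func = ir.Function(module, fnty, name="add_one_half")
builder = ir.IRBuilder(func.append_basic_block("entry"))
src, dst = func.args
x = builder.load(src)                               # load a half
xf = builder.fpext(x, ir.FloatType())               # upcast: compute in float32
yf = builder.fadd(xf, ir.Constant(ir.FloatType(), 1.0))
y = builder.fptrunc(yf, ir.HalfType())              # narrow back for the half output
builder.store(y, dst)
builder.ret_void()
print(module)                                       # dump the generated IR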


@ -1,5 +1,6 @@
import os, math, functools
from typing import Tuple, Union, List
import numpy as np
from typing import Tuple, Union, List, NamedTuple
def dedup(x): return list(dict.fromkeys(x)) # retains list order
def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x)
@ -16,3 +17,17 @@ def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"
def getenv(key, default=0): return type(default)(os.getenv(key, default))
DEBUG, IMAGE = getenv("DEBUG", 0), getenv("IMAGE", 0)
# **** tinygrad now supports dtypes! *****
class DType(NamedTuple):
itemsize : int
name : str
np : type # TODO: someday this will be removed with the "remove numpy" project
def __repr__(self): return f"dtypes.{self.name}"
class dtypes:
float16 = half = DType(2, "half", np.float16)
float32 = float = DType(4, "float", np.float32)
@staticmethod
def from_np(x:np.ndarray) -> DType: return {np.dtype(np.float16): dtypes.float16, np.dtype(np.float32): dtypes.float32}[np.dtype(x.dtype)]
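Because DType is a NamedTuple with itemsize as its first field, tuple comparison orders dtypes by width, which is exactly what the max(...) upcast rule used in lazy.py and ops.py relies on. A quick illustration (assumes only the tinygrad.helpers module shown above):

import numpy as np
from tinygrad.helpers import dtypes

assert max(dtypes.float16, dtypes.float32) == dtypes.float32   # 2-byte half < 4-byte float
assert dtypes.from_np(np.ones(4, dtype=np.float16)) == dtypes.float16
print(dtypes.float16)   # prints "dtypes.half" (repr uses the name field)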


@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type
import os, sys, weakref, importlib, inspect, functools
from weakref import WeakValueDictionary
from tinygrad.helpers import prod, getenv
from tinygrad.helpers import prod, getenv, DType, dtypes
from tinygrad.shape import ShapeTracker, get_contraction
from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
from tinygrad.graph import log_op
@ -13,21 +13,12 @@ sys.setrecursionlimit(10000)
OPT = getenv("OPT", 2)
LAZY = getenv("LAZY", 1)
def get_buffer(name, base='tinygrad.runtime'):
try:
return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0]
except Exception as e: # NOTE: this can't be put on one line due to mypy issue
print(name, "backend not available", e, file=sys.stderr)
class _Device:
def __init__(self) -> None:
# TODO: make this dynamic to when you try to access the _buffers
self._buffers : Dict[str, Type[DeviceBuffer]] = {x.upper():get_buffer(x) for x in
[os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")] if x is not None}
self.DEFAULT : str = "CPU"
for name in self._buffers:
if getenv(name) == 1: self.DEFAULT = name # note: DEFAULT can be a Device that can't be imported. better than silent use of a different device
if self._buffers[name] is not None: self.__setattr__(name, name)
self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]}
self.DEFAULT : str = functools.reduce(lambda val, ele: val if getenv(val) == 1 else ele, self._buffers, "CPU")
@functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0]
Device = _Device()
# TODO: movement ops that only change shape are really nops. treat them as such
@ -81,9 +72,9 @@ def replace_with_movement_op(y:Union[LazyOp, LazyBuffer], op:MovementOps, arg:Tu
return elementwise_op(y.op, *[replace_with_movement_op(z, op, arg) for z in y.src]) # type: ignore
class LazyNumpyArray:
def __init__(self, fxn, shape): self.fxn, self.shape = fxn, shape
def __call__(self): return self.fxn(self.shape)
def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape)
def __init__(self, fxn, shape, dtype): self.fxn, self.shape, self.dtype = fxn, shape, dtype
def __call__(self): return self.fxn(self.shape, self.dtype)
def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype)
def copy(self): return self
def astype(self, typ): return self
@ -91,32 +82,32 @@ def support_weakref(x): return x
@support_weakref # needed for mypyc, this prevents LazyBuffer from becoming a native class
class LazyBuffer:
__deletable__ = ('op',)
lazycache : ClassVar[WeakValueDictionary[Tuple[str, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary()
def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp):
lazycache : ClassVar[WeakValueDictionary[Tuple[str, DType, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary()
def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType):
# fromcpu aren't cached
if optype == LoadOps and op.op == LoadOps.FROMCPU:
return super().__new__(cls)
wop = (device, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker
wop = (device, dtype, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker
# NOTE: we need "ret" to prevent the new buffer from being immediately deleted
if wop not in LazyBuffer.lazycache: LazyBuffer.lazycache[wop] = ret = super().__new__(cls)
else: ret = LazyBuffer.lazycache[wop]
return ret
def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp):
def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType):
if hasattr(self, 'device'):
return # cache hit, we return and don't reinit
self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
self.shape, self.optype, self.op = self.st.shape, optype, op
self.shape, self.optype, self.op, self.dtype = self.st.shape, optype, op, dtype
self.realized : Optional[DeviceBuffer] = None
self.output_buffer : Optional[DeviceBuffer] = None
self.device, self.dbuffer = device, Device._buffers[device]
self.device, self.dbuffer = device, Device[device]
# TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
# NOTE: op should be read only after construction of LazyBuffer
for x in get_buffers(op): x.children.add(self)
if not LAZY: self.realize()
def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>"
def __repr__(self): return f"<LB {self.shape} {self.dtype} op:{self.op.op if self.realized is None else 'realized'}>"
# this produces a device buffer
def realize(self:LazyBuffer, required_device=None) -> DeviceBuffer:
@ -124,7 +115,7 @@ class LazyBuffer:
if self.realized is None:
# get real ops first
if self.op.op == LoadOps.FROMCPU:
self.realized = Device._buffers[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg)
self.realized = Device[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg)
ast = LazyOp(self.op.op, tuple())
elif self.op.op == LoadOps.CONTIGUOUS:
real_src = self.op.src[0].realize(self.device)
@ -160,12 +151,13 @@ class LazyBuffer:
log_op(self.realized, ast)
assert self.realized.shape == self.shape, f"shape mismatch on realize got {self.realized.shape} expected {self.shape}"
assert isinstance(self.realized, Device._buffers[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
assert isinstance(self.realized, Device[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
assert self.realized.dtype == self.dtype, f"dtype mismatch on realize got {self.realized.dtype} expected {self.dtype}"
return self.realized
# NOTE: we have to make a copy of the numpy array here in case the user changes it. expose this?
@staticmethod
def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()))
def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()), dtypes.from_np(x))
def toCPU(self):
ret = self.realize().toCPU()
log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
@ -173,11 +165,11 @@ class LazyBuffer:
def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
def binary_op(self:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, self, y)
def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)))
def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)), self.dtype)
def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
if self.shape == tuple(new_shape): return self
return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape))
return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape), self.dtype)
def movement_op(self:LazyBuffer, op:MovementOps, arg:Tuple[Any, ...]) -> LazyBuffer:
# very instant nop
@ -228,7 +220,7 @@ class LazyBuffer:
return replace_with_movement_op(self.op, op, arg)
# create the buffer
ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg))
ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg), self.dtype)
# if the ShapeTracker becomes contiguous, replace the whole thing with a reshape (or nothing if shapes match)
# NOTE: if ret is in the cache, it can already be realized
@ -241,7 +233,7 @@ class LazyBuffer:
return ret
def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffer:
out_device, out_shape = srcs[0].device, srcs[0].shape
out_device, out_shape, out_dtype = srcs[0].device, srcs[0].shape, max(x.dtype for x in srcs)
# push all contiguous to the end of BinaryOps. kernels 198 -> 196
if PUSH_CONTIGUOUS and any(x.realized is None and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1 for x in srcs):
@ -258,4 +250,4 @@ def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffe
# remove the buffers from any (childless) BinaryOps that feed into this
srcs = tuple(x.op if x.optype == BinaryOps and len(x.children) == 0 and x.realized is None else x for x in srcs) # type: ignore
return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs))
return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs), out_dtype)


@ -3,7 +3,7 @@ import functools, itertools, operator, random
import numpy as np
from enum import Enum, auto
from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict, TypeVar, Set
from tinygrad.helpers import prod, DEBUG, getenv
from tinygrad.helpers import prod, DEBUG, getenv, DType, dtypes
from tinygrad.shape import ShapeTracker
# these are the llops your accelerator must implement, along with toCpu
@ -39,17 +39,19 @@ class Copyable:
def toCPU(self:Copyable) -> np.ndarray: raise NotImplementedError("must be implemented")
class RawBuffer(Copyable): # pylint: disable=abstract-method
def __init__(self, size:int):
def __init__(self, size:int, dtype:DType):
self.size : int = size
GlobalCounters.mem_used += self.size
def __del__(self): GlobalCounters.mem_used -= self.size
self.dtype : DType = dtype
self._memsz : int = size*dtype.itemsize
GlobalCounters.mem_used += self._memsz
def __del__(self): GlobalCounters.mem_used -= self._memsz
class RawBufferCopyIn(RawBuffer):
def copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")
@classmethod
def fromCPU(cls, x:np.ndarray):
ret = cls(4*prod(x.shape))
ret = cls(prod(x.shape), dtypes.from_np(x))
ret.copyin(x)
return ret
@ -57,7 +59,7 @@ class RawBufferCopyInOut(RawBufferCopyIn):
def copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")
def toCPU(self) -> np.ndarray:
x = np.empty((self.size//4), dtype=np.float32)
x: np.ndarray = np.empty(self.size, dtype=self.dtype.np)
self.copyout(x)
return x
@ -65,26 +67,27 @@ class RawBufferCopyInOut(RawBufferCopyIn):
class DeviceBuffer(Copyable):
_buf: Any # underlying buffer
shape: Tuple[int, ...]
dtype: DType
@classmethod
def exec_ast(cls, ast:LazyOp, output_buffer=None): raise NotImplementedError("must be implemented")
# this is a quick "buffer" class for flop tracking and getting the output shape
class GenericShape:
def __init__(self, shape:Tuple[int, ...], flops:int=0): self.shape, self.flops = shape, flops
def __init__(self, shape:Tuple[int, ...], dtype:DType=dtypes.float32, flops:int=0): self.shape, self.dtype, self.flops = shape, dtype, flops
def consume_flops(self):
self.flops, ret = 0, self.flops
return ret
shape_fxn_for_op : Dict[Op, Callable] = {
**{op:lambda self: GenericShape(self.shape, self.consume_flops() + prod(self.shape)) for op in UnaryOps},
**{op:lambda self,y: GenericShape(self.shape, self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
**{op:lambda self,new_shape: GenericShape(new_shape, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
**{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.consume_flops()), op) for op in MovementOps}}
**{op:lambda self: GenericShape(self.shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in UnaryOps},
**{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
**{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
**{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}}
def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
# used in CPUBuffer and TorchBuffer
class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
fxn_for_op : ClassVar = shape_fxn_for_op
# TODO: use generic types here to remove __init__ in specialized classes
def __init__(self, lbuf:Any): self._buf, self.shape = lbuf, tuple(lbuf.shape)
def __init__(self, lbuf:Any): self._buf, self.shape, self.dtype = lbuf, tuple(lbuf.shape), self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
def contiguous(self): return type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,)))
def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self._buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self._buf, op.name.lower())(arg))
@classmethod
@ -101,12 +104,11 @@ class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
else: ret = cls(cls.fxn_for_op[ast.op](*([x._buf for x in srcs] + ([ast.arg] if ast.arg else []))))
context[ast] = ret
if output_buffer is not None:
assert output_buffer.shape == ret.shape
assert output_buffer.shape == ret.shape and output_buffer.dtype == ret.dtype
output_buffer._buf = ret._buf
return output_buffer
else:
return ret
def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape)) for x in get_buffers(ast)}, ast))._buf
class ASTRunner:
def __init__(self, name, prg, bufs_to_delete:Optional[Set[int]]=None, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0):
@ -139,7 +141,7 @@ class ASTRunner:
def optimize_local_size(self, rawbufs:List[RawBuffer]) -> List[int]:
assert self.global_size is not None, "needs a global size to optimize local size"
if any(x == rawbufs[0] for x in rawbufs[1:]): # this is an assignment, replace the output buffer
output_replacement = type(rawbufs[0])(rawbufs[0].size)
output_replacement = type(rawbufs[0])(rawbufs[0].size, rawbufs[0].dtype)
rawbufs = [output_replacement if x == rawbufs[0] else x for x in rawbufs]
MAX_WORKGROUP = self.clprg.max_work_group_size() if hasattr(self.clprg, 'max_work_group_size') else 1024
local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in self.global_size]
@ -149,32 +151,36 @@ class ASTRunner:
# assumes you are using ShapeTracker
# used in GPUBuffer and LLVMBuffer
class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False):
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32):
self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
self.shape = self.st.shape
self.dtype = dtype
assert hostbuf is None or hostbuf.dtype == dtype, f"hostbuf dtype {hostbuf.dtype} != {dtype}"
self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape
self._buf = hostbuf._buf if hostbuf is not None else None
self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing
if (self._backing is not None and self._backing.shape != (1,)) or force_create: self.raw()
# TODO: not GPUBuffer, get name of class
def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))")
# TODO: needs dtype
def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__})))" if self._backing is not None else ", force_create=True))")
raw_buffer_type : Type[RawBuffer]
@classmethod
def create_raw_buffer(cls, shape, backing) -> RawBuffer:
def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape"
assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching"
return cls.raw_buffer_type(4*prod(shape)) if backing is None else cls.raw_buffer_type.fromCPU(backing)
if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}")
return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing)
def raw(self) -> RawBuffer:
if self._buf is None:
if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}")
self._buf = self.create_raw_buffer(self._base_shape, self._backing)
self._buf = self.create_raw_buffer(self._base_shape, self._backing, self.dtype)
self._backing = None
return self._buf
@classmethod
def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.ravel(), dtype=dtypes.from_np(x))
def toCPU(self) -> np.ndarray:
assert GlobalCounters.cache is None, f"can't copy out {self} while caching"
if DEBUG >= 3: print(f"**** copy out {self.shape}")
@ -192,7 +198,7 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
prg = cls.method_cache[k.key]
else:
prg = k.codegen().build(cls.runtime_type)
if getenv("PRINT_AST", "") == prg.name:
if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1":
k.print()
print(prg.prg)
prg.exec(k.bufs)
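One subtle convention change in ops.py: RawBuffer.size is now an element count rather than a byte count (fromCPU passes prod(x.shape) plus a DType instead of 4*prod(x.shape)), and the figure tracked in GlobalCounters is size*dtype.itemsize. A tiny illustration using only the dtypes helper from this commit:

from tinygrad.helpers import dtypes

n = 1024                              # elements, not bytes
print(n * dtypes.float16.itemsize)    # 2048 bytes accounted for a half buffer
print(n * dtypes.float32.itemsize)    # 4096 bytes accounted for a float buffer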


@ -2,21 +2,22 @@ import os, time, ctypes, hashlib, subprocess, platform
import numpy as np
from collections import defaultdict
from typing import Final, Dict
from tinygrad.helpers import dtypes, DType
from tinygrad.ops import CompiledBuffer, RawBufferCopyIn
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
class RawMallocBuffer(RawBufferCopyIn):
def __init__(self, size):
super().__init__(size)
self._buf = (ctypes.c_float * (size//4))()
def __init__(self, size, dtype : DType):
super().__init__(size, dtype)
self._buf = ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16}[dtype] * size)()
def _buffer(self): return self._buf
def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*4)
def toCPU(self): return np.ctypeslib.as_array(self._buf)
def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*np.dtype(x.dtype).itemsize)
def toCPU(self): return np.frombuffer(self._buf, dtype=self.dtype.np)
class ClangProgram:
kernel_cnt : Final[Dict[str, int]] = defaultdict(int)
def __init__(self, name:str, prg:str):
prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n" + prg
prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n" + prg
# TODO: is there a way to not write this to disk?
fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if platform.system() == 'Darwin' else 'so'}"
if not os.path.exists(fn):
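Since ctypes has no half type, RawMallocBuffer backs float16 with a c_int16 array: copyin just memmoves the raw bytes, and toCPU reinterprets them with np.frombuffer. A standalone sketch of that round trip (variable names are mine; illustrative, not the class itself):

import ctypes
import numpy as np

x = np.array([1, 2, 3, 4], dtype=np.float16)
buf = (ctypes.c_int16 * x.size)()                        # 2 bytes per element, bits only
ctypes.memmove(buf, x.ctypes.data, x.size * x.itemsize)  # copyin: raw byte copy
out = np.frombuffer(buf, dtype=np.float16)               # toCPU: reinterpret bits as half
assert (out == x).all()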


@ -1,6 +1,7 @@
import numpy as np
import operator
from typing import ClassVar, Callable, Dict, Tuple
from tinygrad.helpers import dtypes
from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, InterpretedBuffer, Op
def shape_to_axis(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Tuple[int, ...]:
@ -35,6 +36,7 @@ numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
class CPUBuffer(InterpretedBuffer):
fxn_for_op : ClassVar = numpy_fxn_for_op
to_tinygrad_dtype = staticmethod(dtypes.from_np)
@staticmethod
def fromCPU(x): return CPUBuffer(x)


@ -8,9 +8,9 @@ from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
class RawCUDABuffer(RawBufferCopyInOut):
def __init__(self, size):
super().__init__(size)
self._cl = cuda.mem_alloc(size)
def __init__(self, size, dtype):
super().__init__(size, dtype)
self._cl = cuda.mem_alloc(self._memsz)
def copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, x, stream)
def copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._cl)
@ -42,6 +42,7 @@ class CUDAProgram:
class CUDACodegen(GPUCodegen):
lang = GPULanguage(
kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
half_prekernel = "#include <cuda_fp16.h>",
gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)],
lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])


@ -2,15 +2,13 @@ from __future__ import annotations
import platform, functools
import numpy as np
import pyopencl as cl # type: ignore
from typing import Dict, Optional, List, ClassVar, Final
from collections import defaultdict
from tinygrad.helpers import IMAGE, DEBUG, getenv
from typing import Optional, List, Final
from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
OSX = platform.system() == "Darwin"
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
CLCACHE = getenv("CLCACHE", 1)
FLOAT16 = getenv("FLOAT16", 0)
class _CL:
@ -27,33 +25,18 @@ class _CL:
CL = _CL()
class CLBuffer(RawBufferCopyInOut):
# TODO: this can be in RawBuffer generically
BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)
def __init__(self, size): # pylint: disable=super-init-not-called
self.size = size
if len(CLBuffer.BUFFER_CACHE[size]) > 0:
self._cl = CLBuffer.BUFFER_CACHE[size].pop()
else:
# TODO: on GPU OOM, clear the cache
self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)
GlobalCounters.mem_used += self._cl.size
def __del__(self):
if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)
else: GlobalCounters.mem_used -= self._cl.size
def __init__(self, size, dtype):
super().__init__(size, dtype)
self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, self._memsz)
def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)
def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)
class CLImage(RawBuffer): # pylint: disable=abstract-method
fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)
IMAGE : Final = True
def __init__(self, shape): # pylint: disable=super-init-not-called
self.size, self._cl = shape, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))
def __init__(self, shape, dtype=dtypes.float16 if getenv("FLOAT16") else dtypes.float32): # pylint: disable=super-init-not-called
fmt = cl.ImageFormat(cl.channel_order.RGBA, {dtypes.float16: cl.channel_type.HALF_FLOAT, dtypes.float32: cl.channel_type.FLOAT}[dtype])
self.size, self.dtype, self._cl = shape, dtype, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=(shape[1], shape[0]))
GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height
@functools.lru_cache(maxsize=None)
@ -89,6 +72,7 @@ class CLProgram:
class CLCodegen(GPUCodegen):
lang = GPULanguage(
kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",
half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])
@ -96,8 +80,8 @@ class GPUBuffer(CompiledBuffer):
raw_buffer_type = CLBuffer
# override this method for image
@classmethod
def create_raw_buffer(cls, shape, backing) -> RawBuffer:
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)
else: return super().create_raw_buffer(shape, backing)
def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
else: return super().create_raw_buffer(shape, backing, dtype)
codegen_type = CLCodegen
runtime_type = CLProgram


@ -57,7 +57,7 @@ class LLVMProgram:
def __del__(self): LLVM.engine.remove_module(self.mod)
def __call__(self, unused_global_size, unused_local_size, *bufs, wait=False):
cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for _ in bufs])(self.fxn)
cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_void_p for _ in bufs])(self.fxn)
if wait: st = time.monotonic()
cfunc(*[x._buf for x in bufs])
if wait: return time.monotonic()-st


@ -4,7 +4,7 @@ import Metal, Cocoa, libdispatch # type: ignore
import numpy as np
from typing import List, Any
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
from tinygrad.helpers import prod, getenv, DEBUG
from tinygrad.helpers import prod, getenv, DEBUG, DType
from tinygrad.ops import CompiledBuffer, RawBufferCopyIn
METAL_XCODE = getenv("METAL_XCODE")
@ -20,14 +20,14 @@ class _METAL:
METAL = _METAL()
class RawMetalBuffer(RawBufferCopyIn):
def __init__(self, size):
super().__init__(size)
self._cl = METAL.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
def __init__(self, size:int, dtype:DType):
super().__init__(size, dtype)
self._cl = METAL.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared)
def __del__(self):
self._cl.release()
super().__del__()
def _buffer(self): return self._cl.contents().as_buffer(self._cl.length())
def _as_np(self, dtype=np.float32): return np.frombuffer(self._buffer(), dtype=dtype)
def _as_np(self): return np.frombuffer(self._buffer(), dtype=self.dtype.np)
def copyin(self, x:np.ndarray): np.copyto(self._as_np(), x.reshape(-1).data)
def toCPU(self) -> np.ndarray:
for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted()


@ -1,7 +1,7 @@
import torch
from typing import ClassVar, Dict, Callable
from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, InterpretedBuffer, Op
from tinygrad.helpers import getenv
from tinygrad.helpers import getenv, dtypes
from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc
torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
@ -14,6 +14,7 @@ torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
class TorchBuffer(InterpretedBuffer):
fxn_for_op : ClassVar = torch_fxn_for_op
to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype])
@staticmethod
def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device))


@ -3,7 +3,7 @@ from __future__ import annotations
import math, functools, itertools
import numpy as np
from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence
from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten
from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten, DType, dtypes
from tinygrad.lazy import Device, LazyBuffer, LazyNumpyArray
from tinygrad.image import image_conv2d_decorator, image_dot_decorator
@ -32,18 +32,20 @@ class Tensor:
__deletable__ = ('_ctx',)
training : ClassVar[bool] = False
no_grad : ClassVar[bool] = False
default_type : DType = dtypes.float32
def __init__(self, data, device=Device.DEFAULT, requires_grad:Optional[bool]=None):
def __init__(self, data, device=Device.DEFAULT, dtype:Optional[DType]=None, requires_grad:Optional[bool]=None):
if isinstance(data, list):
data = np.array(data, dtype=np.float32)
data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
elif isinstance(data, LazyBuffer) and data.device != device:
# TODO: this has to realize, it shouldn't have to
data = data.realize().toCPU()
if isinstance(data, (np.ndarray, LazyNumpyArray)):
data = data if data.shape else data.reshape((1,))
self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device)
self.lazydata = LazyBuffer.fromCPU(data.astype(dtype.np) if dtype is not None else data, device)
elif isinstance(data, LazyBuffer):
assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
self.lazydata = data
else:
raise RuntimeError(f"can't create Tensor from {data}")
@ -64,13 +66,12 @@ class Tensor:
@property
def shape(self) -> Tuple[int, ...]: return self.lazydata.shape
# dtype handling was very broken. it's always float32 now
@property
def dtype(self) -> type: return np.float32
@property
def device(self) -> str: return self.lazydata.device
@property
def dtype(self) -> DType: return self.lazydata.dtype
# ***** data handlers ****
def realize(self) -> Tensor:
@ -132,11 +133,11 @@ class Tensor:
def manual_seed(seed=None): Tensor._rng = np.random.default_rng(seed=seed)
@staticmethod
def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.random(size=shape, dtype=np.float32), shape), **kwargs)
def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.random(size=shape, dtype=dtype), shape, np.float32), **kwargs)
# TODO: replace with a transformation from uniform -> gaussian
@staticmethod
def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.standard_normal(size=shape, dtype=np.float32), shape), **kwargs)
def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.standard_normal(size=shape, dtype=dtype), shape, np.float32), **kwargs)
# ***** rng hlops *****
@ -442,10 +443,16 @@ class Tensor:
def dropout(self, p=0.5) -> Tensor:
if not Tensor.training: return self
_mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
# TODO: why is this going through numpy?
_mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=np.float32)
return self * Tensor(_mask, requires_grad=False, device=self.device) * (1/(1.0 - p))
# ***** cast ops *****
# TODO: this is a hack, but if we add float(0), it will become a float. need real casting support
def float(self) -> Tensor: return self.add(Tensor([0], device=self.device, dtype=dtypes.float32, requires_grad=self.requires_grad))
# register functions to move between devices
for device in [device for device in Device._buffers.keys() if device[0] != "_"]:
for device in Device._buffers:
setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, device))
setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, device))