dtypes nice and clean (#673)
* add dtype class * dtypes * buffers are lazy * dtype is tracked by lazybuffer and GenericShape * fix types in llvm * llvm store * dtype tests * fix tests maybe * fix flop counter * fix CI * CI fix and check format * fix dtype and dtype check * fix custom test * fix test graphpull/676/head
parent
d26345595d
commit
1826ff6b89
|
@ -164,6 +164,7 @@ jobs:
|
|||
- name: Test openpilot model
|
||||
run: |
|
||||
ALLOWED_KERNEL_COUNT=197 FLOAT16=1 VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
|
||||
python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
|
||||
DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
|
||||
VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# this can be constructed from a cl_cache or loaded from a thneed file
|
||||
# this can be constructed from a cl_cache or loaded from a thneed file
|
||||
import time
|
||||
import struct
|
||||
import json
|
||||
|
@ -26,7 +26,7 @@ class Thneed:
|
|||
for a in args[3:]:
|
||||
nodes[a]['out_edges'].append(args[2])
|
||||
nodes[args[2]]['in_edges'].append(a)
|
||||
|
||||
|
||||
# get buffers to save
|
||||
self.buffers_to_save = set()
|
||||
self.outputs = []
|
||||
|
@ -35,7 +35,7 @@ class Thneed:
|
|||
self.buffers_to_save.add(n)
|
||||
if len(nodes[n]['out_edges']) == 0:
|
||||
self.outputs.append(n)
|
||||
|
||||
|
||||
fake_inputs = []
|
||||
for k,n in self.inputs.items():
|
||||
if n in self.buffers_to_save:
|
||||
|
@ -97,7 +97,7 @@ class Thneed:
|
|||
else:
|
||||
# zero out buffers
|
||||
buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size'])
|
||||
|
||||
|
||||
bufs[o['id']] = buf
|
||||
bufs_loaded[o['id']] = 'data' in o
|
||||
# if it's loaded, it's saved
|
||||
|
@ -114,13 +114,13 @@ class Thneed:
|
|||
print("FAILED", k)
|
||||
traceback.print_exc()
|
||||
exit(0)
|
||||
|
||||
|
||||
# load binaries
|
||||
for o in jdat['binaries']:
|
||||
nptr = ptr + o['length']
|
||||
prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], binary=True)
|
||||
ptr = nptr
|
||||
|
||||
|
||||
# populate the cl_cache
|
||||
for i,k in enumerate(jdat['kernels']):
|
||||
kernel = prgs[k['name']]
|
||||
|
@ -166,7 +166,7 @@ class Thneed:
|
|||
jdat['binaries'].append({"name":prg.name, "length":len(binary[0])})
|
||||
binaries.append(binary[0])
|
||||
saved_binaries.add(prg.name)
|
||||
|
||||
|
||||
# get the args from the kernel, some need the data saved
|
||||
targs, args_size = [], []
|
||||
argdtypes = prg.argdtypes if prg.argdtypes is not None else [None]*(len(args)-2)
|
||||
|
@ -196,6 +196,7 @@ class Thneed:
|
|||
cl.enqueue_copy(CL.cl_queue, data, a, is_blocking=True)
|
||||
weights.append(data.tobytes())
|
||||
elif isinstance(a, cl.Image):
|
||||
assert a.format == cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT), "wrong type"
|
||||
needs_load = a in self.buffers_to_save
|
||||
row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64
|
||||
size = row_pitch * a.shape[1]
|
||||
|
@ -244,7 +245,7 @@ class Thneed:
|
|||
"local_work_size": [1 for _ in args[0]] if args[1] is None else args[1],
|
||||
"num_args": len(args)-2,
|
||||
"args": targs,
|
||||
"args_size": args_size
|
||||
"args_size": args_size
|
||||
})
|
||||
|
||||
jdat['outputs'] = [{
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
#!/bin/bash -e
|
||||
echo "********* CPU *********"
|
||||
CPU=1 python3 $@
|
||||
echo "********* GPU *********"
|
||||
GPU=1 python3 $@
|
||||
echo "********* METAL *********"
|
||||
METAL=1 python3 $@
|
||||
echo "********* CLANG *********"
|
||||
CLANG=1 python3 $@
|
||||
echo "********* LLVM *********"
|
||||
LLVM=1 python3 $@
|
||||
echo "********* TORCH *********"
|
||||
TORCH=1 python3 $@
|
|
@ -4,7 +4,7 @@
|
|||
import unittest
|
||||
import numpy as np
|
||||
from typing import Optional, Tuple
|
||||
from tinygrad.helpers import prod
|
||||
from tinygrad.helpers import prod, dtypes
|
||||
|
||||
# *** first, we implement the atan2 op at the lowest level ***
|
||||
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
|
||||
|
@ -16,6 +16,7 @@ from tinygrad.runtime.ops_cpu import CPUBuffer
|
|||
def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer:
|
||||
from tinygrad.runtime.ops_gpu import GPUBuffer
|
||||
assert type(a) == GPUBuffer and type(b) == GPUBuffer, "gpu function requires GPUBuffers"
|
||||
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
|
||||
ret = GPUBuffer(a.shape)
|
||||
ASTRunner("atan2", """
|
||||
__kernel void atan2(global float *c, global float *a, global float *b) {
|
||||
|
@ -40,7 +41,7 @@ class ATan2(Function):
|
|||
assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
|
||||
self.a, self.b = a, b
|
||||
ast = LazyOp(LoadOps.CUSTOM, (a, b), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device])
|
||||
return LazyBuffer(a.device, a.shape, LoadOps, ast)
|
||||
return LazyBuffer(a.device, a.shape, LoadOps, ast, max(a.dtype, b.dtype))
|
||||
def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
|
||||
denom = (self.a.binary_op(BinaryOps.MUL, self.a)).binary_op(BinaryOps.ADD, self.b.binary_op(BinaryOps.MUL, self.b))
|
||||
return grad_output.binary_op(BinaryOps.MUL, self.b.binary_op(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
import unittest
|
||||
import numpy as np
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.lazy import Device
|
||||
from tinygrad.tensor import Tensor, dtypes
|
||||
|
||||
# for GPU, cl_khr_fp16 isn't supported
|
||||
# for LLVM, it segfaults because it can't link to the casting function
|
||||
@unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["GPU", "LLVM"], "float16 broken in some CI backends")
|
||||
class TestDtype(unittest.TestCase):
|
||||
def test_half_to_np(self):
|
||||
a = Tensor([1,2,3,4], dtype=dtypes.float16)
|
||||
print(a)
|
||||
na = a.numpy()
|
||||
print(na, na.dtype, a.lazydata.realized)
|
||||
assert na.dtype == np.float16
|
||||
|
||||
def test_half_add(self):
|
||||
a = Tensor([1,2,3,4], dtype=dtypes.float16)
|
||||
b = Tensor([1,2,3,4], dtype=dtypes.float16)
|
||||
c = a+b
|
||||
print(c.numpy())
|
||||
assert c.dtype == dtypes.float16
|
||||
|
||||
def test_upcast_float(self):
|
||||
# NOTE: there's no downcasting support
|
||||
a = Tensor([1,2,3,4], dtype=dtypes.float16).float()
|
||||
print(a)
|
||||
na = a.numpy()
|
||||
print(na, na.dtype)
|
||||
assert na.dtype == np.float32
|
||||
|
||||
def test_half_add_upcast(self):
|
||||
a = Tensor([1,2,3,4], dtype=dtypes.float16)
|
||||
b = Tensor([1,2,3,4], dtype=dtypes.float32)
|
||||
c = a+b
|
||||
print(c.numpy())
|
||||
assert c.dtype == dtypes.float32
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -14,10 +14,10 @@ class TestGraph(unittest.TestCase):
|
|||
assert nx.is_isomorphic(G, RG, node_match=lambda x,y: x["label"] == y["label"], edge_match=lambda x,y: x["label"] == y["label"] if "label" in y else True)
|
||||
|
||||
def test_add_graph(self):
|
||||
a = CPUBuffer.fromCPU(np.ones((4,4)))
|
||||
b = CPUBuffer.fromCPU(np.ones((4,4)))
|
||||
a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
|
||||
b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
|
||||
ast = LazyOp(BinaryOps.ADD, (a,b))
|
||||
ret = CPUBuffer(np.ones((4,4)))
|
||||
ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
|
||||
|
||||
RG = nx.DiGraph()
|
||||
RG.add_node(0, label="(4, 4)")
|
||||
|
@ -30,12 +30,12 @@ class TestGraph(unittest.TestCase):
|
|||
self.helper_compare_graph(RG)
|
||||
|
||||
def test_add_sum_graph(self):
|
||||
a = CPUBuffer.fromCPU(np.ones((4,4)))
|
||||
b = CPUBuffer.fromCPU(np.ones((1,1)))
|
||||
a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
|
||||
b = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32))
|
||||
op0 = LazyOp(MovementOps.RESHAPE, (b,), (4, 4))
|
||||
op1 = LazyOp(BinaryOps.ADD, (a,op0))
|
||||
ast = LazyOp(ReduceOps.SUM, (op1,), (1,1))
|
||||
ret = CPUBuffer(np.ones((1,1)))
|
||||
ret = CPUBuffer(np.ones((1,1), dtype=np.float32))
|
||||
|
||||
RG = nx.DiGraph()
|
||||
RG.add_node(0, label="(4, 4)")
|
||||
|
@ -48,14 +48,14 @@ class TestGraph(unittest.TestCase):
|
|||
self.helper_compare_graph(RG)
|
||||
|
||||
def test_add_graph_prune(self):
|
||||
a = CPUBuffer.fromCPU(np.ones((1,1)))
|
||||
a = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32))
|
||||
ast = LazyOp(MovementOps.RESHAPE, (a,), (4, 4))
|
||||
ret = CPUBuffer(np.ones((4,4)))
|
||||
ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
|
||||
log_op(ret, ast, show_graph=True)
|
||||
|
||||
b = CPUBuffer.fromCPU(np.ones((4,4)))
|
||||
b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
|
||||
ast = LazyOp(BinaryOps.ADD, (ret,b))
|
||||
ret = CPUBuffer(np.ones((4,4)))
|
||||
ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
|
||||
log_op(ret, ast, show_graph=True)
|
||||
prune_graph()
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import itertools
|
||||
from enum import Enum, auto
|
||||
from typing import List, Tuple
|
||||
from tinygrad.helpers import prod, dedup, all_same, colored
|
||||
from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
|
||||
from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers
|
||||
from tinygrad.shape import ShapeTracker, View, strides_for_shape
|
||||
|
||||
|
@ -26,7 +26,7 @@ class Token:
|
|||
if len(self.axis) == 0: return [0]
|
||||
acc_strides = [x*(1-self.axis[::-1][i][2]) for i,x in enumerate(strides_for_shape(tuple(1 if r else s for s,_,r in self.axis[::-1])))]
|
||||
return [sum(t) for t in itertools.product(*[[y*acc_strides[i] for y in range(x[0])] for i,x in enumerate(self.axis[::-1])])]
|
||||
def decltype(self): return ('float' if self.typ == Types.FLOAT else 'float4') + ('*' if self.ptr else str())
|
||||
def decltype(self, dtype=dtypes.float32): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str())
|
||||
def __repr__(self): return f"<{self.typ}{'*' if self.ptr else str()} {self.tok}{f'[{self.axis}]' if len(self.axis) else str()}>"
|
||||
|
||||
# ast kernel can contain one ReduceOp with arbitrary Binary/Unary ops
|
||||
|
@ -55,8 +55,9 @@ class ASTKernel:
|
|||
break
|
||||
|
||||
# create the buffer we are returning (as the same type as the input buffers) and add it as the first buffer
|
||||
self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True)
|
||||
self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret)] if output_shape else [self.ret]) + self.bufs
|
||||
self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True, dtype=self.info.dtype)
|
||||
assert self.ret.dtype == self.info.dtype, f"return dtype {self.ret.dtype} != {self.info.dtype}"
|
||||
self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret, dtype=self.info.dtype)] if output_shape else [self.ret]) + self.bufs
|
||||
|
||||
# key for lookup in cache (can change, str might not be right)
|
||||
# bufs are needed because kernels like f(x) = x + x and f(x, y) = x + y have the same str(ast), but are different kernels.
|
||||
|
@ -121,7 +122,7 @@ class ASTKernel:
|
|||
if print_shapetrackers:
|
||||
for st in self.sts: print(st)
|
||||
for i in range(len(self.sts)):
|
||||
print(prefix, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
|
||||
print(prefix, self.bufs[i].dtype, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
|
||||
|
||||
@property
|
||||
def shape_len(self) -> int: return len(self.sts[0].shape)
|
||||
|
|
|
@ -25,6 +25,7 @@ class GPULanguage(NamedTuple):
|
|||
lid : List[str] = []
|
||||
extra_args : List[str] = []
|
||||
float4 : Optional[str] = None
|
||||
half_prekernel : Optional[str] = None
|
||||
|
||||
def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]:
|
||||
idy = (idxy//(4*base_shape[1]))
|
||||
|
@ -266,6 +267,8 @@ class GPUCodegen(ASTKernel):
|
|||
self.prekernel : Set[str] = set()
|
||||
self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs if buf is not None) else []
|
||||
|
||||
if self.lang.half_prekernel: self.prekernel.add(self.lang.half_prekernel+"\n")
|
||||
|
||||
if len(self.lang.gid) == 0:
|
||||
self.kernel += [f"for (int idx{i} = 0; idx{i} < {self.output_shape[i]}; idx{i}++) {{\n" for i in range(0, len(self.output_shape))]
|
||||
else:
|
||||
|
@ -320,7 +323,7 @@ class GPUCodegen(ASTKernel):
|
|||
self.kernel.append("\n}")
|
||||
|
||||
# concat kernel into prg
|
||||
buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype()+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None]
|
||||
buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype(self.bufs[i].dtype)+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None]
|
||||
prg = ' '.join(list(self.prekernel) + [f"{self.lang.kernel_prefix} void KERNEL_NAME_PLACEHOLDER(",] +
|
||||
[', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + self.lang.extra_args)] +
|
||||
[") {\n"] + self.kernel)
|
||||
|
|
|
@ -3,7 +3,7 @@ from typing import ClassVar, List
|
|||
from llvmlite import ir # type: ignore
|
||||
from tinygrad.codegen.ast import ASTKernel
|
||||
from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, ASTRunner
|
||||
from tinygrad.helpers import DEBUG, prod
|
||||
from tinygrad.helpers import DEBUG, prod, dtypes
|
||||
|
||||
from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, GeNode, LtNode, SumNode, AndNode
|
||||
def int_const(x): return ir.Constant(ir.IntType(64), x)
|
||||
|
@ -105,7 +105,8 @@ class LLVMCodegen(ASTKernel):
|
|||
|
||||
# create llvm function
|
||||
module = ir.Module(name=__file__)
|
||||
func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(self.bufs))), name='exec')
|
||||
func_dtypes = [{dtypes.float16:ir.HalfType(), dtypes.float32:ir.FloatType()}[buf.dtype] for buf in self.bufs]
|
||||
func = ir.Function(module, ir.FunctionType(ir.VoidType(), [x.as_pointer() for x in func_dtypes]), name='exec')
|
||||
|
||||
# force llvmlite to allow us to add function attribute then add the attribute
|
||||
func.attributes._known = func.attributes._known.union(frozenset(['"no-nans-fp-math"="true"']))
|
||||
|
@ -143,9 +144,11 @@ class LLVMCodegen(ASTKernel):
|
|||
# this always does the load, so we have it load *0 if the arg won't be used
|
||||
# TODO: would control flow be faster?
|
||||
aug_idx = builder.select(valid, idx, int_const(0))
|
||||
element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(ir.FloatType(), 0))
|
||||
element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(func_dtypes[buf_index], 0))
|
||||
else:
|
||||
element = builder.load(builder.gep(func.args[buf_index], [idx], inbounds=True))
|
||||
# upcast
|
||||
if func_dtypes[buf_index] != ir.FloatType(): element = builder.fpext(element, ir.FloatType())
|
||||
m = element if kernel_output_dim == 1 else builder.insert_element(m, element, int_const(i))
|
||||
return m
|
||||
if isinstance(x.op, ReduceOps):
|
||||
|
@ -194,6 +197,7 @@ class LLVMCodegen(ASTKernel):
|
|||
builder = loop_exit[store_loop]
|
||||
for i, idx in enumerate(get_idxs(builder, idx_level[0][store_loop], 0)):
|
||||
element = result if kernel_output_dim == 1 else builder.extract_element(result, int_const(i))
|
||||
if func_dtypes[0] != ir.FloatType(): element = builder.fptrunc(element, func_dtypes[0])
|
||||
builder.store(element, builder.gep(func.args[0], [idx], inbounds=True))
|
||||
|
||||
# add the looping
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import os, math, functools
|
||||
from typing import Tuple, Union, List
|
||||
import numpy as np
|
||||
from typing import Tuple, Union, List, NamedTuple
|
||||
|
||||
def dedup(x): return list(dict.fromkeys(x)) # retains list order
|
||||
def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x)
|
||||
|
@ -16,3 +17,17 @@ def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"
|
|||
def getenv(key, default=0): return type(default)(os.getenv(key, default))
|
||||
|
||||
DEBUG, IMAGE = getenv("DEBUG", 0), getenv("IMAGE", 0)
|
||||
|
||||
# **** tinygrad now supports dtypes! *****
|
||||
|
||||
class DType(NamedTuple):
|
||||
itemsize : int
|
||||
name : str
|
||||
np : type # TODO: someday this will be removed with the "remove numpy" project
|
||||
def __repr__(self): return f"dtypes.{self.name}"
|
||||
|
||||
class dtypes:
|
||||
float16 = half = DType(2, "half", np.float16)
|
||||
float32 = float = DType(4, "float", np.float32)
|
||||
@staticmethod
|
||||
def from_np(x:np.ndarray) -> DType: return {np.dtype(np.float16): dtypes.float16, np.dtype(np.float32): dtypes.float32}[np.dtype(x.dtype)]
|
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type
|
||||
import os, sys, weakref, importlib, inspect, functools
|
||||
from weakref import WeakValueDictionary
|
||||
from tinygrad.helpers import prod, getenv
|
||||
from tinygrad.helpers import prod, getenv, DType, dtypes
|
||||
from tinygrad.shape import ShapeTracker, get_contraction
|
||||
from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
|
||||
from tinygrad.graph import log_op
|
||||
|
@ -13,21 +13,12 @@ sys.setrecursionlimit(10000)
|
|||
OPT = getenv("OPT", 2)
|
||||
LAZY = getenv("LAZY", 1)
|
||||
|
||||
def get_buffer(name, base='tinygrad.runtime'):
|
||||
try:
|
||||
return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0]
|
||||
except Exception as e: # NOTE: this can't be put on one line due to mypy issue
|
||||
print(name, "backend not available", e, file=sys.stderr)
|
||||
|
||||
class _Device:
|
||||
def __init__(self) -> None:
|
||||
# TODO: make this dynamic to when you try to access the _buffers
|
||||
self._buffers : Dict[str, Type[DeviceBuffer]] = {x.upper():get_buffer(x) for x in
|
||||
[os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")] if x is not None}
|
||||
self.DEFAULT : str = "CPU"
|
||||
for name in self._buffers:
|
||||
if getenv(name) == 1: self.DEFAULT = name # note: DEFAULT can be a Device that can't be imported. better than silent use of a different device
|
||||
if self._buffers[name] is not None: self.__setattr__(name, name)
|
||||
self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]}
|
||||
self.DEFAULT : str = functools.reduce(lambda val, ele: val if getenv(val) == 1 else ele, self._buffers, "CPU")
|
||||
@functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
|
||||
def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0]
|
||||
Device = _Device()
|
||||
|
||||
# TODO: movement ops that only change shape are really nops. treat them as such
|
||||
|
@ -81,9 +72,9 @@ def replace_with_movement_op(y:Union[LazyOp, LazyBuffer], op:MovementOps, arg:Tu
|
|||
return elementwise_op(y.op, *[replace_with_movement_op(z, op, arg) for z in y.src]) # type: ignore
|
||||
|
||||
class LazyNumpyArray:
|
||||
def __init__(self, fxn, shape): self.fxn, self.shape = fxn, shape
|
||||
def __call__(self): return self.fxn(self.shape)
|
||||
def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape)
|
||||
def __init__(self, fxn, shape, dtype): self.fxn, self.shape, self.dtype = fxn, shape, dtype
|
||||
def __call__(self): return self.fxn(self.shape, self.dtype)
|
||||
def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype)
|
||||
def copy(self): return self
|
||||
def astype(self, typ): return self
|
||||
|
||||
|
@ -91,32 +82,32 @@ def support_weakref(x): return x
|
|||
@support_weakref # needed for mypyc, this prevents LazyBuffer from becoming a native class
|
||||
class LazyBuffer:
|
||||
__deletable__ = ('op',)
|
||||
lazycache : ClassVar[WeakValueDictionary[Tuple[str, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary()
|
||||
def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp):
|
||||
lazycache : ClassVar[WeakValueDictionary[Tuple[str, DType, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary()
|
||||
def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType):
|
||||
# fromcpu aren't cached
|
||||
if optype == LoadOps and op.op == LoadOps.FROMCPU:
|
||||
return super().__new__(cls)
|
||||
wop = (device, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker
|
||||
wop = (device, dtype, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker
|
||||
# NOTE: we need "ret" to prevent the new buffer from being immediately deleted
|
||||
if wop not in LazyBuffer.lazycache: LazyBuffer.lazycache[wop] = ret = super().__new__(cls)
|
||||
else: ret = LazyBuffer.lazycache[wop]
|
||||
return ret
|
||||
|
||||
def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp):
|
||||
def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType):
|
||||
if hasattr(self, 'device'):
|
||||
return # cache hit, we return and don't reinit
|
||||
self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
|
||||
self.shape, self.optype, self.op = self.st.shape, optype, op
|
||||
self.shape, self.optype, self.op, self.dtype = self.st.shape, optype, op, dtype
|
||||
self.realized : Optional[DeviceBuffer] = None
|
||||
self.output_buffer : Optional[DeviceBuffer] = None
|
||||
self.device, self.dbuffer = device, Device._buffers[device]
|
||||
self.device, self.dbuffer = device, Device[device]
|
||||
# TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
|
||||
self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
|
||||
# NOTE: op should be read only after construction of LazyBuffer
|
||||
for x in get_buffers(op): x.children.add(self)
|
||||
if not LAZY: self.realize()
|
||||
|
||||
def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>"
|
||||
def __repr__(self): return f"<LB {self.shape} {self.dtype} op:{self.op.op if self.realized is None else 'realized'}>"
|
||||
|
||||
# this produces a device buffer
|
||||
def realize(self:LazyBuffer, required_device=None) -> DeviceBuffer:
|
||||
|
@ -124,7 +115,7 @@ class LazyBuffer:
|
|||
if self.realized is None:
|
||||
# get real ops first
|
||||
if self.op.op == LoadOps.FROMCPU:
|
||||
self.realized = Device._buffers[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg)
|
||||
self.realized = Device[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg)
|
||||
ast = LazyOp(self.op.op, tuple())
|
||||
elif self.op.op == LoadOps.CONTIGUOUS:
|
||||
real_src = self.op.src[0].realize(self.device)
|
||||
|
@ -160,12 +151,13 @@ class LazyBuffer:
|
|||
log_op(self.realized, ast)
|
||||
|
||||
assert self.realized.shape == self.shape, f"shape mismatch on realize got {self.realized.shape} expected {self.shape}"
|
||||
assert isinstance(self.realized, Device._buffers[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
|
||||
assert isinstance(self.realized, Device[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
|
||||
assert self.realized.dtype == self.dtype, f"dtype mismatch on realize got {self.realized.dtype} expected {self.dtype}"
|
||||
return self.realized
|
||||
|
||||
# NOTE: we have to make a copy of the numpy array here in case the user changes it. expose this?
|
||||
@staticmethod
|
||||
def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()))
|
||||
def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()), dtypes.from_np(x))
|
||||
def toCPU(self):
|
||||
ret = self.realize().toCPU()
|
||||
log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
|
||||
|
@ -173,11 +165,11 @@ class LazyBuffer:
|
|||
|
||||
def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
|
||||
def binary_op(self:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, self, y)
|
||||
def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)))
|
||||
def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)), self.dtype)
|
||||
|
||||
def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
|
||||
if self.shape == tuple(new_shape): return self
|
||||
return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape))
|
||||
return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape), self.dtype)
|
||||
|
||||
def movement_op(self:LazyBuffer, op:MovementOps, arg:Tuple[Any, ...]) -> LazyBuffer:
|
||||
# very instant nop
|
||||
|
@ -228,7 +220,7 @@ class LazyBuffer:
|
|||
return replace_with_movement_op(self.op, op, arg)
|
||||
|
||||
# create the buffer
|
||||
ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg))
|
||||
ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg), self.dtype)
|
||||
|
||||
# if the ShapeTracker becomes contiguous, replace the whole thing with a reshape (or nothing if shapes match)
|
||||
# NOTE: if ret is in the cache, it can already be realized
|
||||
|
@ -241,7 +233,7 @@ class LazyBuffer:
|
|||
return ret
|
||||
|
||||
def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffer:
|
||||
out_device, out_shape = srcs[0].device, srcs[0].shape
|
||||
out_device, out_shape, out_dtype = srcs[0].device, srcs[0].shape, max(x.dtype for x in srcs)
|
||||
|
||||
# push all contiguous to the end of BinaryOps. kernels 198 -> 196
|
||||
if PUSH_CONTIGUOUS and any(x.realized is None and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1 for x in srcs):
|
||||
|
@ -258,4 +250,4 @@ def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffe
|
|||
# remove the buffers from any (childless) BinaryOps that feed into this
|
||||
srcs = tuple(x.op if x.optype == BinaryOps and len(x.children) == 0 and x.realized is None else x for x in srcs) # type: ignore
|
||||
|
||||
return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs))
|
||||
return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs), out_dtype)
|
||||
|
|
|
@ -3,7 +3,7 @@ import functools, itertools, operator, random
|
|||
import numpy as np
|
||||
from enum import Enum, auto
|
||||
from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict, TypeVar, Set
|
||||
from tinygrad.helpers import prod, DEBUG, getenv
|
||||
from tinygrad.helpers import prod, DEBUG, getenv, DType, dtypes
|
||||
from tinygrad.shape import ShapeTracker
|
||||
|
||||
# these are the llops your accelerator must implement, along with toCpu
|
||||
|
@ -39,17 +39,19 @@ class Copyable:
|
|||
def toCPU(self:Copyable) -> np.ndarray: raise NotImplementedError("must be implemented")
|
||||
|
||||
class RawBuffer(Copyable): # pylint: disable=abstract-method
|
||||
def __init__(self, size:int):
|
||||
def __init__(self, size:int, dtype:DType):
|
||||
self.size : int = size
|
||||
GlobalCounters.mem_used += self.size
|
||||
def __del__(self): GlobalCounters.mem_used -= self.size
|
||||
self.dtype : DType = dtype
|
||||
self._memsz : int = size*dtype.itemsize
|
||||
GlobalCounters.mem_used += self._memsz
|
||||
def __del__(self): GlobalCounters.mem_used -= self.size*self._memsz
|
||||
|
||||
class RawBufferCopyIn(RawBuffer):
|
||||
def copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")
|
||||
|
||||
@classmethod
|
||||
def fromCPU(cls, x:np.ndarray):
|
||||
ret = cls(4*prod(x.shape))
|
||||
ret = cls(prod(x.shape), dtypes.from_np(x))
|
||||
ret.copyin(x)
|
||||
return ret
|
||||
|
||||
|
@ -57,7 +59,7 @@ class RawBufferCopyInOut(RawBufferCopyIn):
|
|||
def copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")
|
||||
|
||||
def toCPU(self) -> np.ndarray:
|
||||
x = np.empty((self.size//4), dtype=np.float32)
|
||||
x: np.ndarray = np.empty(self.size, dtype=self.dtype.np)
|
||||
self.copyout(x)
|
||||
return x
|
||||
|
||||
|
@ -65,26 +67,27 @@ class RawBufferCopyInOut(RawBufferCopyIn):
|
|||
class DeviceBuffer(Copyable):
|
||||
_buf: Any # underlying buffer
|
||||
shape: Tuple[int, ...]
|
||||
dtype: DType
|
||||
@classmethod
|
||||
def exec_ast(cls, ast:LazyOp, output_buffer=None): raise NotImplementedError("must be implemented")
|
||||
|
||||
# this is a quick "buffer" class for flop tracking and getting the output shape
|
||||
class GenericShape:
|
||||
def __init__(self, shape:Tuple[int, ...], flops:int=0): self.shape, self.flops = shape, flops
|
||||
def __init__(self, shape:Tuple[int, ...], dtype:DType=dtypes.float32, flops:int=0): self.shape, self.dtype, self.flops = shape, dtype, flops
|
||||
def consume_flops(self):
|
||||
self.flops, ret = 0, self.flops
|
||||
return ret
|
||||
shape_fxn_for_op : Dict[Op, Callable] = {
|
||||
**{op:lambda self: GenericShape(self.shape, self.consume_flops() + prod(self.shape)) for op in UnaryOps},
|
||||
**{op:lambda self,y: GenericShape(self.shape, self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
|
||||
**{op:lambda self,new_shape: GenericShape(new_shape, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
|
||||
**{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.consume_flops()), op) for op in MovementOps}}
|
||||
**{op:lambda self: GenericShape(self.shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in UnaryOps},
|
||||
**{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
|
||||
**{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
|
||||
**{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}}
|
||||
def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
|
||||
|
||||
# used in CPUBuffer and TorchBuffer
|
||||
class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
||||
fxn_for_op : ClassVar = shape_fxn_for_op
|
||||
# TODO: use generic types here to remove __init__ in specialized classes
|
||||
def __init__(self, lbuf:Any): self._buf, self.shape = lbuf, tuple(lbuf.shape)
|
||||
def __init__(self, lbuf:Any): self._buf, self.shape, self.dtype = lbuf, tuple(lbuf.shape), self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
|
||||
def contiguous(self): return type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,)))
|
||||
def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self._buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self._buf, op.name.lower())(arg))
|
||||
@classmethod
|
||||
|
@ -101,12 +104,11 @@ class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
|||
else: ret = cls(cls.fxn_for_op[ast.op](*([x._buf for x in srcs] + ([ast.arg] if ast.arg else []))))
|
||||
context[ast] = ret
|
||||
if output_buffer is not None:
|
||||
assert output_buffer.shape == ret.shape
|
||||
assert output_buffer.shape == ret.shape, output_buffer.dtype == ret.dtype
|
||||
output_buffer._buf = ret._buf
|
||||
return output_buffer
|
||||
else:
|
||||
return ret
|
||||
def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape)) for x in get_buffers(ast)}, ast))._buf
|
||||
|
||||
class ASTRunner:
|
||||
def __init__(self, name, prg, bufs_to_delete:Optional[Set[int]]=None, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0):
|
||||
|
@ -139,7 +141,7 @@ class ASTRunner:
|
|||
def optimize_local_size(self, rawbufs:List[RawBuffer]) -> List[int]:
|
||||
assert self.global_size is not None, "needs a global size to optimize local size"
|
||||
if any(x == rawbufs[0] for x in rawbufs[1:]): # this is an assignment, replace the output buffer
|
||||
output_replacement = type(rawbufs[0])(rawbufs[0].size)
|
||||
output_replacement = type(rawbufs[0])(rawbufs[0].size, rawbufs[0].dtype)
|
||||
rawbufs = [output_replacement if x == rawbufs[0] else x for x in rawbufs]
|
||||
MAX_WORKGROUP = self.clprg.max_work_group_size() if hasattr(self.clprg, 'max_work_group_size') else 1024
|
||||
local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in self.global_size]
|
||||
|
@ -149,32 +151,36 @@ class ASTRunner:
|
|||
# assumes you are using ShapeTracker
|
||||
# used in GPUBuffer and LLVMBuffer
|
||||
class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
||||
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False):
|
||||
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32):
|
||||
self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
|
||||
self.shape = self.st.shape
|
||||
self.dtype = dtype
|
||||
assert hostbuf is None or hostbuf.dtype == dtype, f"hostbuf dtype {hostbuf.dtype} != {dtype}"
|
||||
self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape
|
||||
self._buf = hostbuf._buf if hostbuf is not None else None
|
||||
self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing
|
||||
if (self._backing is not None and self._backing.shape != (1,)) or force_create: self.raw()
|
||||
|
||||
# TODO: not GPUBuffer, get name of class
|
||||
def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))")
|
||||
# TODO: needs dtype
|
||||
def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__})))" if self._backing is not None else ", force_create=True))")
|
||||
|
||||
raw_buffer_type : Type[RawBuffer]
|
||||
@classmethod
|
||||
def create_raw_buffer(cls, shape, backing) -> RawBuffer:
|
||||
def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
|
||||
assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape"
|
||||
assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching"
|
||||
return cls.raw_buffer_type(4*prod(shape)) if backing is None else cls.raw_buffer_type.fromCPU(backing)
|
||||
if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}")
|
||||
return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing)
|
||||
def raw(self) -> RawBuffer:
|
||||
if self._buf is None:
|
||||
if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}")
|
||||
self._buf = self.create_raw_buffer(self._base_shape, self._backing)
|
||||
self._buf = self.create_raw_buffer(self._base_shape, self._backing, self.dtype)
|
||||
self._backing = None
|
||||
return self._buf
|
||||
|
||||
@classmethod
|
||||
def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
|
||||
def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.ravel(), dtype=dtypes.from_np(x))
|
||||
def toCPU(self) -> np.ndarray:
|
||||
assert GlobalCounters.cache is None, f"can't copy out {self} while caching"
|
||||
if DEBUG >= 3: print(f"**** copy out {self.shape}")
|
||||
|
@ -192,7 +198,7 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
|||
prg = cls.method_cache[k.key]
|
||||
else:
|
||||
prg = k.codegen().build(cls.runtime_type)
|
||||
if getenv("PRINT_AST", "") == prg.name:
|
||||
if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1":
|
||||
k.print()
|
||||
print(prg.prg)
|
||||
prg.exec(k.bufs)
|
||||
|
|
|
@ -2,21 +2,22 @@ import os, time, ctypes, hashlib, subprocess, platform
|
|||
import numpy as np
|
||||
from collections import defaultdict
|
||||
from typing import Final, Dict
|
||||
from tinygrad.helpers import dtypes, DType
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferCopyIn
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
|
||||
class RawMallocBuffer(RawBufferCopyIn):
|
||||
def __init__(self, size):
|
||||
super().__init__(size)
|
||||
self._buf = (ctypes.c_float * (size//4))()
|
||||
def __init__(self, size, dtype : DType):
|
||||
super().__init__(size, dtype)
|
||||
self._buf = ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16}[dtype] * size)()
|
||||
def _buffer(self): return self._buf
|
||||
def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*4)
|
||||
def toCPU(self): return np.ctypeslib.as_array(self._buf)
|
||||
def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*np.dtype(x.dtype).itemsize)
|
||||
def toCPU(self): return np.frombuffer(self._buf, dtype=self.dtype.np)
|
||||
|
||||
class ClangProgram:
|
||||
kernel_cnt : Final[Dict[str, int]] = defaultdict(int)
|
||||
def __init__(self, name:str, prg:str):
|
||||
prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n" + prg
|
||||
prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n" + prg
|
||||
# TODO: is there a way to not write this to disk?
|
||||
fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if platform.system() == 'Darwin' else 'so'}"
|
||||
if not os.path.exists(fn):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import numpy as np
|
||||
import operator
|
||||
from typing import ClassVar, Callable, Dict, Tuple
|
||||
from tinygrad.helpers import dtypes
|
||||
from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, InterpretedBuffer, Op
|
||||
|
||||
def shape_to_axis(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Tuple[int, ...]:
|
||||
|
@ -35,6 +36,7 @@ numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
|
|||
|
||||
class CPUBuffer(InterpretedBuffer):
|
||||
fxn_for_op : ClassVar = numpy_fxn_for_op
|
||||
to_tinygrad_dtype = staticmethod(dtypes.from_np)
|
||||
|
||||
@staticmethod
|
||||
def fromCPU(x): return CPUBuffer(x)
|
||||
|
|
|
@ -8,9 +8,9 @@ from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut
|
|||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
|
||||
class RawCUDABuffer(RawBufferCopyInOut):
|
||||
def __init__(self, size):
|
||||
super().__init__(size)
|
||||
self._cl = cuda.mem_alloc(size)
|
||||
def __init__(self, size, dtype):
|
||||
super().__init__(size, dtype)
|
||||
self._cl = cuda.mem_alloc(self._memsz)
|
||||
def copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, x, stream)
|
||||
def copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._cl)
|
||||
|
||||
|
@ -42,6 +42,7 @@ class CUDAProgram:
|
|||
class CUDACodegen(GPUCodegen):
|
||||
lang = GPULanguage(
|
||||
kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
|
||||
half_prekernel = "#include <cuda_fp16.h>",
|
||||
gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)],
|
||||
lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])
|
||||
|
||||
|
|
|
@ -2,15 +2,13 @@ from __future__ import annotations
|
|||
import platform, functools
|
||||
import numpy as np
|
||||
import pyopencl as cl # type: ignore
|
||||
from typing import Dict, Optional, List, ClassVar, Final
|
||||
from collections import defaultdict
|
||||
from tinygrad.helpers import IMAGE, DEBUG, getenv
|
||||
from typing import Optional, List, Final
|
||||
from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
|
||||
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
|
||||
OSX = platform.system() == "Darwin"
|
||||
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
||||
CLCACHE = getenv("CLCACHE", 1)
|
||||
FLOAT16 = getenv("FLOAT16", 0)
|
||||
|
||||
class _CL:
|
||||
|
@ -27,33 +25,18 @@ class _CL:
|
|||
CL = _CL()
|
||||
|
||||
class CLBuffer(RawBufferCopyInOut):
|
||||
# TODO: this can be in RawBuffer generically
|
||||
BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)
|
||||
|
||||
def __init__(self, size): # pylint: disable=super-init-not-called
|
||||
self.size = size
|
||||
if len(CLBuffer.BUFFER_CACHE[size]) > 0:
|
||||
self._cl = CLBuffer.BUFFER_CACHE[size].pop()
|
||||
else:
|
||||
# TODO: on GPU OOM, clear the cache
|
||||
self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)
|
||||
GlobalCounters.mem_used += self._cl.size
|
||||
|
||||
def __del__(self):
|
||||
if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)
|
||||
else: GlobalCounters.mem_used -= self._cl.size
|
||||
|
||||
def __init__(self, size, dtype):
|
||||
super().__init__(size, dtype)
|
||||
self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, self._memsz)
|
||||
def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)
|
||||
def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)
|
||||
|
||||
class CLImage(RawBuffer): # pylint: disable=abstract-method
|
||||
fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)
|
||||
IMAGE : Final = True
|
||||
|
||||
def __init__(self, shape): # pylint: disable=super-init-not-called
|
||||
self.size, self._cl = shape, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))
|
||||
def __init__(self, shape, dtype=dtypes.float16 if getenv("FLOAT16") else dtypes.float32): # pylint: disable=super-init-not-called
|
||||
fmt = cl.ImageFormat(cl.channel_order.RGBA, {dtypes.float16: cl.channel_type.HALF_FLOAT, dtypes.float32: cl.channel_type.FLOAT}[dtype])
|
||||
self.size, self.dtype, self._cl = shape, dtype, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=(shape[1], shape[0]))
|
||||
GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
|
||||
|
||||
def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
|
@ -89,6 +72,7 @@ class CLProgram:
|
|||
class CLCodegen(GPUCodegen):
|
||||
lang = GPULanguage(
|
||||
kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",
|
||||
half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
|
||||
barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
|
||||
gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])
|
||||
|
||||
|
@ -96,8 +80,8 @@ class GPUBuffer(CompiledBuffer):
|
|||
raw_buffer_type = CLBuffer
|
||||
# override this method for image
|
||||
@classmethod
|
||||
def create_raw_buffer(cls, shape, backing) -> RawBuffer:
|
||||
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)
|
||||
else: return super().create_raw_buffer(shape, backing)
|
||||
def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
|
||||
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
|
||||
else: return super().create_raw_buffer(shape, backing, dtype)
|
||||
codegen_type = CLCodegen
|
||||
runtime_type = CLProgram
|
||||
|
|
|
@ -57,7 +57,7 @@ class LLVMProgram:
|
|||
def __del__(self): LLVM.engine.remove_module(self.mod)
|
||||
|
||||
def __call__(self, unused_global_size, unused_local_size, *bufs, wait=False):
|
||||
cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for _ in bufs])(self.fxn)
|
||||
cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_void_p for _ in bufs])(self.fxn)
|
||||
if wait: st = time.monotonic()
|
||||
cfunc(*[x._buf for x in bufs])
|
||||
if wait: return time.monotonic()-st
|
||||
|
|
|
@ -4,7 +4,7 @@ import Metal, Cocoa, libdispatch # type: ignore
|
|||
import numpy as np
|
||||
from typing import List, Any
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
from tinygrad.helpers import prod, getenv, DEBUG
|
||||
from tinygrad.helpers import prod, getenv, DEBUG, DType
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferCopyIn
|
||||
|
||||
METAL_XCODE = getenv("METAL_XCODE")
|
||||
|
@ -20,14 +20,14 @@ class _METAL:
|
|||
METAL = _METAL()
|
||||
|
||||
class RawMetalBuffer(RawBufferCopyIn):
|
||||
def __init__(self, size):
|
||||
super().__init__(size)
|
||||
self._cl = METAL.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
|
||||
def __init__(self, size:int, dtype:DType):
|
||||
super().__init__(size, dtype)
|
||||
self._cl = METAL.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared)
|
||||
def __del__(self):
|
||||
self._cl.release()
|
||||
super().__del__()
|
||||
def _buffer(self): return self._cl.contents().as_buffer(self._cl.length())
|
||||
def _as_np(self, dtype=np.float32): return np.frombuffer(self._buffer(), dtype=dtype)
|
||||
def _as_np(self): return np.frombuffer(self._buffer(), dtype=self.dtype.np)
|
||||
def copyin(self, x:np.ndarray): np.copyto(self._as_np(), x.reshape(-1).data)
|
||||
def toCPU(self) -> np.ndarray:
|
||||
for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import torch
|
||||
from typing import ClassVar, Dict, Callable
|
||||
from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, InterpretedBuffer, Op
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.helpers import getenv, dtypes
|
||||
from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc
|
||||
|
||||
torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
|
||||
|
@ -14,6 +14,7 @@ torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
|
|||
device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
|
||||
class TorchBuffer(InterpretedBuffer):
|
||||
fxn_for_op : ClassVar = torch_fxn_for_op
|
||||
to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype])
|
||||
|
||||
@staticmethod
|
||||
def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device))
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
|||
import math, functools, itertools
|
||||
import numpy as np
|
||||
from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence
|
||||
from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten
|
||||
from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten, DType, dtypes
|
||||
from tinygrad.lazy import Device, LazyBuffer, LazyNumpyArray
|
||||
from tinygrad.image import image_conv2d_decorator, image_dot_decorator
|
||||
|
||||
|
@ -32,18 +32,20 @@ class Tensor:
|
|||
__deletable__ = ('_ctx',)
|
||||
training : ClassVar[bool] = False
|
||||
no_grad : ClassVar[bool] = False
|
||||
default_type : DType = dtypes.float32
|
||||
|
||||
def __init__(self, data, device=Device.DEFAULT, requires_grad:Optional[bool]=None):
|
||||
def __init__(self, data, device=Device.DEFAULT, dtype:Optional[DType]=None, requires_grad:Optional[bool]=None):
|
||||
if isinstance(data, list):
|
||||
data = np.array(data, dtype=np.float32)
|
||||
data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
|
||||
elif isinstance(data, LazyBuffer) and data.device != device:
|
||||
# TODO: this has to realize, it shouldn't have to
|
||||
data = data.realize().toCPU()
|
||||
|
||||
if isinstance(data, (np.ndarray, LazyNumpyArray)):
|
||||
data = data if data.shape else data.reshape((1,))
|
||||
self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device)
|
||||
self.lazydata = LazyBuffer.fromCPU(data.astype(dtype.np) if dtype is not None else data, device)
|
||||
elif isinstance(data, LazyBuffer):
|
||||
assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
|
||||
self.lazydata = data
|
||||
else:
|
||||
raise RuntimeError(f"can't create Tensor from {data}")
|
||||
|
@ -64,13 +66,12 @@ class Tensor:
|
|||
@property
|
||||
def shape(self) -> Tuple[int, ...]: return self.lazydata.shape
|
||||
|
||||
# dtype handling was very broken. it's always float32 now
|
||||
@property
|
||||
def dtype(self) -> type: return np.float32
|
||||
|
||||
@property
|
||||
def device(self) -> str: return self.lazydata.device
|
||||
|
||||
@property
|
||||
def dtype(self) -> DType: return self.lazydata.dtype
|
||||
|
||||
# ***** data handlers ****
|
||||
|
||||
def realize(self) -> Tensor:
|
||||
|
@ -132,11 +133,11 @@ class Tensor:
|
|||
def manual_seed(seed=None): Tensor._rng = np.random.default_rng(seed=seed)
|
||||
|
||||
@staticmethod
|
||||
def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.random(size=shape, dtype=np.float32), shape), **kwargs)
|
||||
def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.random(size=shape, dtype=dtype), shape, np.float32), **kwargs)
|
||||
|
||||
# TODO: replace with a transformation from uniform -> gaussian
|
||||
@staticmethod
|
||||
def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.standard_normal(size=shape, dtype=np.float32), shape), **kwargs)
|
||||
def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.standard_normal(size=shape, dtype=dtype), shape, np.float32), **kwargs)
|
||||
|
||||
# ***** rng hlops *****
|
||||
|
||||
|
@ -442,10 +443,16 @@ class Tensor:
|
|||
|
||||
def dropout(self, p=0.5) -> Tensor:
|
||||
if not Tensor.training: return self
|
||||
_mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
|
||||
# TODO: why is this going through numpy?
|
||||
_mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=np.float32)
|
||||
return self * Tensor(_mask, requires_grad=False, device=self.device) * (1/(1.0 - p))
|
||||
|
||||
# ***** cast ops *****
|
||||
|
||||
# TODO: this is a hack, but if we add float(0), it will become a float. need real casting support
|
||||
def float(self) -> Tensor: return self.add(Tensor([0], device=self.device, dtype=dtypes.float32, requires_grad=self.requires_grad))
|
||||
|
||||
# register functions to move between devices
|
||||
for device in [device for device in Device._buffers.keys() if device[0] != "_"]:
|
||||
for device in Device._buffers:
|
||||
setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, device))
|
||||
setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, device))
|
||||
|
|
Loading…
Reference in New Issue