compile works (#688)
* compile works * runtimes * line count * fix custom, to tg dtype * meh, that's fine with lazy importpull/702/head
parent
af7745073f
commit
15e0b56e39
|
@ -1,7 +1,6 @@
|
|||
#!/bin/bash
|
||||
# note: if we compile tinygrad/nn/__init__.py __dict__ no longer works, and optimizers will silently fail
|
||||
mypyc --check-untyped-defs --explicit-package-bases --warn-unreachable tinygrad/shape/shapetracker.py tinygrad/shape/symbolic.py \
|
||||
tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py \
|
||||
#tinygrad/codegen/ast.py tinygrad/codegen/gpu.py tinygrad/ops.py tinygrad/runtime/ops_metal.py
|
||||
#tinygrad/runtime/ops_metal.py tinygrad/shape/__init__.py tinygrad/ops.py tinygrad/codegen/ast.py \
|
||||
#tinygrad/helpers.py tinygrad/mlops.py tinygrad/nn/__init__.py tinygrad/graph.py tinygrad/lazy.py tinygrad/tensor.py
|
||||
|
||||
tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py
|
||||
#tinygrad/ops.py tinygrad/runtime/ops_metal.py tinygrad/runtime/ops_gpu.py tinygrad/runtime/ops_cpu.py tinygrad/lazy.py
|
||||
#tinygrad/codegen/ast.py tinygrad/codegen/gpu.py
|
||||
|
|
|
@ -22,7 +22,6 @@ from extra.helpers import Timing
|
|||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn import Linear
|
||||
from tinygrad.ops import GlobalCounters
|
||||
from tinygrad.nn.optim import get_state_dict
|
||||
|
||||
# https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/llama/model.py#L47
|
||||
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
|
||||
|
@ -198,7 +197,7 @@ if __name__ == "__main__":
|
|||
chatbot = args.prompt == None
|
||||
|
||||
# load model (you have to find the weights yourself)
|
||||
from extra.utils import fake_torch_load_zipped
|
||||
from extra.utils import fake_torch_load_zipped, get_child
|
||||
|
||||
if args.large:
|
||||
raise RuntimeError("large model is broken")
|
||||
|
@ -248,14 +247,14 @@ if __name__ == "__main__":
|
|||
with Timing("loaded weights in ", lambda et_ns: f", {GlobalCounters.mem_used/1e9:.2f} GB loaded at {GlobalCounters.mem_used/et_ns:.2f} GB/s"):
|
||||
weights = fake_torch_load_zipped(open(WEIGHTS_FILENAME, "rb"), load_weights=getenv("WEIGHTS", 1), base_name="consolidated")
|
||||
|
||||
state_dict = get_state_dict(model)
|
||||
#from tinygrad.nn.optim import get_state_dict
|
||||
#state_dict = get_state_dict(model)
|
||||
|
||||
# assign weights (should be free)
|
||||
for k,v in weights.items():
|
||||
if '.inner_attention.rope.freqs' in k: continue # no rope today
|
||||
mv = state_dict[k]
|
||||
assert mv.shape == v.shape, f"shape mismatch in {k}, {mv.shape} != {v.shape}"
|
||||
mv.assign(v).realize()
|
||||
#state_dict[k].assign(v).realize()
|
||||
get_child(model, k).assign(v).realize()
|
||||
|
||||
del weights
|
||||
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
import pickle
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import tempfile
|
||||
import tempfile, platform
|
||||
from collections import defaultdict
|
||||
from tinygrad.helpers import prod, getenv, DEBUG
|
||||
from tinygrad.ops import GlobalCounters
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.lazy import LazyNumpyArray, Device
|
||||
from tinygrad.shape.shapetracker import strides_for_shape
|
||||
OSX = platform.system() == "Darwin"
|
||||
|
||||
def fetch(url):
|
||||
if url.startswith("/"):
|
||||
|
@ -119,7 +120,7 @@ def load_single_weight(t:Tensor, myfile, shape, strides, dtype, mmap_allowed=Fal
|
|||
ret = np.empty(lna.shape, dtype=lna.dtype)
|
||||
myfile.readinto(ret.data)
|
||||
return ret
|
||||
if mmap_allowed and t.device in ["GPU", "CUDA"]: t.lazydata.op.arg.fxn = _mmap
|
||||
if mmap_allowed and not OSX and t.device in ["GPU", "CUDA"]: t.lazydata.op.arg.fxn = _mmap
|
||||
else: t.lazydata.op.arg.fxn = _read
|
||||
t.realize()
|
||||
|
||||
|
|
2
rmso.sh
2
rmso.sh
|
@ -1,3 +1,3 @@
|
|||
#!/bin/bash
|
||||
rm tinygrad/*.so tinygrad/codegen/*.so tinygrad/shape/*.so tinygrad/llops/*.so tinygrad/nn/*.so tinygrad/runtime/*.so *.so
|
||||
rm tinygrad/*.so tinygrad/codegen/*.so tinygrad/shape/*.so tinygrad/nn/*.so tinygrad/runtime/*.so *.so
|
||||
|
||||
|
|
|
@ -22,7 +22,7 @@ def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer:
|
|||
__kernel void atan2(global float *c, global float *a, global float *b) {
|
||||
int idx = get_global_id(0);
|
||||
c[idx] = atan2(a[idx], b[idx]);
|
||||
}""", global_size=[prod(ret.shape)]).build(GPUBuffer.runtime_type).exec([ret, a.contiguous(), b.contiguous()])
|
||||
}""", global_size=[prod(ret.shape)]).build(GPUBuffer.spec.runtime).exec([ret, a.contiguous(), b.contiguous()])
|
||||
return ret
|
||||
|
||||
def atan2_cpu(a:CPUBuffer, b:CPUBuffer) -> CPUBuffer:
|
||||
|
|
|
@ -2,7 +2,7 @@ import itertools
|
|||
from enum import Enum, auto
|
||||
from typing import List, Tuple
|
||||
from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
|
||||
from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers
|
||||
from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers, GenericShape, ASTRunner
|
||||
from tinygrad.shape.shapetracker import ShapeTracker, View, strides_for_shape
|
||||
|
||||
def get_first_reduce(shapes):
|
||||
|
@ -41,7 +41,7 @@ class ASTKernel:
|
|||
else:
|
||||
output_shape = None
|
||||
|
||||
self.info = get_lazyop_info(ast)
|
||||
self.info: GenericShape = get_lazyop_info(ast)
|
||||
self.bufs = dedup(get_buffers(ast))
|
||||
for b in self.bufs: b.st.simplify()
|
||||
self.ast = ast
|
||||
|
@ -124,6 +124,8 @@ class ASTKernel:
|
|||
for i in range(len(self.sts)):
|
||||
print(prefix, self.bufs[i].dtype if self.bufs[i] is not None else None, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
|
||||
|
||||
def codegen(self) -> ASTRunner: raise NotImplementedError("need a codegen")
|
||||
|
||||
@property
|
||||
def shape_len(self) -> int: return len(self.sts[0].shape)
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import os, math, functools
|
||||
import numpy as np
|
||||
from typing import Tuple, Union, List, NamedTuple, Final
|
||||
from typing import Tuple, Union, List, NamedTuple, Final, Iterator
|
||||
|
||||
def dedup(x): return list(dict.fromkeys(x)) # retains list order
|
||||
def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x)
|
||||
|
@ -10,7 +10,7 @@ def all_same(items): return all(x == items[0] for x in items) if len(items) > 0
|
|||
def colored(st, color, background=False, bright=False): return f"\u001b[{10*background+60*bright+30+['black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white'].index(color)}m{st}\u001b[0m" # replace the termcolor library with one line
|
||||
def partition(lst, fxn): return [x for x in lst if fxn(x)], [x for x in lst if not fxn(x)]
|
||||
def make_pair(x:Union[int, Tuple[int, ...]], cnt=2) -> Tuple[int, ...]: return (x,)*cnt if isinstance(x, int) else x
|
||||
def flatten(l): return [item for sublist in l for item in sublist]
|
||||
def flatten(l:Iterator): return [item for sublist in l for item in sublist]
|
||||
def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
from __future__ import annotations
|
||||
from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type
|
||||
import os, sys, weakref, importlib, inspect, functools
|
||||
import sys, weakref, importlib, inspect, functools, pathlib
|
||||
from weakref import WeakValueDictionary
|
||||
from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray
|
||||
from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray, flatten
|
||||
from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
|
||||
from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
|
||||
from tinygrad.ops import DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
|
||||
from tinygrad.runtime.ops_cpu import CPUBuffer
|
||||
from tinygrad.graph import log_op
|
||||
|
||||
# lazy can recurse a lot
|
||||
|
@ -15,10 +16,10 @@ LAZY = getenv("LAZY", 1)
|
|||
|
||||
class _Device:
|
||||
def __init__(self) -> None:
|
||||
self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]}
|
||||
self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
|
||||
self.DEFAULT: str = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, "CPU")
|
||||
@functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
|
||||
def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0]
|
||||
def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}'), inspect.isclass) if (cname.lower() == x.lower() + "buffer")][0]
|
||||
Device = _Device()
|
||||
|
||||
# TODO: movement ops that only change shape are really nops. treat them as such
|
||||
|
@ -155,7 +156,7 @@ class LazyBuffer:
|
|||
# NOTE: we also have to copy the numpy array on the way out...otherwise the underlying Tensor could be freed and use after free. improve this?
|
||||
def toCPU(self):
|
||||
ret = self.realize().toCPU()
|
||||
log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
|
||||
log_op(CPUBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
|
||||
return ret.copy()
|
||||
|
||||
def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
|
||||
|
@ -206,9 +207,8 @@ class LazyBuffer:
|
|||
# move permutes before reshapes if we can
|
||||
if op == MovementOps.PERMUTE and PUSH_PERMUTES and self.realized is None and self.op.op == MovementOps.RESHAPE and isinstance(self.op.src[0], LazyBuffer):
|
||||
if shape_idx_groups := get_contraction(self.op.src[0].shape, self.shape):
|
||||
new_arg: List[int] = functools.reduce(lambda r, x: r + shape_idx_groups[x], arg, [])
|
||||
self.op.src[0].children.discard(self) # this changes nothing?
|
||||
return self.op.src[0].movement_op(MovementOps.PERMUTE, tuple(new_arg)) \
|
||||
return self.op.src[0].movement_op(MovementOps.PERMUTE, tuple(flatten(shape_idx_groups[i] for i in arg))) \
|
||||
.movement_op(MovementOps.RESHAPE, ShapeTracker(self.st).movement_op(op, arg).shape)
|
||||
|
||||
# if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead. NOTE: UnaryOps is never an OpType
|
||||
|
|
|
@ -27,7 +27,7 @@ class LazyOp(NamedTuple):
|
|||
# Any == Union[LazyBuffer, DeviceBuffer]
|
||||
def get_buffers(op:LazyOp) -> List[Any]: return functools.reduce(operator.add, [get_buffers(x) if isinstance(x, LazyOp) else [x] for x in op.src], [])
|
||||
def get_lazyops(op:LazyOp) -> List[LazyOp]: return functools.reduce(operator.add, [get_lazyops(x) for x in op.src if isinstance(x, LazyOp)], [op])
|
||||
def map_buffers(real_srcs, x:LazyOp) -> LazyOp:
|
||||
def map_buffers(real_srcs:Dict[Any, Any], x:Any) -> LazyOp:
|
||||
if x in real_srcs: return map_buffers(real_srcs, real_srcs[x]) if isinstance(real_srcs[x], LazyOp) else real_srcs[x]
|
||||
return LazyOp(x.op, tuple((map_buffers(real_srcs, y) if isinstance(y, LazyOp) else real_srcs[y]) for y in x.src), x.arg)
|
||||
|
||||
|
@ -86,7 +86,7 @@ shape_fxn_for_op: Dict[Op, Callable] = {
|
|||
**{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
|
||||
**{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
|
||||
**{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}}
|
||||
def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
|
||||
def get_lazyop_info(ast:LazyOp) -> GenericShape: return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
|
||||
|
||||
# used in CPUBuffer and TorchBuffer
|
||||
class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
||||
|
@ -94,7 +94,7 @@ class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
|||
def __init__(self, lbuf:Any):
|
||||
self._buf: Any = lbuf
|
||||
self.shape: Tuple[int, ...] = tuple(lbuf.shape)
|
||||
self.dtype: DType = self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
|
||||
self.dtype: DType = self.to_tinygrad_dtype() if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
|
||||
# NOTE: this is overcounting the memory used, as reshapes and stuff are aliases
|
||||
self._memsz = (prod(self.shape) * self.dtype.itemsize) if not isinstance(lbuf, GenericShape) else 0
|
||||
GlobalCounters.mem_used += self._memsz
|
||||
|
@ -163,9 +163,17 @@ class ASTRunner:
|
|||
local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice
|
||||
return min([(self.timeit(rawbufs, local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])[1]
|
||||
|
||||
from tinygrad.codegen.ast import ASTKernel
|
||||
class Specialized(NamedTuple):
|
||||
raw_buffer: Type[RawBuffer]
|
||||
codegen: Type[ASTKernel]
|
||||
runtime: Type
|
||||
|
||||
# assumes you are using ShapeTracker
|
||||
# used in GPUBuffer and LLVMBuffer
|
||||
class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
||||
spec: ClassVar[Specialized]
|
||||
|
||||
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32):
|
||||
self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
|
||||
self.shape = self.st.shape
|
||||
|
@ -179,13 +187,12 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
|||
|
||||
def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__}), dtype={self.dtype}), dtype={self.dtype})" if self._backing is not None else f", force_create=True, dtype={self.dtype}), dtype={self.dtype})")
|
||||
|
||||
raw_buffer_type: ClassVar[Type[RawBuffer]]
|
||||
@classmethod
|
||||
def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
|
||||
def create_raw_buffer(self, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
|
||||
assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape"
|
||||
assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching"
|
||||
if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}")
|
||||
return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing)
|
||||
return self.spec.raw_buffer(prod(shape), dtype) if backing is None else self.spec.raw_buffer.fromCPU(backing)
|
||||
|
||||
def raw(self) -> RawBuffer:
|
||||
if self._buf is None:
|
||||
if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}")
|
||||
|
@ -200,19 +207,16 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
|
|||
if DEBUG >= 3: print(f"**** copy out {self.shape}")
|
||||
return self.contiguous().raw().toCPU().reshape(self.shape)
|
||||
|
||||
codegen_type: ClassVar[Any]
|
||||
runtime_type: ClassVar[Type]
|
||||
|
||||
method_cache: Final[Dict[str, ASTRunner]] = {}
|
||||
@classmethod
|
||||
def exec_ast(cls, ast:LazyOp, output_buffer:Optional[CompiledBuffer]=None):
|
||||
k = cls.codegen_type(ast, output_buffer)
|
||||
k = cls.spec.codegen(ast, output_buffer)
|
||||
if getenv("ENABLE_METHOD_CACHE", 1): # this is the default now
|
||||
if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.runtime_type)
|
||||
if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.spec.runtime)
|
||||
elif DEBUG >= 4: print(f"method cache hit : {k.key}")
|
||||
prg = cls.method_cache[k.key]
|
||||
else:
|
||||
prg = k.codegen().build(cls.runtime_type)
|
||||
prg = k.codegen().build(cls.spec.runtime)
|
||||
if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1":
|
||||
k.print()
|
||||
print(prg.prg)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import os, time, ctypes, hashlib, subprocess, platform
|
||||
from tinygrad.helpers import dtypes, DType
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferMapped
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferMapped, Specialized
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
|
||||
class RawMallocBuffer(RawBufferMapped):
|
||||
|
@ -29,4 +29,4 @@ class ClangCodegen(GPUCodegen):
|
|||
lang = GPULanguage(buffer_suffix=" restrict")
|
||||
|
||||
class ClangBuffer(CompiledBuffer):
|
||||
raw_buffer_type, codegen_type, runtime_type = RawMallocBuffer, ClangCodegen, ClangProgram
|
||||
spec = Specialized(RawMallocBuffer, ClangCodegen, ClangProgram)
|
||||
|
|
|
@ -36,7 +36,7 @@ numpy_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
|
|||
|
||||
class CPUBuffer(InterpretedBuffer):
|
||||
fxn_for_op: ClassVar = numpy_fxn_for_op
|
||||
to_tinygrad_dtype = staticmethod(dtypes.from_np)
|
||||
def to_tinygrad_dtype(self): return dtypes.from_np(self._buf)
|
||||
|
||||
@staticmethod
|
||||
def fromCPU(x): return CPUBuffer(x)
|
||||
|
|
|
@ -4,7 +4,7 @@ import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # no
|
|||
import pycuda.driver as cuda # type: ignore
|
||||
from pycuda.compiler import compile as cuda_compile # type: ignore
|
||||
from tinygrad.helpers import DEBUG
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut, Specialized
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
|
||||
class RawCUDABuffer(RawBufferCopyInOut):
|
||||
|
@ -47,4 +47,4 @@ class CUDACodegen(GPUCodegen):
|
|||
lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])
|
||||
|
||||
class CUDABuffer(CompiledBuffer):
|
||||
raw_buffer_type, codegen_type, runtime_type = RawCUDABuffer, CUDACodegen, CUDAProgram
|
||||
spec = Specialized(RawCUDABuffer, CUDACodegen, CUDAProgram)
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from __future__ import annotations
|
||||
import platform, functools
|
||||
import platform
|
||||
import numpy as np
|
||||
import pyopencl as cl # type: ignore
|
||||
from typing import Optional, List, Final
|
||||
from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
|
||||
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
|
||||
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer, Specialized
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
|
||||
OSX = platform.system() == "Darwin"
|
||||
|
@ -12,16 +12,12 @@ OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.p
|
|||
FLOAT16 = getenv("FLOAT16", 0)
|
||||
|
||||
class _CL:
|
||||
@functools.cached_property
|
||||
def cl_ctx(self) -> cl.Context:
|
||||
def __init__(self):
|
||||
devices: List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
|
||||
if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
|
||||
if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
|
||||
return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
|
||||
|
||||
@functools.cached_property
|
||||
def cl_queue(self) -> cl.CommandQueue:
|
||||
return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
|
||||
self.cl_ctx: cl.Context = cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
|
||||
self.cl_queue: cl.CommandQueue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
|
||||
CL = _CL()
|
||||
|
||||
class CLBuffer(RawBufferCopyInOut):
|
||||
|
@ -39,7 +35,7 @@ class CLImage(RawBuffer): # pylint: disable=abstract-method
|
|||
GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
|
||||
def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
#@functools.lru_cache(maxsize=None)
|
||||
class CLProgram:
|
||||
def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
|
||||
self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore
|
||||
|
@ -77,11 +73,8 @@ class CLCodegen(GPUCodegen):
|
|||
gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])
|
||||
|
||||
class GPUBuffer(CompiledBuffer):
|
||||
raw_buffer_type = CLBuffer
|
||||
spec = Specialized(CLBuffer, CLCodegen, CLProgram)
|
||||
# override this method for image
|
||||
@classmethod
|
||||
def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
|
||||
def create_raw_buffer(self, shape, backing, dtype) -> RawBuffer:
|
||||
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
|
||||
else: return super().create_raw_buffer(shape, backing, dtype)
|
||||
codegen_type = CLCodegen
|
||||
runtime_type = CLProgram
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import time, hashlib, ctypes
|
||||
from typing import ClassVar
|
||||
from tinygrad.ops import CompiledBuffer
|
||||
from tinygrad.ops import CompiledBuffer, Specialized
|
||||
from tinygrad.runtime.ops_clang import RawMallocBuffer
|
||||
from tinygrad.helpers import getenv, DEBUG
|
||||
from ctypes import CFUNCTYPE
|
||||
|
@ -63,4 +63,4 @@ class LLVMProgram:
|
|||
if wait: return time.monotonic()-st
|
||||
|
||||
class LLVMBuffer(CompiledBuffer):
|
||||
raw_buffer_type, codegen_type, runtime_type = RawMallocBuffer, LLVMCodegen, LLVMProgram
|
||||
spec = Specialized(RawMallocBuffer, LLVMCodegen, LLVMProgram)
|
||||
|
|
|
@ -1,21 +1,18 @@
|
|||
# pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch
|
||||
import os, subprocess, pathlib, functools
|
||||
import os, subprocess, pathlib
|
||||
import Metal, Cocoa, libdispatch # type: ignore
|
||||
from typing import List, Any, Final
|
||||
from typing import List, Any
|
||||
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
||||
from tinygrad.helpers import prod, getenv, DEBUG, DType
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferMapped
|
||||
from tinygrad.ops import CompiledBuffer, RawBufferMapped, Specialized
|
||||
|
||||
METAL_XCODE = getenv("METAL_XCODE")
|
||||
|
||||
class _METAL:
|
||||
mtl_buffers_in_flight: Final[List[Any]] = []
|
||||
@functools.cached_property
|
||||
def device(self) -> Any:
|
||||
return Metal.MTLCreateSystemDefaultDevice()
|
||||
@functools.cached_property
|
||||
def mtl_queue(self) -> Any:
|
||||
return METAL.device.newCommandQueue()
|
||||
def __init__(self):
|
||||
self.mtl_buffers_in_flight: List[Any] = []
|
||||
self.device = Metal.MTLCreateSystemDefaultDevice()
|
||||
self.mtl_queue = self.device.newCommandQueue()
|
||||
METAL = _METAL()
|
||||
|
||||
class RawMetalBuffer(RawBufferMapped):
|
||||
|
@ -85,6 +82,4 @@ class MetalCodegen(GPUCodegen):
|
|||
extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'])
|
||||
|
||||
class MetalBuffer(CompiledBuffer):
|
||||
raw_buffer_type = RawMetalBuffer
|
||||
codegen_type = MetalCodegen
|
||||
runtime_type = MetalProgram
|
||||
spec = Specialized(RawMetalBuffer, MetalCodegen, MetalProgram)
|
||||
|
|
|
@ -15,7 +15,7 @@ torch_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
|
|||
device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
|
||||
class TorchBuffer(InterpretedBuffer):
|
||||
fxn_for_op: ClassVar = torch_fxn_for_op
|
||||
to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype])
|
||||
def to_tinygrad_dtype(self): return {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[self._buf.dtype]
|
||||
|
||||
@staticmethod
|
||||
def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device))
|
||||
|
|
|
@ -210,7 +210,7 @@ dispatch: Dict[MovementOps, Callable] = {MovementOps.RESHAPE: ShapeTracker._resh
|
|||
MovementOps.SHRINK: ShapeTracker._shrink, MovementOps.PERMUTE: ShapeTracker._permute, MovementOps.STRIDE: ShapeTracker._stride}
|
||||
|
||||
# returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
|
||||
def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]):
|
||||
def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Optional[List[List[int]]]:
|
||||
# Pre-allocate all groups.
|
||||
axis_groups: List[List[int]] = [[] for _ in range(len(new_shape))]
|
||||
# Index for new_shape and axis_groups.
|
||||
|
@ -221,7 +221,7 @@ def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]):
|
|||
if new_shape[i] == 1 and old_shape[old_shape_i] != 1:
|
||||
if i < len(new_shape) - 1: i += 1
|
||||
else:
|
||||
if new_shape[i] % old_shape[old_shape_i] != 0 or prod([old_shape[x] for x in axis_groups[i]]) * old_shape[old_shape_i] > new_shape[i]:
|
||||
if new_shape[i] % old_shape[old_shape_i] != 0 or prod([old_shape[x] for x in axis_groups[i]]) * old_shape[old_shape_i] > new_shape[i]:
|
||||
return None
|
||||
axis_groups[i].append(old_shape_i)
|
||||
# Move to next axes group if total size of all dimensions match.
|
||||
|
|
Loading…
Reference in New Issue