
compile works (#688)

* compile works

* runtimes

* line count

* fix custom, to tg dtype

* meh, that's fine with lazy import
George Hotz 2023-03-12 11:01:25 -07:00 committed by GitHub
parent af7745073f
commit 15e0b56e39
17 changed files with 71 additions and 78 deletions

View File

@@ -1,7 +1,6 @@
#!/bin/bash
# note: if we compile tinygrad/nn/__init__.py __dict__ no longer works, and optimizers will silently fail
mypyc --check-untyped-defs --explicit-package-bases --warn-unreachable tinygrad/shape/shapetracker.py tinygrad/shape/symbolic.py \
-tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py \
-#tinygrad/codegen/ast.py tinygrad/codegen/gpu.py tinygrad/ops.py tinygrad/runtime/ops_metal.py
-#tinygrad/runtime/ops_metal.py tinygrad/shape/__init__.py tinygrad/ops.py tinygrad/codegen/ast.py \
-#tinygrad/helpers.py tinygrad/mlops.py tinygrad/nn/__init__.py tinygrad/graph.py tinygrad/lazy.py tinygrad/tensor.py
+tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py
+#tinygrad/ops.py tinygrad/runtime/ops_metal.py tinygrad/runtime/ops_gpu.py tinygrad/runtime/ops_cpu.py tinygrad/lazy.py
+#tinygrad/codegen/ast.py tinygrad/codegen/gpu.py
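
Why the note above matters, as a minimal sketch (not tinygrad's actual code): parameter discovery that walks an instance's __dict__ finds nothing once mypyc compiles the class, because compiled classes keep attributes in fixed slots, so an optimizer built from the result silently trains nothing.

class Linear:
    def __init__(self, in_features: int, out_features: int):
        self.weight = [[0.0] * in_features for _ in range(out_features)]

def get_parameters(obj):
    # relies on instance __dict__; silently returns {} if __dict__ is gone
    return dict(getattr(obj, "__dict__", {}))

layer = Linear(3, 4)
print(list(get_parameters(layer)))  # ['weight'] interpreted; [] once compiled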

View File

@@ -22,7 +22,6 @@ from extra.helpers import Timing
from tinygrad.tensor import Tensor
from tinygrad.nn import Linear
from tinygrad.ops import GlobalCounters
-from tinygrad.nn.optim import get_state_dict
# https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/llama/model.py#L47
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
@@ -198,7 +197,7 @@ if __name__ == "__main__":
chatbot = args.prompt == None
# load model (you have to find the weights yourself)
-from extra.utils import fake_torch_load_zipped
+from extra.utils import fake_torch_load_zipped, get_child
if args.large:
raise RuntimeError("large model is broken")
@@ -248,14 +247,14 @@ if __name__ == "__main__":
with Timing("loaded weights in ", lambda et_ns: f", {GlobalCounters.mem_used/1e9:.2f} GB loaded at {GlobalCounters.mem_used/et_ns:.2f} GB/s"):
weights = fake_torch_load_zipped(open(WEIGHTS_FILENAME, "rb"), load_weights=getenv("WEIGHTS", 1), base_name="consolidated")
-state_dict = get_state_dict(model)
+#from tinygrad.nn.optim import get_state_dict
+#state_dict = get_state_dict(model)
# assign weights (should be free)
for k,v in weights.items():
if '.inner_attention.rope.freqs' in k: continue # no rope today
-mv = state_dict[k]
-assert mv.shape == v.shape, f"shape mismatch in {k}, {mv.shape} != {v.shape}"
-mv.assign(v).realize()
+#state_dict[k].assign(v).realize()
+get_child(model, k).assign(v).realize()
del weights
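
A hedged sketch of what a get_child-style helper does (the real one lives in extra/utils.py and is not shown in this diff): resolve a dotted checkpoint key such as "layers.0.attention.wq.weight" against the live module tree, so no __dict__-based state_dict is needed.

def get_child(parent, key: str):
    obj = parent
    for k in key.split('.'):
        # numeric parts index into lists (e.g. "layers.0"); the rest are attributes
        obj = obj[int(k)] if k.isnumeric() else getattr(obj, k)
    return obj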

View File

@@ -1,13 +1,14 @@
import pickle
import numpy as np
from tqdm import tqdm
-import tempfile
+import tempfile, platform
from collections import defaultdict
from tinygrad.helpers import prod, getenv, DEBUG
from tinygrad.ops import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.lazy import LazyNumpyArray, Device
from tinygrad.shape.shapetracker import strides_for_shape
+OSX = platform.system() == "Darwin"
def fetch(url):
if url.startswith("/"):
@@ -119,7 +120,7 @@ def load_single_weight(t:Tensor, myfile, shape, strides, dtype, mmap_allowed=Fal
ret = np.empty(lna.shape, dtype=lna.dtype)
myfile.readinto(ret.data)
return ret
-if mmap_allowed and t.device in ["GPU", "CUDA"]: t.lazydata.op.arg.fxn = _mmap
+if mmap_allowed and not OSX and t.device in ["GPU", "CUDA"]: t.lazydata.op.arg.fxn = _mmap
else: t.lazydata.op.arg.fxn = _read
t.realize()
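
A simplified, assumed sketch of the _mmap path gated above: map the checkpoint once and view the weight bytes in place instead of copying them through np.empty + readinto as the _read path does (this presumes the tensor data is stored uncompressed, as in torch zip checkpoints).

import mmap
import numpy as np

def mmap_weight(f, offset: int, shape, dtype) -> np.ndarray:
    mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    count = int(np.prod(shape))
    # zero-copy view into the mapped file; the OS pages data in on demand
    return np.frombuffer(mm, dtype=dtype, count=count, offset=offset).reshape(shape)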

View File

@@ -1,3 +1,3 @@
#!/bin/bash
-rm tinygrad/*.so tinygrad/codegen/*.so tinygrad/shape/*.so tinygrad/llops/*.so tinygrad/nn/*.so tinygrad/runtime/*.so *.so
+rm tinygrad/*.so tinygrad/codegen/*.so tinygrad/shape/*.so tinygrad/nn/*.so tinygrad/runtime/*.so *.so

View File

@@ -22,7 +22,7 @@ def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer:
__kernel void atan2(global float *c, global float *a, global float *b) {
int idx = get_global_id(0);
c[idx] = atan2(a[idx], b[idx]);
-}""", global_size=[prod(ret.shape)]).build(GPUBuffer.runtime_type).exec([ret, a.contiguous(), b.contiguous()])
+}""", global_size=[prod(ret.shape)]).build(GPUBuffer.spec.runtime).exec([ret, a.contiguous(), b.contiguous()])
return ret
def atan2_cpu(a:CPUBuffer, b:CPUBuffer) -> CPUBuffer:

View File

@@ -2,7 +2,7 @@ import itertools
from enum import Enum, auto
from typing import List, Tuple
from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
-from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers
+from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers, GenericShape, ASTRunner
from tinygrad.shape.shapetracker import ShapeTracker, View, strides_for_shape
def get_first_reduce(shapes):
@@ -41,7 +41,7 @@ class ASTKernel:
else:
output_shape = None
-self.info = get_lazyop_info(ast)
+self.info: GenericShape = get_lazyop_info(ast)
self.bufs = dedup(get_buffers(ast))
for b in self.bufs: b.st.simplify()
self.ast = ast
@@ -124,6 +124,8 @@ class ASTKernel:
for i in range(len(self.sts)):
print(prefix, self.bufs[i].dtype if self.bufs[i] is not None else None, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
+def codegen(self) -> ASTRunner: raise NotImplementedError("need a codegen")
+@property
+def shape_len(self) -> int: return len(self.sts[0].shape)

View File

@@ -1,6 +1,6 @@
import os, math, functools
import numpy as np
-from typing import Tuple, Union, List, NamedTuple, Final
+from typing import Tuple, Union, List, NamedTuple, Final, Iterator
def dedup(x): return list(dict.fromkeys(x)) # retains list order
def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x)
@@ -10,7 +10,7 @@ def all_same(items): return all(x == items[0] for x in items) if len(items) > 0
def colored(st, color, background=False, bright=False): return f"\u001b[{10*background+60*bright+30+['black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white'].index(color)}m{st}\u001b[0m" # replace the termcolor library with one line
def partition(lst, fxn): return [x for x in lst if fxn(x)], [x for x in lst if not fxn(x)]
def make_pair(x:Union[int, Tuple[int, ...]], cnt=2) -> Tuple[int, ...]: return (x,)*cnt if isinstance(x, int) else x
-def flatten(l): return [item for sublist in l for item in sublist]
+def flatten(l:Iterator): return [item for sublist in l for item in sublist]
def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"
@functools.lru_cache(maxsize=None)
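
A quick illustration of flatten, whose new Iterator hint documents that generators are accepted (the PUSH_PERMUTES change in lazy.py below relies on that): it flattens exactly one level.

def flatten(l): return [item for sublist in l for item in sublist]

print(flatten([[1, 2], [3], [4, 5]]))         # [1, 2, 3, 4, 5]
print(flatten([i, i * 2] for i in range(3)))  # [0, 0, 1, 2, 2, 4]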

View File

@@ -1,10 +1,11 @@
from __future__ import annotations
from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type
-import os, sys, weakref, importlib, inspect, functools
+import sys, weakref, importlib, inspect, functools, pathlib
from weakref import WeakValueDictionary
-from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray
+from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray, flatten
from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
-from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
+from tinygrad.ops import DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
+from tinygrad.runtime.ops_cpu import CPUBuffer
from tinygrad.graph import log_op
# lazy can recurse a lot
@@ -15,10 +16,10 @@ LAZY = getenv("LAZY", 1)
class _Device:
def __init__(self) -> None:
-self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]}
+self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
self.DEFAULT: str = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, "CPU")
@functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
-def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0]
+def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}'), inspect.isclass) if (cname.lower() == x.lower() + "buffer")][0]
Device = _Device()
# TODO: movement ops that only change shape are really nops. treat them as such
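
A standalone sketch, with assumed names, of the discovery pattern above: scan tinygrad/runtime for ops_*.py to build the device list eagerly, but import a backend module (and thus its driver) only when that device is first subscripted.

import importlib, inspect, pathlib

def available_devices(runtime_dir: pathlib.Path) -> list:
    return [p.stem[len("ops_"):].upper() for p in runtime_dir.iterdir() if p.stem.startswith("ops_")]

def load_buffer_class(name: str):
    # lazy import: nothing device-related is touched until this runs
    mod = importlib.import_module(f"tinygrad.runtime.ops_{name.lower()}")
    return [cls for cname, cls in inspect.getmembers(mod, inspect.isclass)
            if cname.lower() == name.lower() + "buffer"][0]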
@@ -155,7 +156,7 @@ class LazyBuffer:
# NOTE: we also have to copy the numpy array on the way out...otherwise the underlying Tensor could be freed and use after free. improve this?
def toCPU(self):
ret = self.realize().toCPU()
-log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
+log_op(CPUBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
return ret.copy()
def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
@@ -206,9 +207,8 @@ class LazyBuffer:
# move permutes before reshapes if we can
if op == MovementOps.PERMUTE and PUSH_PERMUTES and self.realized is None and self.op.op == MovementOps.RESHAPE and isinstance(self.op.src[0], LazyBuffer):
if shape_idx_groups := get_contraction(self.op.src[0].shape, self.shape):
-new_arg: List[int] = functools.reduce(lambda r, x: r + shape_idx_groups[x], arg, [])
self.op.src[0].children.discard(self) # this changes nothing?
-return self.op.src[0].movement_op(MovementOps.PERMUTE, tuple(new_arg)) \
+return self.op.src[0].movement_op(MovementOps.PERMUTE, tuple(flatten(shape_idx_groups[i] for i in arg))) \
.movement_op(MovementOps.RESHAPE, ShapeTracker(self.st).movement_op(op, arg).shape)
# if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead. NOTE: UnaryOps is never an OpType
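
A worked example, with made-up shapes, of the rewrite above: a (2,3,4) buffer reshaped to (6,4) and then permuted can instead permute the original axes first, since get_contraction((2,3,4), (6,4)) groups old axes per new axis as [[0, 1], [2]].

def flatten(l): return [item for sublist in l for item in sublist]

shape_idx_groups = [[0, 1], [2]]  # from get_contraction((2, 3, 4), (6, 4))
arg = (1, 0)                      # requested permute on the reshaped (6, 4) buffer

new_arg = tuple(flatten(shape_idx_groups[i] for i in arg))
print(new_arg)  # (2, 0, 1): permute (2,3,4) -> (4,2,3), then reshape to (4,6)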

View File

@@ -27,7 +27,7 @@ class LazyOp(NamedTuple):
# Any == Union[LazyBuffer, DeviceBuffer]
def get_buffers(op:LazyOp) -> List[Any]: return functools.reduce(operator.add, [get_buffers(x) if isinstance(x, LazyOp) else [x] for x in op.src], [])
def get_lazyops(op:LazyOp) -> List[LazyOp]: return functools.reduce(operator.add, [get_lazyops(x) for x in op.src if isinstance(x, LazyOp)], [op])
-def map_buffers(real_srcs, x:LazyOp) -> LazyOp:
+def map_buffers(real_srcs:Dict[Any, Any], x:Any) -> LazyOp:
if x in real_srcs: return map_buffers(real_srcs, real_srcs[x]) if isinstance(real_srcs[x], LazyOp) else real_srcs[x]
return LazyOp(x.op, tuple((map_buffers(real_srcs, y) if isinstance(y, LazyOp) else real_srcs[y]) for y in x.src), x.arg)
@@ -86,7 +86,7 @@ shape_fxn_for_op: Dict[Op, Callable] = {
**{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
**{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
**{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}}
-def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
+def get_lazyop_info(ast:LazyOp) -> GenericShape: return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
# used in CPUBuffer and TorchBuffer
class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
@@ -94,7 +94,7 @@ class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method
def __init__(self, lbuf:Any):
self._buf: Any = lbuf
self.shape: Tuple[int, ...] = tuple(lbuf.shape)
-self.dtype: DType = self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
+self.dtype: DType = self.to_tinygrad_dtype() if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
# NOTE: this is overcounting the memory used, as reshapes and stuff are aliases
self._memsz = (prod(self.shape) * self.dtype.itemsize) if not isinstance(lbuf, GenericShape) else 0
GlobalCounters.mem_used += self._memsz
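
A minimal sketch (not tinygrad's classes, dtype elided) of the trick get_lazyop_info uses above: execute the AST with an interpreter whose buffers carry only a shape and a flops counter, so shape and cost inference reuse the ordinary execution path.

import math
from typing import NamedTuple, Tuple

class GenericShape(NamedTuple):
    shape: Tuple[int, ...]
    flops: int = 0

def binary_op(a: GenericShape, b: GenericShape) -> GenericShape:
    # an elementwise op costs one flop per output element
    assert a.shape == b.shape
    return GenericShape(a.shape, a.flops + b.flops + math.prod(a.shape))

x, y = GenericShape((4, 4)), GenericShape((4, 4))
print(binary_op(binary_op(x, y), y))  # GenericShape(shape=(4, 4), flops=32)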
@@ -163,9 +163,17 @@ class ASTRunner:
local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice
return min([(self.timeit(rawbufs, local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])[1]
+from tinygrad.codegen.ast import ASTKernel
+class Specialized(NamedTuple):
+raw_buffer: Type[RawBuffer]
+codegen: Type[ASTKernel]
+runtime: Type
# assumes you are using ShapeTracker
# used in GPUBuffer and LLVMBuffer
class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
+spec: ClassVar[Specialized]
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32):
self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
self.shape = self.st.shape
@@ -179,13 +187,12 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__}), dtype={self.dtype}), dtype={self.dtype})" if self._backing is not None else f", force_create=True, dtype={self.dtype}), dtype={self.dtype})")
-raw_buffer_type: ClassVar[Type[RawBuffer]]
-@classmethod
-def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
+def create_raw_buffer(self, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape"
assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching"
if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}")
-return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing)
+return self.spec.raw_buffer(prod(shape), dtype) if backing is None else self.spec.raw_buffer.fromCPU(backing)
def raw(self) -> RawBuffer:
if self._buf is None:
if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}")
@@ -200,19 +207,16 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method
if DEBUG >= 3: print(f"**** copy out {self.shape}")
return self.contiguous().raw().toCPU().reshape(self.shape)
-codegen_type: ClassVar[Any]
-runtime_type: ClassVar[Type]
method_cache: Final[Dict[str, ASTRunner]] = {}
@classmethod
def exec_ast(cls, ast:LazyOp, output_buffer:Optional[CompiledBuffer]=None):
-k = cls.codegen_type(ast, output_buffer)
+k = cls.spec.codegen(ast, output_buffer)
if getenv("ENABLE_METHOD_CACHE", 1): # this is the default now
-if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.runtime_type)
+if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.spec.runtime)
elif DEBUG >= 4: print(f"method cache hit : {k.key}")
prg = cls.method_cache[k.key]
else:
-prg = k.codegen().build(cls.runtime_type)
+prg = k.codegen().build(cls.spec.runtime)
if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1":
k.print()
print(prg.prg)
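
A hedged sketch of what a backend now supplies under the Specialized bundle introduced above; FooBuffer, FooCodegen, and FooProgram are hypothetical names, not tinygrad classes. One NamedTuple replaces the three raw_buffer_type/codegen_type/runtime_type ClassVars, and exec_ast reaches everything through cls.spec. Note that the from tinygrad.codegen.ast import ASTKernel line sits mid-file, presumably to dodge the ops/codegen.ast import cycle.

from typing import NamedTuple, Type

class Specialized(NamedTuple):
    raw_buffer: Type  # allocates device memory
    codegen: Type     # lowers a LazyOp AST to kernel source
    runtime: Type     # compiles and launches that source

class RawFooBuffer: ...
class FooCodegen: ...
class FooProgram: ...

class FooBuffer:  # would subclass CompiledBuffer in tinygrad
    spec = Specialized(RawFooBuffer, FooCodegen, FooProgram)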

View File

@@ -1,6 +1,6 @@
import os, time, ctypes, hashlib, subprocess, platform
from tinygrad.helpers import dtypes, DType
-from tinygrad.ops import CompiledBuffer, RawBufferMapped
+from tinygrad.ops import CompiledBuffer, RawBufferMapped, Specialized
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
class RawMallocBuffer(RawBufferMapped):
@@ -29,4 +29,4 @@ class ClangCodegen(GPUCodegen):
lang = GPULanguage(buffer_suffix=" restrict")
class ClangBuffer(CompiledBuffer):
-raw_buffer_type, codegen_type, runtime_type = RawMallocBuffer, ClangCodegen, ClangProgram
+spec = Specialized(RawMallocBuffer, ClangCodegen, ClangProgram)

View File

@@ -36,7 +36,7 @@ numpy_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
class CPUBuffer(InterpretedBuffer):
fxn_for_op: ClassVar = numpy_fxn_for_op
-to_tinygrad_dtype = staticmethod(dtypes.from_np)
+def to_tinygrad_dtype(self): return dtypes.from_np(self._buf)
@staticmethod
def fromCPU(x): return CPUBuffer(x)

View File

@@ -4,7 +4,7 @@ import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # no
import pycuda.driver as cuda # type: ignore
from pycuda.compiler import compile as cuda_compile # type: ignore
from tinygrad.helpers import DEBUG
-from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut
+from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut, Specialized
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
class RawCUDABuffer(RawBufferCopyInOut):
@@ -47,4 +47,4 @@ class CUDACodegen(GPUCodegen):
lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])
class CUDABuffer(CompiledBuffer):
-raw_buffer_type, codegen_type, runtime_type = RawCUDABuffer, CUDACodegen, CUDAProgram
+spec = Specialized(RawCUDABuffer, CUDACodegen, CUDAProgram)

View File

@@ -1,10 +1,10 @@
from __future__ import annotations
-import platform, functools
+import platform
import numpy as np
import pyopencl as cl # type: ignore
from typing import Optional, List, Final
from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
-from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
+from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer, Specialized
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
OSX = platform.system() == "Darwin"
@@ -12,16 +12,12 @@ OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.p
FLOAT16 = getenv("FLOAT16", 0)
class _CL:
-@functools.cached_property
-def cl_ctx(self) -> cl.Context:
+def __init__(self):
devices: List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
-return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
-@functools.cached_property
-def cl_queue(self) -> cl.CommandQueue:
-return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
+self.cl_ctx: cl.Context = cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
+self.cl_queue: cl.CommandQueue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
CL = _CL()
class CLBuffer(RawBufferCopyInOut):
@@ -39,7 +35,7 @@ class CLImage(RawBuffer): # pylint: disable=abstract-method
GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height
-@functools.lru_cache(maxsize=None)
+#@functools.lru_cache(maxsize=None)
class CLProgram:
def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore
@@ -77,11 +73,8 @@ class CLCodegen(GPUCodegen):
gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])
class GPUBuffer(CompiledBuffer):
-raw_buffer_type = CLBuffer
+spec = Specialized(CLBuffer, CLCodegen, CLProgram)
# override this method for image
-@classmethod
-def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
+def create_raw_buffer(self, shape, backing, dtype) -> RawBuffer:
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
else: return super().create_raw_buffer(shape, backing, dtype)
-codegen_type = CLCodegen
-runtime_type = CLProgram
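
A sketch of the tradeoff in the _CL change above (and the _METAL one below): functools.cached_property deferred driver setup until first attribute access, while a plain __init__ runs it as soon as the module is imported. That is acceptable, "meh, that's fine with lazy import" per the commit message, because _Device only imports ops_gpu when the GPU device is first requested.

import functools

class LazyCtx:
    @functools.cached_property
    def ctx(self):
        print("creating context")  # runs on first access
        return object()

class EagerCtx:
    def __init__(self):
        print("creating context")  # runs at construction, i.e. module import
        self.ctx = object()

LazyCtx()   # silent
EagerCtx()  # prints immediately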

View File

@@ -1,6 +1,6 @@
import time, hashlib, ctypes
from typing import ClassVar
-from tinygrad.ops import CompiledBuffer
+from tinygrad.ops import CompiledBuffer, Specialized
from tinygrad.runtime.ops_clang import RawMallocBuffer
from tinygrad.helpers import getenv, DEBUG
from ctypes import CFUNCTYPE
@@ -63,4 +63,4 @@ class LLVMProgram:
if wait: return time.monotonic()-st
class LLVMBuffer(CompiledBuffer):
-raw_buffer_type, codegen_type, runtime_type = RawMallocBuffer, LLVMCodegen, LLVMProgram
+spec = Specialized(RawMallocBuffer, LLVMCodegen, LLVMProgram)

View File

@@ -1,21 +1,18 @@
# pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch
-import os, subprocess, pathlib, functools
+import os, subprocess, pathlib
import Metal, Cocoa, libdispatch # type: ignore
-from typing import List, Any, Final
+from typing import List, Any
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
from tinygrad.helpers import prod, getenv, DEBUG, DType
-from tinygrad.ops import CompiledBuffer, RawBufferMapped
+from tinygrad.ops import CompiledBuffer, RawBufferMapped, Specialized
METAL_XCODE = getenv("METAL_XCODE")
class _METAL:
-mtl_buffers_in_flight: Final[List[Any]] = []
-@functools.cached_property
-def device(self) -> Any:
-return Metal.MTLCreateSystemDefaultDevice()
-@functools.cached_property
-def mtl_queue(self) -> Any:
-return METAL.device.newCommandQueue()
+def __init__(self):
+self.mtl_buffers_in_flight: List[Any] = []
+self.device = Metal.MTLCreateSystemDefaultDevice()
+self.mtl_queue = self.device.newCommandQueue()
METAL = _METAL()
class RawMetalBuffer(RawBufferMapped):
@@ -85,6 +82,4 @@ class MetalCodegen(GPUCodegen):
extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'])
class MetalBuffer(CompiledBuffer):
-raw_buffer_type = RawMetalBuffer
-codegen_type = MetalCodegen
-runtime_type = MetalProgram
+spec = Specialized(RawMetalBuffer, MetalCodegen, MetalProgram)

View File

@@ -15,7 +15,7 @@ torch_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
class TorchBuffer(InterpretedBuffer):
fxn_for_op: ClassVar = torch_fxn_for_op
-to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype])
+def to_tinygrad_dtype(self): return {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[self._buf.dtype]
@staticmethod
def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device))
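
A simplified sketch, with assumed names, of the dtype hook unified here (the "fix custom, to tg dtype" bullet): both interpreted backends now expose to_tinygrad_dtype as an instance method that inspects self._buf, so the hasattr branch in InterpretedBuffer.__init__ shown earlier can call it without arguments.

import numpy as np

NP_TO_TG = {np.dtype(np.float16): "float16", np.dtype(np.float32): "float32"}

class FakeCPUBuffer:
    def __init__(self, buf: np.ndarray):
        self._buf = buf
        # mirrors InterpretedBuffer.__init__: ask the instance, pass no argument
        self.dtype = self.to_tinygrad_dtype() if hasattr(self, "to_tinygrad_dtype") else buf.dtype

    def to_tinygrad_dtype(self):
        return NP_TO_TG[self._buf.dtype]

print(FakeCPUBuffer(np.zeros(4, np.float32)).dtype)  # float32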

View File

@@ -210,7 +210,7 @@ dispatch: Dict[MovementOps, Callable] = {MovementOps.RESHAPE: ShapeTracker._resh
MovementOps.SHRINK: ShapeTracker._shrink, MovementOps.PERMUTE: ShapeTracker._permute, MovementOps.STRIDE: ShapeTracker._stride}
# returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
-def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]):
+def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Optional[List[List[int]]]:
# Pre-allocate all groups.
axis_groups: List[List[int]] = [[] for _ in range(len(new_shape))]
# Index for new_shape and axis_groups.
@@ -221,7 +221,7 @@ def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]):
if new_shape[i] == 1 and old_shape[old_shape_i] != 1:
if i < len(new_shape) - 1: i += 1
else:
if new_shape[i] % old_shape[old_shape_i] != 0 or prod([old_shape[x] for x in axis_groups[i]]) * old_shape[old_shape_i] > new_shape[i]:
return None
axis_groups[i].append(old_shape_i)
# Move to next axes group if total size of all dimensions match.
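
Concrete cases for the function above (assuming this commit's tinygrad is importable; outputs follow the algorithm as shown): get_contraction returns, for each axis of new_shape, the old_shape axes that merge into it, or None when the reshape is not a pure axis merge.

from tinygrad.shape.shapetracker import get_contraction

assert get_contraction((2, 3, 4), (6, 4)) == [[0, 1], [2]]  # 2*3 merges into 6
assert get_contraction((2, 3, 4), (24,)) == [[0, 1, 2]]     # everything merges
assert get_contraction((2, 3, 4), (4, 6)) is None           # 4 is not a prefix product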