compile works (#688)

* compile works * runtimes * line count * fix custom, to tg dtype * meh, that's fine with lazy import
2023-03-12 11:01:25 -07:00 · 2023-03-12 11:01:25 -07:00 · 15e0b56e39
parent af7745073f
commit 15e0b56e39
17 changed files with 71 additions and 78 deletions
--- a/compile.sh
+++ b/compile.sh
@ -1,7 +1,6 @@
 #!/bin/bash
+# note: if we compile tinygrad/nn/__init__.py __dict__ no longer works, and optimizers will silently fail
 mypyc --check-untyped-defs --explicit-package-bases --warn-unreachable tinygrad/shape/shapetracker.py tinygrad/shape/symbolic.py \
-  tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py \
-  #tinygrad/codegen/ast.py tinygrad/codegen/gpu.py tinygrad/ops.py tinygrad/runtime/ops_metal.py
-  #tinygrad/runtime/ops_metal.py tinygrad/shape/__init__.py tinygrad/ops.py tinygrad/codegen/ast.py \
-  #tinygrad/helpers.py tinygrad/mlops.py tinygrad/nn/__init__.py tinygrad/graph.py tinygrad/lazy.py tinygrad/tensor.py
-
+  tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py
+  #tinygrad/ops.py tinygrad/runtime/ops_metal.py tinygrad/runtime/ops_gpu.py tinygrad/runtime/ops_cpu.py tinygrad/lazy.py
+  #tinygrad/codegen/ast.py tinygrad/codegen/gpu.py
--- a/examples/llama.py
+++ b/examples/llama.py
@ -22,7 +22,6 @@ from extra.helpers import Timing
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Linear
 from tinygrad.ops import GlobalCounters
-from tinygrad.nn.optim import get_state_dict

 # https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/llama/model.py#L47
 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
@ -198,7 +197,7 @@ if __name__ == "__main__":
  chatbot = args.prompt == None

  # load model (you have to find the weights yourself)
-  from extra.utils import fake_torch_load_zipped
+  from extra.utils import fake_torch_load_zipped, get_child

  if args.large:
    raise RuntimeError("large model is broken")
@ -248,14 +247,14 @@ if __name__ == "__main__":
    with Timing("loaded weights in ", lambda et_ns: f", {GlobalCounters.mem_used/1e9:.2f} GB loaded at {GlobalCounters.mem_used/et_ns:.2f} GB/s"):
      weights = fake_torch_load_zipped(open(WEIGHTS_FILENAME, "rb"), load_weights=getenv("WEIGHTS", 1), base_name="consolidated")

-    state_dict = get_state_dict(model)
+    #from tinygrad.nn.optim import get_state_dict
+    #state_dict = get_state_dict(model)

    # assign weights (should be free)
    for k,v in weights.items():
      if '.inner_attention.rope.freqs' in k: continue  # no rope today
-      mv = state_dict[k]
-      assert mv.shape == v.shape, f"shape mismatch in {k}, {mv.shape} != {v.shape}"
-      mv.assign(v).realize()
+      #state_dict[k].assign(v).realize()
+      get_child(model, k).assign(v).realize()

    del weights

--- a/extra/utils.py
+++ b/extra/utils.py
@ -1,13 +1,14 @@
 import pickle
 import numpy as np
 from tqdm import tqdm
-import tempfile
+import tempfile, platform
 from collections import defaultdict
 from tinygrad.helpers import prod, getenv, DEBUG
 from tinygrad.ops import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.lazy import LazyNumpyArray, Device
 from tinygrad.shape.shapetracker import strides_for_shape
+OSX = platform.system() == "Darwin"

 def fetch(url):
  if url.startswith("/"):
@ -119,7 +120,7 @@ def load_single_weight(t:Tensor, myfile, shape, strides, dtype, mmap_allowed=Fal
      ret = np.empty(lna.shape, dtype=lna.dtype)
      myfile.readinto(ret.data)
      return ret
-    if mmap_allowed and t.device in ["GPU", "CUDA"]: t.lazydata.op.arg.fxn = _mmap
+    if mmap_allowed and not OSX and t.device in ["GPU", "CUDA"]: t.lazydata.op.arg.fxn = _mmap
    else: t.lazydata.op.arg.fxn = _read
    t.realize()

--- a/rmso.sh
+++ b/rmso.sh
@ -1,3 +1,3 @@
 #!/bin/bash
-rm tinygrad/*.so tinygrad/codegen/*.so tinygrad/shape/*.so tinygrad/llops/*.so tinygrad/nn/*.so tinygrad/runtime/*.so *.so
+rm tinygrad/*.so tinygrad/codegen/*.so tinygrad/shape/*.so tinygrad/nn/*.so tinygrad/runtime/*.so *.so

--- a/test/test_custom_function.py
+++ b/test/test_custom_function.py
@ -22,7 +22,7 @@ def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer:
    __kernel void atan2(global float *c, global float *a, global float *b) {
      int idx = get_global_id(0);
      c[idx] = atan2(a[idx], b[idx]);
-    }""", global_size=[prod(ret.shape)]).build(GPUBuffer.runtime_type).exec([ret, a.contiguous(), b.contiguous()])
+    }""", global_size=[prod(ret.shape)]).build(GPUBuffer.spec.runtime).exec([ret, a.contiguous(), b.contiguous()])
  return ret

 def atan2_cpu(a:CPUBuffer, b:CPUBuffer) -> CPUBuffer:
--- a/tinygrad/codegen/ast.py
+++ b/tinygrad/codegen/ast.py
@ -2,7 +2,7 @@ import itertools
 from enum import Enum, auto
 from typing import List, Tuple
 from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
-from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers
+from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers, GenericShape, ASTRunner
 from tinygrad.shape.shapetracker import ShapeTracker, View, strides_for_shape

 def get_first_reduce(shapes):
@ -41,7 +41,7 @@ class ASTKernel:
    else:
      output_shape = None

-    self.info = get_lazyop_info(ast)
+    self.info: GenericShape = get_lazyop_info(ast)
    self.bufs = dedup(get_buffers(ast))
    for b in self.bufs: b.st.simplify()
    self.ast = ast
@ -124,6 +124,8 @@ class ASTKernel:
    for i in range(len(self.sts)):
      print(prefix, self.bufs[i].dtype if self.bufs[i] is not None else None, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")

+  def codegen(self) -> ASTRunner: raise NotImplementedError("need a codegen")
+
  @property
  def shape_len(self) -> int: return len(self.sts[0].shape)

--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@ -1,6 +1,6 @@
 import os, math, functools
 import numpy as np
-from typing import Tuple, Union, List, NamedTuple, Final
+from typing import Tuple, Union, List, NamedTuple, Final, Iterator

 def dedup(x): return list(dict.fromkeys(x))   # retains list order
 def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x)
@ -10,7 +10,7 @@ def all_same(items): return all(x == items[0] for x in items) if len(items) > 0
 def colored(st, color, background=False, bright=False): return f"\u001b[{10*background+60*bright+30+['black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white'].index(color)}m{st}\u001b[0m"  # replace the termcolor library with one line
 def partition(lst, fxn): return [x for x in lst if fxn(x)], [x for x in lst if not fxn(x)]
 def make_pair(x:Union[int, Tuple[int, ...]], cnt=2) -> Tuple[int, ...]: return (x,)*cnt if isinstance(x, int) else x
-def flatten(l): return [item for sublist in l for item in sublist]
+def flatten(l:Iterator): return [item for sublist in l for item in sublist]
 def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"

@functools.lru_cache(maxsize=None)
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@ -1,10 +1,11 @@
 from __future__ import annotations
 from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type
-import os, sys, weakref, importlib, inspect, functools
+import sys, weakref, importlib, inspect, functools, pathlib
 from weakref import WeakValueDictionary
-from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray
+from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray, flatten
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
-from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
+from tinygrad.ops import DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
+from tinygrad.runtime.ops_cpu import CPUBuffer
 from tinygrad.graph import log_op

 # lazy can recurse a lot
@ -15,10 +16,10 @@ LAZY = getenv("LAZY", 1)

 class _Device:
  def __init__(self) -> None:
-    self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]}
+    self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
    self.DEFAULT: str = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, "CPU")
  @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
-  def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0]
+  def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}'), inspect.isclass) if (cname.lower() == x.lower() + "buffer")][0]
 Device = _Device()

 # TODO: movement ops that only change shape are really nops. treat them as such
@ -155,7 +156,7 @@ class LazyBuffer:
  # NOTE: we also have to copy the numpy array on the way out...otherwise the underlying Tensor could be freed and use after free. improve this?
  def toCPU(self):
    ret = self.realize().toCPU()
-    log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
+    log_op(CPUBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
    return ret.copy()

  def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
@ -206,9 +207,8 @@ class LazyBuffer:
    # move permutes before reshapes if we can
    if op == MovementOps.PERMUTE and PUSH_PERMUTES and self.realized is None and self.op.op == MovementOps.RESHAPE and isinstance(self.op.src[0], LazyBuffer):
      if shape_idx_groups := get_contraction(self.op.src[0].shape, self.shape):
-        new_arg: List[int] = functools.reduce(lambda r, x: r + shape_idx_groups[x], arg, [])
        self.op.src[0].children.discard(self)   # this changes nothing?
-        return self.op.src[0].movement_op(MovementOps.PERMUTE, tuple(new_arg)) \
+        return self.op.src[0].movement_op(MovementOps.PERMUTE, tuple(flatten(shape_idx_groups[i] for i in arg))) \
          .movement_op(MovementOps.RESHAPE, ShapeTracker(self.st).movement_op(op, arg).shape)

    # if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead. NOTE: UnaryOps is never an OpType
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@ -27,7 +27,7 @@ class LazyOp(NamedTuple):
 # Any == Union[LazyBuffer, DeviceBuffer]
 def get_buffers(op:LazyOp) -> List[Any]: return functools.reduce(operator.add, [get_buffers(x) if isinstance(x, LazyOp) else [x] for x in op.src], [])
 def get_lazyops(op:LazyOp) -> List[LazyOp]: return functools.reduce(operator.add, [get_lazyops(x) for x in op.src if isinstance(x, LazyOp)], [op])
-def map_buffers(real_srcs, x:LazyOp) -> LazyOp:
+def map_buffers(real_srcs:Dict[Any, Any], x:Any) -> LazyOp:
  if x in real_srcs: return map_buffers(real_srcs, real_srcs[x]) if isinstance(real_srcs[x], LazyOp) else real_srcs[x]
  return LazyOp(x.op, tuple((map_buffers(real_srcs, y) if isinstance(y, LazyOp) else real_srcs[y]) for y in x.src), x.arg)

@ -86,7 +86,7 @@ shape_fxn_for_op: Dict[Op, Callable] = {
  **{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
  **{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
  **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}}
-def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf
+def get_lazyop_info(ast:LazyOp) -> GenericShape: return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf

 # used in CPUBuffer and TorchBuffer
 class InterpretedBuffer(DeviceBuffer):  # pylint: disable=abstract-method
@ -94,7 +94,7 @@ class InterpretedBuffer(DeviceBuffer):  # pylint: disable=abstract-method
  def __init__(self, lbuf:Any):
    self._buf: Any = lbuf
    self.shape: Tuple[int, ...] = tuple(lbuf.shape)
-    self.dtype: DType = self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
+    self.dtype: DType = self.to_tinygrad_dtype() if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
    # NOTE: this is overcounting the memory used, as reshapes and stuff are aliases
    self._memsz = (prod(self.shape) * self.dtype.itemsize) if not isinstance(lbuf, GenericShape) else 0
    GlobalCounters.mem_used += self._memsz
@ -163,9 +163,17 @@ class ASTRunner:
    local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2  # try each valid size twice
    return min([(self.timeit(rawbufs, local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])[1]

+from tinygrad.codegen.ast import ASTKernel
+class Specialized(NamedTuple):
+  raw_buffer: Type[RawBuffer]
+  codegen: Type[ASTKernel]
+  runtime: Type
+
 # assumes you are using ShapeTracker
 # used in GPUBuffer and LLVMBuffer
 class CompiledBuffer(DeviceBuffer):  # pylint: disable=abstract-method
+  spec: ClassVar[Specialized]
+
  def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32):
    self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
    self.shape = self.st.shape
@ -179,13 +187,12 @@ class CompiledBuffer(DeviceBuffer):  # pylint: disable=abstract-method

  def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__}), dtype={self.dtype}), dtype={self.dtype})" if self._backing is not None else f", force_create=True, dtype={self.dtype}), dtype={self.dtype})")

-  raw_buffer_type: ClassVar[Type[RawBuffer]]
-  @classmethod
-  def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
+  def create_raw_buffer(self, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
    assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape"
    assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching"
    if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}")
-    return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing)
+    return self.spec.raw_buffer(prod(shape), dtype) if backing is None else self.spec.raw_buffer.fromCPU(backing)
+
  def raw(self) -> RawBuffer:
    if self._buf is None:
      if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}")
@ -200,19 +207,16 @@ class CompiledBuffer(DeviceBuffer):  # pylint: disable=abstract-method
    if DEBUG >= 3: print(f"**** copy out {self.shape}")
    return self.contiguous().raw().toCPU().reshape(self.shape)

-  codegen_type: ClassVar[Any]
-  runtime_type: ClassVar[Type]
-
  method_cache: Final[Dict[str, ASTRunner]] = {}
  @classmethod
  def exec_ast(cls, ast:LazyOp, output_buffer:Optional[CompiledBuffer]=None):
-    k = cls.codegen_type(ast, output_buffer)
+    k = cls.spec.codegen(ast, output_buffer)
    if getenv("ENABLE_METHOD_CACHE", 1):  # this is the default now
-      if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.runtime_type)
+      if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.spec.runtime)
      elif DEBUG >= 4: print(f"method cache hit : {k.key}")
      prg = cls.method_cache[k.key]
    else:
-      prg = k.codegen().build(cls.runtime_type)
+      prg = k.codegen().build(cls.spec.runtime)
    if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1":
      k.print()
      print(prg.prg)
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@ -1,6 +1,6 @@
 import os, time, ctypes, hashlib, subprocess, platform
 from tinygrad.helpers import dtypes, DType
-from tinygrad.ops import CompiledBuffer, RawBufferMapped
+from tinygrad.ops import CompiledBuffer, RawBufferMapped, Specialized
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 class RawMallocBuffer(RawBufferMapped):
@ -29,4 +29,4 @@ class ClangCodegen(GPUCodegen):
  lang = GPULanguage(buffer_suffix=" restrict")

 class ClangBuffer(CompiledBuffer):
-  raw_buffer_type, codegen_type, runtime_type = RawMallocBuffer, ClangCodegen, ClangProgram
+  spec = Specialized(RawMallocBuffer, ClangCodegen, ClangProgram)
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@ -36,7 +36,7 @@ numpy_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{

 class CPUBuffer(InterpretedBuffer):
  fxn_for_op: ClassVar = numpy_fxn_for_op
-  to_tinygrad_dtype = staticmethod(dtypes.from_np)
+  def to_tinygrad_dtype(self): return dtypes.from_np(self._buf)

  @staticmethod
  def fromCPU(x): return CPUBuffer(x)
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@ -4,7 +4,7 @@ import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # no
 import pycuda.driver as cuda # type: ignore
 from pycuda.compiler import compile as cuda_compile # type: ignore
 from tinygrad.helpers import DEBUG
-from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut
+from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut, Specialized
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 class RawCUDABuffer(RawBufferCopyInOut):
@ -47,4 +47,4 @@ class CUDACodegen(GPUCodegen):
    lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])

 class CUDABuffer(CompiledBuffer):
-  raw_buffer_type, codegen_type, runtime_type = RawCUDABuffer, CUDACodegen, CUDAProgram
+  spec = Specialized(RawCUDABuffer, CUDACodegen, CUDAProgram)
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@ -1,10 +1,10 @@
 from __future__ import annotations
-import platform, functools
+import platform
 import numpy as np
 import pyopencl as cl  # type: ignore
 from typing import Optional, List, Final
 from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
-from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
+from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer, Specialized
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 OSX = platform.system() == "Darwin"
@ -12,16 +12,12 @@ OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.p
 FLOAT16 = getenv("FLOAT16", 0)

 class _CL:
-  @functools.cached_property
-  def cl_ctx(self) -> cl.Context:
+  def __init__(self):
    devices: List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
    if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
    if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
-    return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
-
-  @functools.cached_property
-  def cl_queue(self) -> cl.CommandQueue:
-    return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)  # this is an in-order command queue
+    self.cl_ctx: cl.Context = cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
+    self.cl_queue: cl.CommandQueue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)  # this is an in-order command queue
 CL = _CL()

 class CLBuffer(RawBufferCopyInOut):
@ -39,7 +35,7 @@ class CLImage(RawBuffer):  # pylint: disable=abstract-method
    GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
  def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height

-@functools.lru_cache(maxsize=None)
+#@functools.lru_cache(maxsize=None)
 class CLProgram:
  def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
    self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg)  # type: ignore
@ -77,11 +73,8 @@ class CLCodegen(GPUCodegen):
    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])

 class GPUBuffer(CompiledBuffer):
-  raw_buffer_type = CLBuffer
+  spec = Specialized(CLBuffer, CLCodegen, CLProgram)
  # override this method for image
-  @classmethod
-  def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
+  def create_raw_buffer(self, shape, backing, dtype) -> RawBuffer:
    if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)   # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
    else: return super().create_raw_buffer(shape, backing, dtype)
-  codegen_type = CLCodegen
-  runtime_type = CLProgram
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@ -1,6 +1,6 @@
 import time, hashlib, ctypes
 from typing import ClassVar
-from tinygrad.ops import CompiledBuffer
+from tinygrad.ops import CompiledBuffer, Specialized
 from tinygrad.runtime.ops_clang import RawMallocBuffer
 from tinygrad.helpers import getenv, DEBUG
 from ctypes import CFUNCTYPE
@ -63,4 +63,4 @@ class LLVMProgram:
    if wait: return time.monotonic()-st

 class LLVMBuffer(CompiledBuffer):
-  raw_buffer_type, codegen_type, runtime_type = RawMallocBuffer, LLVMCodegen, LLVMProgram
+  spec = Specialized(RawMallocBuffer, LLVMCodegen, LLVMProgram)
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@ -1,21 +1,18 @@
 # pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch
-import os, subprocess, pathlib, functools
+import os, subprocess, pathlib
 import Metal, Cocoa, libdispatch # type: ignore
-from typing import List, Any, Final
+from typing import List, Any
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
 from tinygrad.helpers import prod, getenv, DEBUG, DType
-from tinygrad.ops import CompiledBuffer, RawBufferMapped
+from tinygrad.ops import CompiledBuffer, RawBufferMapped, Specialized

 METAL_XCODE = getenv("METAL_XCODE")

 class _METAL:
-  mtl_buffers_in_flight: Final[List[Any]] = []
-  @functools.cached_property
-  def device(self) -> Any:
-    return Metal.MTLCreateSystemDefaultDevice()
-  @functools.cached_property
-  def mtl_queue(self) -> Any:
-    return METAL.device.newCommandQueue()
+  def __init__(self):
+    self.mtl_buffers_in_flight: List[Any] = []
+    self.device = Metal.MTLCreateSystemDefaultDevice()
+    self.mtl_queue = self.device.newCommandQueue()
 METAL = _METAL()

 class RawMetalBuffer(RawBufferMapped):
@ -85,6 +82,4 @@ class MetalCodegen(GPUCodegen):
    extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'])

 class MetalBuffer(CompiledBuffer):
-  raw_buffer_type = RawMetalBuffer
-  codegen_type = MetalCodegen
-  runtime_type = MetalProgram
+  spec = Specialized(RawMetalBuffer, MetalCodegen, MetalProgram)
--- a/tinygrad/runtime/ops_torch.py
+++ b/tinygrad/runtime/ops_torch.py
@ -15,7 +15,7 @@ torch_fxn_for_op: Dict[Op, Callable] = {**base_fxn_for_op, **{
 device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
 class TorchBuffer(InterpretedBuffer):
  fxn_for_op: ClassVar = torch_fxn_for_op
-  to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype])
+  def to_tinygrad_dtype(self): return {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[self._buf.dtype]

  @staticmethod
  def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device))
--- a/tinygrad/shape/shapetracker.py
+++ b/tinygrad/shape/shapetracker.py
@ -210,7 +210,7 @@ dispatch: Dict[MovementOps, Callable] = {MovementOps.RESHAPE: ShapeTracker._resh
                                         MovementOps.SHRINK: ShapeTracker._shrink, MovementOps.PERMUTE: ShapeTracker._permute, MovementOps.STRIDE: ShapeTracker._stride}

 # returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
-def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]):
+def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Optional[List[List[int]]]:
  # Pre-allocate all groups.
  axis_groups: List[List[int]] = [[] for _ in range(len(new_shape))]
  # Index for new_shape and axis_groups.
@ -221,7 +221,7 @@ def get_contraction(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]):
    if new_shape[i] == 1 and old_shape[old_shape_i] != 1:
      if i < len(new_shape) - 1: i += 1
    else:
-      if new_shape[i] % old_shape[old_shape_i]  != 0 or prod([old_shape[x] for x in axis_groups[i]]) * old_shape[old_shape_i]  > new_shape[i]:
+      if new_shape[i] % old_shape[old_shape_i] != 0 or prod([old_shape[x] for x in axis_groups[i]]) * old_shape[old_shape_i] > new_shape[i]:
        return None
      axis_groups[i].append(old_shape_i)
      # Move to next axes group if total size of all dimensions match.