
new style device (#2530)

* cpu tests pass

* torch works

* works

* metal works

* fix ops_disk

* metal jit works

* fix openpilot

* llvm and clang work

* fix webgpu

* docs are really broken

* LRU works on metal

* delete comment

* revert name to ._buf. LRU only on Compiled

* changes

* allocator

* allocator, getting closer

* lru alloc

* LRUAllocator

* all pass

* metal

* cuda

* test examples

* linearizer

* test fixes

* fix custom + clean realize

* fix hip

* skip tests

* fix tests

* fix size=0

* fix MOCKHIP

* fix thneed

* copy better

* simple

* old style metal copy

* fix thneed

* np reshape

* give cuda a device
pull/2531/head
George Hotz 2023-11-30 17:07:16 -08:00 committed by GitHub
parent e56511b59a
commit 2c363b5f0b
38 changed files with 572 additions and 1039 deletions

View File

@ -197,11 +197,11 @@ jobs:
- if: ${{ matrix.task == 'openpilot' }}
name: Test openpilot fastvits model correctness (float32)
run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python openpilot/compile2.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
- if: ${{ matrix.task == 'openpilot' }}
name: Test multigpu
run: |
PYTHONPATH="." python test/external/dist/test_world.py
PYTHONPATH="." python test/external/dist/test_collectives.py
#- if: ${{ matrix.task == 'openpilot' }}
# name: Test multigpu
# run: |
# PYTHONPATH="." python test/external/dist/test_world.py
# PYTHONPATH="." python test/external/dist/test_collectives.py
- if: ${{ matrix.task == 'onnx' }}
name: Test ONNX (CPU)
run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20

.gitignore vendored
View File

@ -45,3 +45,5 @@ coverage.xml
htmlcov
outputs_yolov8
wandb
model.safetensors
quickstart.py

View File

@ -83,9 +83,9 @@ class LazyBuffer:
# we'll come back to this later
st: ShapeTracker
# if the LazyBuffer is realized, it has a RawBuffer
# we will come back to RawBuffers later
realized: Optional[RawBuffer]
# if the LazyBuffer is realized, it has a Buffer
# we will come back to Buffer later
realized: Optional[Buffer]
# if the lazybuffer is unrealized, it has a LazyOp
# this LazyOp describes the computation needed to realize this LazyBuffer
@ -142,7 +142,7 @@ assert result.lazydata.realized is None, "the LazyBuffer is not realized yet"
result.realize()
assert result.lazydata.realized is not None, "the LazyBuffer is realized!"
# this brings us nicely to DeviceBuffer, of which the realized ClangBuffer is a subclass
assert 'RawMallocBuffer' in str(type(result.lazydata.realized))
#assert 'RawMallocBuffer' in str(type(result.lazydata.realized))
# getting ahead of ourselves, but we can copy the DeviceBuffer toCPU
assert result.lazydata.realized.toCPU()[0] == 5, "when put in numpy with toCPU, it's 5"
@ -153,9 +153,6 @@ assert result.lazydata.realized.toCPU()[0] == 5, "when put in numpy with toCPU,
# Interpreted backends are very simple (example: CPU and TORCH)
class Interpreted:
# they have a backing RawBuffer
buffer: Type[RawBuffer]
# and they have a lookup table to functions for the Ops
fxn_for_op: Dict[Op, Callable] = {
UnaryOps.EXP2: lambda x: np.exp2(x),
@ -163,9 +160,6 @@ class Interpreted:
# Compiled backends take a little more (example: GPU and LLVM)
class Compiled:
# they also have a backing RawBuffer
buffer: Type[RawBuffer]
# a code generator, which compiles the AST
codegen: Type[Linearizer]
@ -178,41 +172,28 @@ class Runtime(ABC):
# the constructor compiles the code
def __init__(self, name:str, prg:str): pass
# call runs the code on the bufs. NOTE: the output is always bufs[0], but this is just a convention
def __call__(self, global_size:Optional[List[int]], local_size:Optional[List[int]], *bufs:List[RawBuffer]): pass
def __call__(self, *bufs:List[Buffer], global_size:Optional[List[int]], local_size:Optional[List[int]]): pass
# %%
# == RawBuffer (in tinygrad/runtime/lib.py, code 5/10) ==
# == Buffer (in tinygrad/device.py, code 6/10) ==
import numpy as np
# RawBuffer is where the data is actually held. it's pretty close to just memory
class RawBuffer(ABC):
# Buffer is where the data is actually held. it's pretty close to just memory
class Buffer(ABC):
# create an empty rawbuffer that holds `size` elements of type `dtype`
# `buf` is an opaque container class
def __init__(self, size:int, dtype:DType, buf:Any): raise NotImplementedError("must be implemented")
# `opaque` is an opaque container class
def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None): pass
# fromCPU is classmethod that creates a RawBuffer, it's a classmethod since some runtimes are 0 copy
@classmethod
def fromCPU(cls:RawBuffer, x:np.ndarray) -> RawBuffer: raise NotImplementedError("must be implemented")
# toCPU converts the RawBuffer to a numpy array with shape (size,). many backends are 0 copy here
def toCPU(self) -> np.ndarray: raise NotImplementedError("must be implemented")
# RawNumpyBuffer is a RawBuffer example for numpy. It's very simple
class RawNumpyBuffer(RawBuffer):
# NOTE: the "np.ndarray" is stored in the opaque container
def __init__(self, buf:np.ndarray):
super().__init__(buf.size, dtypes.from_np(buf.dtype), buf)
@classmethod
def fromCPU(cls, x): return cls(x)
def toCPU(self): return self._buf
# toCPU converts the RawBuffer to a numpy array with shape (size,)
def toCPU(self) -> np.ndarray: pass
# %%
# == Example: 2+3 in raw clang ==
# RawMallocBuffer is the simplest concrete version of RawBuffer (in tinygrad/ops.py)
# MallocAllocator is the simplest concrete version of Allocator (in tinygrad/device.py)
# it's used for the CLANG and LLVM backends
# it's just malloc(size * dtype.itemsize)
from tinygrad.runtime.lib import RawMallocBuffer
from tinygrad.device import MallocAllocator
# ClangProgram is the simplest runtime (in tinygrad/runtime/ops_clang.py, code 7/10)
# __init__ calls clang, and __call__ calls the function in the *.so outputted by clang
@ -224,16 +205,21 @@ from tinygrad.runtime.ops_clang import ClangProgram, compile_clang
# then we copy the numpy in to RawMallocBuffers
# last, we create an empty output buffer
from tinygrad.helpers import dtypes
input_a, input_b = MallocAllocator.alloc(1, dtypes.float32), MallocAllocator.alloc(1, dtypes.float32)
output = MallocAllocator.alloc(1, dtypes.float32)
# now we copy in the values
numpy_a, numpy_b = np.array([2], dtype=np.float32), np.array([3], dtype=np.float32)
input_a, input_b = RawMallocBuffer.fromCPU(numpy_a), RawMallocBuffer.fromCPU(numpy_b)
output = RawMallocBuffer(1, dtypes.float32)
MallocAllocator.copyin(input_a, numpy_a.data.cast("B"))
MallocAllocator.copyin(input_b, numpy_b.data.cast("B"))
# compile the program, run it, and 2+3 does indeed equal 5
program = ClangProgram("add", compile_clang(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"))
program = ClangProgram("add", compile_clang(f"void add(float *a, float *b, float *c) {{ *a = *b + *c; }}"), bufs=3)
program(output, input_a, input_b)
print(output.toCPU())
assert output.toCPU()[0] == 5, "it's still 5"
np.testing.assert_allclose(output.toCPU(), numpy_a+numpy_b)
numpy_out = np.empty(1, dtype=np.float32)
MallocAllocator.copyout(numpy_out.data.cast("B"), output)
assert numpy_out[0] == 5, "it's still 5"
np.testing.assert_allclose(numpy_out, numpy_a+numpy_b)
# %%
# == Linearizer (in tinygrad/codegen/linearizer.py, code 4/10) ==
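
As a quick sanity check of the Buffer interface described above, here is a minimal sketch (assuming the post-#2530 API and that the CLANG backend is available) of the fromCPU/toCPU round trip, i.e. the allocator's copyin and copyout without running any program:

import numpy as np
from tinygrad.device import Buffer
from tinygrad.helpers import dtypes

x = np.array([2.0, 3.0], dtype=np.float32)
buf = Buffer.fromCPU("CLANG", x)             # Buffer(device, size, dtype) + copyin(x.data)
assert buf.size == 2 and buf.dtype == dtypes.float32
np.testing.assert_allclose(buf.toCPU(), x)   # copyout into a fresh numpy array

Every backend now only has to supply an Allocator with _alloc, copyin and copyout for this to work; the tinygrad/device.py diff further down shows the base classes.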

View File

@ -5,6 +5,7 @@ from tinygrad.helpers import getenv, dtypes
if __name__ == "__main__":
if getenv("DIST"):
dist.preinit()
from extra.dist import collectives
# tinygrad implementation of https://github.com/tysam-code/hlb-CIFAR10/blob/main/main.py
# https://myrtle.ai/learn/how-to-train-your-resnet-8-bag-of-tricks/
@ -22,7 +23,6 @@ from tinygrad.helpers import GlobalCounters
from tinygrad.shape.symbolic import Node
from extra.lr_scheduler import OneCycleLR
from tinygrad.jit import TinyJit
from extra.dist import collectives
BS, EVAL_BS, STEPS = getenv("BS", 512), getenv('EVAL_BS', 500), getenv("STEPS", 1000)

View File

@ -17,9 +17,9 @@ def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str]
key = id(arg)
if key not in bufs:
if key in special_names:
bufs[key] = (special_names[key], arg._memsz, arg.dtype, key)
bufs[key] = (special_names[key], arg.size*arg.dtype.itemsize, arg.dtype, key)
else:
bufs[key] = (f"buf_{bufnum}", arg._memsz, arg.dtype, key)
bufs[key] = (f"buf_{bufnum}", arg.size*arg.dtype.itemsize, arg.dtype, key)
bufnum += 1
if i > 0: bufs_to_save[bufs[key][0]] = arg # if first usage of a buffer is not an output, and it's not a special name
cargs.append(bufs[key][0])

View File

@ -6,8 +6,8 @@ from typing import Any, Dict, List, Tuple
from dataclasses import dataclass
try:
_libhip = ctypes.cdll.LoadLibrary("libamdhip64.so")
_libhiprtc = ctypes.cdll.LoadLibrary("libhiprtc.so")
_libhip = ctypes.cdll.LoadLibrary("/opt/rocm/lib/libamdhip64.so")
_libhiprtc = ctypes.cdll.LoadLibrary("/opt/rocm/lib/libhiprtc.so")
_libhip.hipGetErrorString.restype = ctypes.c_char_p
_libhip.hipGetErrorString.argtypes = [ctypes.c_int]

View File

@ -5,10 +5,12 @@ import json
import traceback
import numpy as np
from tinygrad.runtime.ops_gpu import CLProgram, compile_gpu
from tinygrad.device import Device
from tinygrad.helpers import DEBUG, getenv
from collections import defaultdict
import pyopencl as cl
from tinygrad.runtime.ops_gpu import CL, OSX_TIMING_RATIO
from tinygrad.runtime.ops_gpu import OSX_TIMING_RATIO
CL = Device["GPU"]
DEBUGCL = getenv("DEBUGCL", 0)
FLOAT16 = getenv("FLOAT16", 0)
@ -74,29 +76,29 @@ class Thneed:
if o['arg_type'] == "image2d_t":
if 'buffer_id' in o and o['height'] == 1 and not bufs_loaded[o['buffer_id']]:
# hack: use a image1d since we can back that with a buffer
buf = cl.Image(CL.cl_ctxs[0], mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
buf = cl.Image(CL.ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
else:
# buffer isn't supported in image2d, copy buffer into image
if 'buffer_id' in o and bufs_loaded[o['buffer_id']]:
arr = np.zeros(bufs[o['buffer_id']].size // 2, dtype=np.float16)
cl.enqueue_copy(CL.cl_queue[0], arr, bufs[o['buffer_id']])
buf = cl.Image(CL.cl_ctxs[0], mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
cl.enqueue_copy(CL.queue, arr, bufs[o['buffer_id']])
buf = cl.Image(CL.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=arr)
elif o['needs_load']:
buf = cl.Image(CL.cl_ctxs[0], mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
buf = cl.Image(CL.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt,
shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=o['data'])
else:
buf = cl.Image(CL.cl_ctxs[0], mf.READ_WRITE, tfmt, shape=(o['width'], o['height']))
buf = cl.Image(CL.ctx, mf.READ_WRITE, tfmt, shape=(o['width'], o['height']))
if o['arg_type'] == "image1d_t":
assert not o['needs_load']
assert not bufs_loaded[o['buffer_id']]
buf = cl.Image(CL.cl_ctxs[0], mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
buf = cl.Image(CL.ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']])
else:
if 'data' in o:
buf = cl.Buffer(CL.cl_ctxs[0], mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o['data'])
buf = cl.Buffer(CL.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o['data'])
else:
# zero out buffers
buf = cl.Buffer(CL.cl_ctxs[0], mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size'])
buf = cl.Buffer(CL.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size'])
bufs[o['id']] = buf
bufs_loaded[o['id']] = 'data' in o
@ -108,7 +110,7 @@ class Thneed:
prgs = {}
for o in jdat['binaries']:
nptr = ptr + o['length']
prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr])
prgs[o['name']] = CLProgram(Device["GPU"], o['name'], weights[ptr:nptr])
ptr = nptr
# populate the cl_cache
@ -153,7 +155,7 @@ class Thneed:
for prg, args in self.cl_cache:
# get binaries for saving
if prg.name not in saved_binaries:
binary = prg.clprograms[0].get_info(cl.program_info.BINARIES)
binary = prg.clprogram.get_info(cl.program_info.BINARIES)
assert len(binary) == 1
jdat['binaries'].append({"name":prg.name, "length":len(binary[0])})
binaries.append(binary[0])
@ -161,7 +163,7 @@ class Thneed:
# get the args from the kernel, some need the data saved
targs, args_size = [], []
argdtypes = prg.argdtypes if prg.argdtypes is not None else [None]*(len(args)-2)
argdtypes = [None]*(len(args)-2)
for a,d in zip(args[2:], argdtypes):
if d == np.int16:
targs.append(struct.pack("H", a).decode("latin_1"))
@ -185,7 +187,7 @@ class Thneed:
})
if needs_load:
data = np.empty(a.size//4, dtype=np.float32)
cl.enqueue_copy(CL.cl_queue[0], data, a, is_blocking=True)
cl.enqueue_copy(CL.queue, data, a, is_blocking=True)
weights.append(data.tobytes())
elif isinstance(a, cl.Image):
assert a.format == cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT), "wrong type"
@ -193,12 +195,12 @@ class Thneed:
row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64
size = row_pitch * a.shape[1]
# this is *2 if float16 and *4 if float32
buf = cl.Buffer(CL.cl_ctxs[0], cl.mem_flags.READ_WRITE, size=size * (2 if FLOAT16 else 1))
buf = cl.Buffer(CL.ctx, cl.mem_flags.READ_WRITE, size=size * (2 if FLOAT16 else 1))
# zero out the buffer
cl.enqueue_copy(CL.cl_queue[0], buf, b'\x00'*buf.size, is_blocking=True)
cl.enqueue_copy(CL.queue, buf, b'\x00'*buf.size, is_blocking=True)
CLProgram("from_image_strided", compile_gpu("""
CLProgram(CL, "from_image_strided", compile_gpu("""
__kernel void from_image_strided(read_only image2d_t in, __global float4 *out, int row_pitch) {
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 l;
@ -206,7 +208,7 @@ class Thneed:
l.x = get_global_id(0);
out[l.y*row_pitch + l.x] = read_imagef(in, smp, l);
}
"""), argdtypes=(None, None, np.int32))(a, buf, row_pitch//(4*(2 if FLOAT16 else 4)), global_size=a.shape)
"""), bufs=2, vars=1)(a, buf, row_pitch//(4*(2 if FLOAT16 else 4)), global_size=a.shape)
# multiple of 32 isn't enough
jdat['objects'].append({
@ -216,7 +218,7 @@ class Thneed:
if needs_load:
data = np.empty(size//(2 if FLOAT16 else 4), dtype=np.float32)
cl.enqueue_copy(CL.cl_queue[0], data, buf, is_blocking=True)
cl.enqueue_copy(CL.queue, data, buf, is_blocking=True)
if FLOAT16: data = data.astype(np.float16)
weights.append(data.tobytes())
else:
@ -263,9 +265,9 @@ class Thneed:
events = []
st = time.monotonic()
for prg, args in self.cl_cache:
events.append(prg.clprgs[0](CL.cl_queue[0], *args))
events.append(prg.clprg(CL.queue, *args))
mt = time.monotonic()
CL.synchronize()
Device["GPU"].synchronize()
et = time.monotonic() - st
print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms")

View File

@ -85,7 +85,6 @@ def schedule_to_thneed(schedule, output_fn):
def thneed_test_onnx(onnx_data, output_fn):
import onnx
import pyopencl as cl
from tinygrad.runtime.ops_gpu import CL
import numpy as np
from extra.thneed import Thneed
onnx_model = onnx.load(io.BytesIO(onnx_data))
@ -118,11 +117,11 @@ def thneed_test_onnx(onnx_data, output_fn):
# inputs
for k,v in nt.inputs.items():
cl.enqueue_copy(CL.cl_queue[0], v, new_np_inputs[k], is_blocking=True)
cl.enqueue_copy(Device["GPU"].queue, v, new_np_inputs[k], is_blocking=True)
nt.run()
new_thneed_out = np.empty((nt.outputs[0].size//4,), dtype=np.float32).reshape(new_torch_out.shape)
cl.enqueue_copy(CL.cl_queue[0], new_thneed_out, nt.outputs[0], is_blocking=True)
cl.enqueue_copy(Device["GPU"].queue, new_thneed_out, nt.outputs[0], is_blocking=True)
# compare torch to thneed
np.testing.assert_allclose(new_torch_out, new_thneed_out, atol=1e-4, rtol=1e-2)

View File

@ -1,125 +0,0 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad import Device
from test.helpers import derandomize_model
from examples.llama import Transformer
ALLOCATED_DEV_BUFS = 0
class FakeDeviceBuffer:
def __init__(self, sz, dt, device):
self.id = 1
self.size = sz
self.dtype = dt
self.device = device
global ALLOCATED_DEV_BUFS
ALLOCATED_DEV_BUFS += 1
class FakeAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
def _do_free(self, buf):
buf.id -= 1
assert buf.id == 0, f"Free should be called once, but {buf.id}"
def __del__(self): # Fake allocator should clear all buffers after each test.
for v in self.cached_buffers.values():
for buf, _ in v: self._free_buffer(buf)
FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
def __init__(self, size, dtype, device='0'):
global FAKE_GLOBAL_ALLOCATOR
super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
def helper_test_correctness(gen, train):
from tinygrad.runtime.ops_gpu import CL, CLAllocator
old_alloc = CL.cl_allocator
CL.cl_allocator = CLAllocator(0)
no_alloc_result = train(*gen()).numpy()
Device[Device.DEFAULT].synchronize()
CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
for _ in range(4):
GlobalCounters.reset()
np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
Device[Device.DEFAULT].synchronize()
assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
CL.cl_allocator = old_alloc
def __helper_test_alloc_count(gen, train):
was_alloc = ALLOCATED_DEV_BUFS
for _ in range(2):
train(*gen())
return ALLOCATED_DEV_BUFS - was_alloc
def helper_test_alloc_count(mm, gen, train):
global FAKE_GLOBAL_ALLOCATOR
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = FakeBuffer
Device[Device.DEFAULT].get_runner.cache_clear()
FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
new_allocs = __helper_test_alloc_count(gen, train)
Device[Device.DEFAULT].get_runner.cache_clear()
FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
old_allocs = __helper_test_alloc_count(gen, train)
print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
FAKE_GLOBAL_ALLOCATOR = None
def check_gc():
if Device.DEFAULT == "GPU":
gc.collect() # Need to collect Tensors.
from extra.introspection import print_objects
assert print_objects() == 0
class TestAllocators(unittest.TestCase):
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
def test_lru_allocator_tiny_llama(self):
old_type = Tensor.default_type
Tensor.default_type = dtypes.float16
args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
def __test():
model = Transformer(**args_tiny)
derandomize_model(model)
def test(t): return model(t, 0).realize()
helper_test_correctness(lambda: (Tensor([[1,]]),), test)
__test()
Tensor.default_type = old_type
check_gc()
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
def test_lru_allocator_tiny_llama_alloc_counts(self):
args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
def test_alloc_count(t):
model = Transformer(**args_tiny)
for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
return model(t, 0).realize()
helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
check_gc()
@unittest.skip("huge for CI")
def test_stable_diffusion(self):
from examples.stable_diffusion import UNetModel
model = UNetModel()
derandomize_model(model)
def test(t, t2): return model(t, 801, t2).realize()
helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)
if __name__ == "__main__":
unittest.main()

View File

@ -1,29 +1,27 @@
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from tinygrad.tensor import Tensor
from tinygrad import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.device import Compiled
from tinygrad.device import Compiled, Allocator
from tinygrad.helpers import Profiling
from tinygrad.runtime.lib import RawBuffer
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class RawFakeBuffer(RawBuffer):
def _copyin(self, x:np.ndarray): pass
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class FakeAllocator(Allocator):
def _alloc(self, sz, dtype): return None
def copyin(self, dest, src:memoryview): pass
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
backup_allocator = Device[Device.DEFAULT].allocator
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
Device[Device.DEFAULT].allocator = FakeAllocator()
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
@ -48,7 +46,7 @@ class TestLLaMASpeed(unittest.TestCase):
run_llama("profile")
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
Device[Device.DEFAULT].allocator = backup_allocator
if __name__ == '__main__':
unittest.main()

View File

@ -6,7 +6,7 @@ from tinygrad.nn.state import get_parameters
def derandomize(x):
if isinstance(x, LazyOp):
new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), None if x.op == LoadOps.RAND else x.arg)
x.op = derandomize(x.op)
return x

View File

@ -1,188 +0,0 @@
#!/usr/bin/env python
import unittest
import pytest
import numpy as np
from weakref import ref
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad import Device
from tinygrad.tensor import Tensor
def check_gc():
if Device.DEFAULT == "GPU":
from extra.introspection import print_objects
assert print_objects() == 0
class FakeDeviceBuffer:
def __init__(self, sz, dt, device):
self.id = 1
self.size = sz
self.dtype = dt
self.device = device
def __del__(self):
assert self.id == 0, "Should called _do_free() before"
class FakeAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs):
if size*dtype.itemsize > self._get_cur_free_space(device): raise Exception("OOM")
return FakeDeviceBuffer(size, dtype, device)
def _do_free(self, buf):
buf.id -= 1
assert buf.id == 0, f"Free should be called once, but {buf.id}"
def __del__(self): # Fake allocator should clear all buffers after each test.
for v in self.cached_buffers.values():
for buf, _ in v: self._free_buffer(buf)
FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
def __init__(self, size, dtype, device='0'):
global FAKE_GLOBAL_ALLOCATOR
super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
def alloc(allocator, size, dtype, **kwargs):
global FAKE_GLOBAL_ALLOCATOR
FAKE_GLOBAL_ALLOCATOR = allocator
buf = FakeBuffer(size, dtype, **kwargs)
assert buf.dtype == dtype and buf.size == size
FAKE_GLOBAL_ALLOCATOR = None
return buf
def alloc_free_trace(allocator, size, dtype, **kwargs):
buf = alloc(allocator, size, dtype, **kwargs)
return ref(buf._buf)
def cmp_trace_and_buf(buf, trace_ref): return trace_ref and trace_ref() == buf._buf
class TestAllocators(unittest.TestCase):
def test_lru_allocator_reusage(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(2048)
traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
__test()
usedbuf = alloc(lru_allocator, 16, dtypes.float32)
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert usedbuf != buf, "Nobody should get used buffer"
__test()
assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
test()
check_gc()
def test_lru_allocator_cache_free(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(128)
refs = []
for _ in range(32):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32))
for sz in range(1, 32):
alloc_free_trace(lru_allocator, sz, dtypes.float32)
assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
test()
check_gc()
def test_lru_allocator_multidevice(self):
def test():
lru_allocator = FakeAllocator(256)
refs=[]
for i in range(8):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)))
for i in range(64):
def __test():
dev = str(i % 8)
buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
__test()
for r in refs: assert r() is not None, "All refs should be cached"
test()
check_gc()
def test_lru_allocator_failing_alloc_cleans_cache(self):
def test():
lru_allocator = FakeAllocator(128)
for size in range(1, 4):
alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
assert len(lru_allocator.aging_order['0']) == 3, "All buffers should be cached"
assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"
def always_raise_exception(*args, **kwargs):
raise MemoryError("OOM")
lru_allocator._do_alloc = always_raise_exception
with pytest.raises(Exception):
alloc(lru_allocator, 5, dtypes.float32, device='0')
assert len(lru_allocator.aging_order['0']) == 0, "All buffers should be freed from cache due to failing alloc"
test()
check_gc()
def test_lru_allocator_fail_first_alloc_pass_after_clear_cahce(self):
def test():
lru_allocator = FakeAllocator(128)
for size in range(1, 4):
alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
cache_length = 3
assert len(lru_allocator.aging_order['0']) == cache_length, "All buffers should be cached"
assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"
original_do_alloc = lru_allocator._do_alloc # save the original method
def single_fail_then_pass(*args, **kwargs):
lru_allocator._do_alloc = original_do_alloc # restore the original method
raise MemoryError("OOM")
lru_allocator._do_alloc = single_fail_then_pass
alloc(lru_allocator, 5, dtypes.float32, device='0')
assert len(lru_allocator.aging_order['0']) < cache_length, "Some buffers should be cleaned as first alloc failed"
test()
check_gc()
@unittest.skip("failing in CI")
def test_gpu_copyout(self):
def test():
from tinygrad.runtime.ops_gpu import CL
# Allocation to init the allocator.
tx = Tensor.rand(1)
tx.realize()
free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]
# Spawning 128mb objects to fill half of free_space
will_allocate = free_space // 3
trash_allocation_size = free_space // 2
def sp():
trash_buffer = Tensor.rand(trash_allocation_size // 4)
trash_buffer.realize()
sp()
xx = Tensor.rand(will_allocate // 4)
_ = xx.numpy()
test()
check_gc()
def test_lru_allocator_massive_buffer(self):
with self.assertRaises(AssertionError) as context: alloc(allocator := FakeAllocator(), size := 1e13, dtypes.int8)
self.assertEqual(str(context.exception), f"out of memory - requested: {size/1e9:5.2f} GB, available: {allocator._get_cur_free_space('0')/1e9:5.2f} GB")
@unittest.skipIf(Device.DEFAULT != "METAL", "only applies to Metal")
def test_lru_allocator_metal_max_buffer_length(self):
from tinygrad.runtime.ops_metal import METAL
with self.assertRaises(AssertionError) as context: METAL.allocator._do_alloc(buf_len := (max_buf_len := METAL.device.maxBufferLength()+1), dtypes.int8, '0')
self.assertEqual(str(context.exception), f"Buffer length of {buf_len/1e9:5.2f} GB exceeds Metal's max buffer length of {max_buf_len/1e9:5.2f} GB.")
if __name__ == "__main__":
unittest.main()

View File

@ -8,7 +8,7 @@ from tinygrad.helpers import prod, dtypes
# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import LazyBuffer, create_lazybuffer
from tinygrad.lazy import Buffer, create_lazybuffer
from tinygrad.device import CompiledASTRunner, Device
from tinygrad.shape.shapetracker import ShapeTracker
import pytest
@ -16,17 +16,15 @@ import pytest
pytestmark = pytest.mark.webgpu
# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
CompiledASTRunner(None, "atan2_gpu", """
__kernel void atan2_gpu(global float *c, global float *a, global float *b) {
int idx = get_global_id(0);
c[idx] = atan2(a[idx], b[idx]);
}""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
}""", global_size=[ret.size], bufcount=3).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret, a, b])
def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer): ret.realized._copyin(np.arctan2(a.realized._buf, b.realized._buf))
def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)
# *** second, we write the ATan2 mlop ***
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative

View File

@ -7,6 +7,7 @@ from tinygrad.tensor import Tensor
from tinygrad.jit import CacheCollector
class TestLazyBuffer(unittest.TestCase):
@unittest.skip("it doesn't work like this anymore")
def test_fromcpu_buffer_sharing(self):
a = np.arange(8)
assert LazyBuffer.fromCPU(a).realized._buf is a

View File

@ -3,7 +3,7 @@ import unittest, os
from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
from tinygrad.codegen.linearizer import Linearizer, UOp, UOps
from tinygrad.device import Compiled, Device
from tinygrad.device import Compiled, Device, Buffer
from tinygrad.ops import BufferOps, MemBuffer, ConstBuffer, LazyOp, LoadOps, TernaryOps
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
@ -140,7 +140,7 @@ def helper_realized_ast(r:Tensor):
s = r.lazydata.schedule()
run_schedule(s[:-1]) # run all kernels except the last one
# now all input LazyBuffers buffers in s[-1] should be realized
output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
output_buffer = Buffer(s[-1].out.device, prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
class TestFloat4(unittest.TestCase):
@ -367,7 +367,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
for opt in opts:
k.apply_opt(opt)
prg = to_prg(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
@ -381,7 +381,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
k = Linearizer(realized_ast)
k.hand_coded_optimizations()
prg = Device[Device.DEFAULT].to_program(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
for x in opts: # Check custom transformations if any.

View File

@ -2,7 +2,7 @@ import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.features.search import time_linearizer
from tinygrad.device import Compiled, Device
from tinygrad.device import Compiled, Device, Buffer
from tinygrad.ops import LoadOps
from tinygrad.tensor import Tensor
@ -12,7 +12,7 @@ class TestTimeLinearizer(unittest.TestCase):
def test_reasonable_time(self):
si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
rawbufs = [Buffer(Device.DEFAULT, si.out.st.size(), si.out.dtype)] + [Buffer(Device.DEFAULT, x.st.size(), x.dtype) for x in si.inputs]
tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
assert tm > 0 and tm != float('inf')

View File

@ -2,16 +2,16 @@ from typing import Optional, Tuple, Any, List
import unittest, math
import numpy as np
from tinygrad.helpers import dtypes, getenv, DType, PtrDType
from tinygrad.tensor import Device
from tinygrad.device import Buffer, Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
from tinygrad.device import CompiledASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, UOp
def _uops_to_prg(uops):
def _uops_to_prg(uops, bufcount):
src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
return CompiledASTRunner(None, "test", src,
[1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
runtime_args=runtime_args, bufcount=bufcount).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
uops.append(UOp(uop, dtype, tuple(vin), arg))
@ -24,9 +24,9 @@ def _test_single_value(vals, op, dtype):
loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops)
buf = Buffer(Device.DEFAULT, 1, dtype)
buf2 = [Buffer.fromCPU(Device.DEFAULT, np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops, 1+len(buf2))
prg.exec([buf]+buf2)
return buf.toCPU()[0]
@ -36,8 +36,8 @@ def _test_single_value_const(vals, op, dtype):
loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
prg = _uops_to_prg(uops)
buf = Buffer(Device.DEFAULT, 1, dtype)
prg = _uops_to_prg(uops, 1)
prg.exec([buf])
return buf.toCPU()[0]

View File

@ -3,8 +3,7 @@ import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import dtypes, fetch, temp
from tinygrad.runtime.ops_disk import RawDiskBuffer
from tinygrad.helpers import fetch, temp
from tinygrad.helpers import Timing
def compare_weights_both(url):
@ -40,11 +39,6 @@ class TestRawDiskBuffer(unittest.TestCase):
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
f.readinto(tst)
def test_mmap_read_speed(self):
db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
tst = np.empty(test_size, np.uint8)
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
np.copyto(tst, db.toCPU())
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
class TestSafetensors(unittest.TestCase):
def test_real_safetensors(self):

View File

@ -1,8 +1,9 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Union, Type, Any, List, Optional, Dict, Callable
import numpy as np
from collections import defaultdict
from typing import TYPE_CHECKING, Union, Any, List, Optional, Dict, Callable, Tuple
import importlib, inspect, functools, pathlib, time, re
from tinygrad.helpers import ansilen, DEBUG, getenv, GlobalCounters, colored, BEAM, NOOPT, all_int, to_function_name
from tinygrad.runtime.lib import RawBuffer
from tinygrad.helpers import ansilen, DEBUG, getenv, GlobalCounters, colored, BEAM, NOOPT, all_int, to_function_name, DType, from_mv, dtypes
from tinygrad.shape.symbolic import Variable, sym_infer, sint
from tinygrad.ops import LazyOp, TernaryOps, get_lazyop_info, ReduceOps, BufferOps, BinaryOps, Op
@ -16,9 +17,11 @@ class _Device:
def __init__(self) -> None: self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT
@functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none
def __getitem__(self, x:str) -> Union[Interpreted, Compiled]:
x = x.split(":")[0].upper()
return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._buffers][0]
def __getitem__(self, ix:str) -> Union[Interpreted, Compiled]:
x = ix.split(":")[0].upper()
ret = [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "device") and x in self._buffers][0]
if isinstance(ret, type): ret = ret(ix)
return ret
@functools.cached_property
def DEFAULT(self) -> str:
device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, None) # type: ignore
@ -30,18 +33,64 @@ class _Device:
return "CPU"
Device = _Device()
class Buffer:
def __init__(self, device:str, size:int, dtype:DType, opaque:Any=None):
assert isinstance(dtype, DType)
self.device, self.size, self.dtype = device, size, dtype
self._buf = opaque if opaque is not None else Device[self.device].allocator.alloc(size, dtype)
GlobalCounters.mem_used += self.size * self.dtype.itemsize
def __del__(self):
GlobalCounters.mem_used -= self.size * self.dtype.itemsize
Device[self.device].allocator.free(self._buf, self.size, self.dtype)
def __repr__(self): return f"<buf device:{self.device} size:{self.size}>"
def copyin(self, mv:memoryview):
mv = mv.cast("B", shape=[self.size*self.dtype.itemsize])
assert len(mv) == self.size*self.dtype.itemsize, f"size mismatch, {len(mv)=} != {self.dtype=} {self.size=}"
Device[self.device].allocator.copyin(self._buf, mv)
return self
@staticmethod
def fromCPU(device:str, x:np.ndarray): return Buffer(device, x.size, dtypes.from_np(x.dtype)).copyin(x.data)
def toCPU(self) -> np.ndarray:
ret = np.empty(self.size, self.dtype.np)
if self.size > 0: Device[self.device].allocator.copyout(ret.data.cast("B", shape=[self.size*self.dtype.itemsize]), self._buf)
return ret
# TODO: size, dest, src are the same type. can we enforce this?
class Allocator:
def alloc(self, size:int, dtype:DType): return self._alloc(size, dtype)
def _alloc(self, size:int, dtype:DType): raise NotImplementedError("need alloc")
def free(self, opaque, size:int, dtype:DType): self._free(opaque) # if you are returning a Python object, you don't need a free
def _free(self, opaque): pass
def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
class LRUAllocator(Allocator): # pylint: disable=abstract-method
def __init__(self): self.cache: Dict[Tuple[int, DType], Any] = defaultdict(list)
def alloc(self, size:int, dtype:DType):
if len(c := self.cache[(size, dtype)]): return c.pop()
try:
return self._alloc(size, dtype)
except MemoryError:
self.free_cache()
return self._alloc(size, dtype)
def free_cache(self):
for opaques in self.cache.values():
for opaque in opaques: self._free(opaque)
opaques.clear()
def free(self, opaque:Any, size:int, dtype:DType): self.cache[(size, dtype)].append(opaque)
# **************** shared device helpers ****************
class JITRunner:
def __init__(self):
self.op_estimate, self.mem_estimate = 0, 0
def exec(self, rawbufs:List[RawBuffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
var_vals = var_vals if var_vals is not None else {}
from tinygrad.jit import CacheCollector
et = self(rawbufs, var_vals)
CacheCollector.add(self, rawbufs, var_vals)
return et
def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
raise NotImplementedError("override this")
def update_stats(name:str, op_estimate:sint, mem_estimate:sint, var_vals: Optional[Dict[Variable, int]], et: Optional[float], buf_count, jit=False, num_kernels=1, lra: Optional[Dict]=None):
@ -64,18 +113,16 @@ class InterpretedASTRunner(JITRunner):
info = get_lazyop_info(ast)
self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> float:
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> float:
st = time.perf_counter()
ret: RawBuffer = self.fxn(rawbufs[1:], var_vals)
rawbufs[0]._buf = self.fxn([x._buf for x in rawbufs], var_vals)
et = time.perf_counter() - st
update_stats(f"<interpreted {ret.size}>", self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit)
assert rawbufs[0].dtype == ret.dtype, f"dtype mismatch in Interpreted, {rawbufs[0].dtype=} != {ret.dtype=}"
rawbufs[0].dtype, rawbufs[0].size, rawbufs[0]._buf, rawbufs[0].offset = ret.dtype, ret.size, ret._buf, ret.offset
update_stats(f"<interpreted {rawbufs[0].size}>", self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit)
return et
class Interpreted:
def __init__(self, buffer: Type[RawBuffer], fxn_for_op:Dict[Op, Callable]):
self.buffer, self.fxn_for_op = buffer, fxn_for_op
def __init__(self, allocator: Allocator, fxn_for_op:Dict[Op, Callable]):
self.allocator, self.fxn_for_op = allocator, fxn_for_op
self.synchronize, self.codegen, self.graph = lambda: None, None, None
@functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
@ -86,7 +133,6 @@ def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> Interpret
from tinygrad.graph import print_tree
print_tree(ast)
tglob: Dict[str, Any] = {"Variable": Variable}
lines: List[str] = []
@functools.lru_cache(None)
def gstr(x:Any, nm=None) -> str:
@ -98,15 +144,16 @@ def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> Interpret
tglob[ret] = x
return ret
lines: List[str] = []
@functools.lru_cache(None)
def _interpret_ast(ast:LazyOp) -> str:
# TODO: shortcutted store won't work with strides
if ast.op == BufferOps.STORE: return _interpret_ast(ast.src[0])
if TernaryOps.MULACC in fxn_for_op and ast.op == ReduceOps.SUM and isinstance(ast.src[0], LazyOp) and ast.src[0].op == BinaryOps.MUL:
ast = LazyOp(TernaryOps.MULACC, ast.src[0].src, ast.arg)
if ast.op is BufferOps.STORE:
tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({_interpret_ast(ast.src[0])})"
elif ast.op in BufferOps:
tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({gstr(ast.arg.val)}, {gstr(ast.arg.dtype)})" if ast.op == BufferOps.CONST else f"{gstr(fxn_for_op[ast.op], ast.op)}(inputs[{ast.arg.idx-1}])"
if ast.op in BufferOps:
tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({gstr(ast.arg.val)}, {gstr(ast.arg.dtype)})" if ast.op == BufferOps.CONST else f"inputs[{ast.arg.idx}]"
for mop,arg in ast.arg.st.to_movement_ops(): tmp = f"{gstr(fxn_for_op[mop], mop)}({tmp}, {gstr(arg)})"
else:
tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({', '.join([_interpret_ast(src) for src in ast.src] + ([gstr(ast.arg)] if ast.arg else []))})"
@ -124,16 +171,18 @@ def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> Interpret
# **************** for Compiled Devices ****************
class CompiledASTRunner(JITRunner):
def __init__(self, ast:Optional[LazyOp], name:str, prg:str, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, runtime_args:Optional[dict]=None):
def __init__(self, ast:Optional[LazyOp], name:str, prg:str, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, runtime_args:Optional[dict]=None, bufcount:int=0):
super().__init__()
if DEBUG >= 4: print(prg)
if global_size is not None: global_size = global_size + [1]*(3-len(global_size))
if local_size is not None: local_size = local_size + [1]*(3-len(local_size))
self.name, self.display_name, self.prg, self.global_size, self.local_size, self.runtime_args = \
to_function_name(name), name, prg, global_size, local_size, runtime_args if runtime_args is not None else {}
self.bufcount = bufcount
self.vars: List[Variable] = []
if ast:
info = get_lazyop_info(ast)
self.bufcount = len(info.mem)
self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
from tinygrad.lazy import vars_from_ast
self.vars = vars_from_ast(ast)
@ -141,7 +190,7 @@ class CompiledASTRunner(JITRunner):
def build(self, compiler, runtime):
self.lib = compiler.__wrapped__(self.prg) if getenv("DISABLE_COMPILER_CACHE") else compiler(self.prg)
self.clprg = runtime(self.name, self.lib)
self.clprg = runtime(self.name, self.lib, self.bufcount, len(self.vars))
return self
def launch_dims(self, var_vals):
@ -149,7 +198,7 @@ class CompiledASTRunner(JITRunner):
local_size = [sym_infer(sz, var_vals) for sz in self.local_size] if self.local_size is not None else self.local_size
return global_size, local_size
def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
global_size, local_size = self.launch_dims(var_vals)
if global_size is not None and local_size is None and all_int(self.global_size): # type: ignore[arg-type]
# TODO: this is copied from get_program
@ -159,13 +208,13 @@ class CompiledASTRunner(JITRunner):
lra = self.runtime_args.copy()
if global_size: lra['global_size'] = global_size
if local_size and 'local_size' not in lra: lra['local_size'] = local_size
et = self.clprg(*rawbufs, *[var_vals[k] for k in self.vars], **lra, wait=wait or DEBUG>=2)
et = self.clprg(*[x._buf for x in rawbufs], *[var_vals[k] for k in self.vars], **lra, wait=wait or DEBUG>=2)
update_stats(self.display_name, self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit, lra=lra)
return et
class Compiled:
def __init__(self, buffer: Type[RawBuffer], linearizer_opts:LinearizerOptions, renderer, compiler, runtime, synchronize=lambda: None, graph=None):
self.buffer, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.synchronize, self.graph = buffer, linearizer_opts, renderer, compiler, runtime, synchronize, graph
def __init__(self, allocator:Allocator, linearizer_opts:LinearizerOptions, renderer, compiler, runtime, graph=None):
self.allocator, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.graph = allocator, linearizer_opts, renderer, compiler, runtime, graph
def to_program(self, k:Linearizer) -> CompiledASTRunner:
k.linearize()
@ -174,6 +223,7 @@ class Compiled:
@functools.lru_cache(None) # pylint: disable=method-cache-max-size-none
def get_runner(self, ast:LazyOp) -> CompiledASTRunner: return self.to_program(_get_optimized_linearizer(self.linearizer_opts, ast))
def synchronize(self): pass
def _get_optimized_linearizer(linearizer_opts:LinearizerOptions, ast:LazyOp) -> Linearizer:
if DEBUG >= 3:
@ -196,4 +246,11 @@ def _get_optimized_linearizer(linearizer_opts:LinearizerOptions, ast:LazyOp) ->
timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
k = timed[0][1]
return k
return k
import ctypes
class _MallocAllocator(LRUAllocator):
def _alloc(self, size:int, dtype:DType): return (ctypes.c_uint8 * (size*dtype.itemsize))()
def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
MallocAllocator = _MallocAllocator()
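
To make the LRU cache above concrete, here is a small sketch; CountingAllocator is a hypothetical name used only for illustration, and copyin/copyout are left out because the sketch only exercises the caching path:

from tinygrad.device import LRUAllocator
from tinygrad.helpers import dtypes

class CountingAllocator(LRUAllocator):  # hypothetical subclass, not part of tinygrad
  def __init__(self):
    super().__init__()                  # sets up the (size, dtype) -> [opaque] cache
    self.allocs = 0
  def _alloc(self, size, dtype):
    self.allocs += 1
    return bytearray(size*dtype.itemsize)  # any opaque object works as a "buffer"

a = CountingAllocator()
buf = a.alloc(16, dtypes.float32)
a.free(buf, 16, dtypes.float32)            # goes into the cache, _free is never called
assert a.alloc(16, dtypes.float32) is buf  # same (size, dtype) is served from the cache
assert a.allocs == 1

Note the failure path as well: if _alloc raises MemoryError, alloc empties the cache with free_cache and retries once.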

View File

@ -1,11 +1,10 @@
from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
import itertools, random, math, time
from tinygrad.lazy import vars_from_ast
from tinygrad.device import Device, Compiled
from tinygrad.device import Device, Compiled, Buffer
from tinygrad.ops import MemBuffer
from tinygrad.helpers import prod, ImageDType, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, all_int, colored, Timing
from tinygrad.codegen.linearizer import Linearizer, UOp
from tinygrad.runtime.lib import RawBuffer
from collections import defaultdict
from tinygrad.tensor import Tensor
@ -23,7 +22,7 @@ actions += [
if getenv("NOLOCALS"): actions += [Opt(op=OptOps.NOLOCALS)]
# returns time in seconds
def time_linearizer(lin:Linearizer, rawbufs:List[RawBuffer], allow_test_size=True, max_global_size=65536, cnt=3, disable_cache=False, clear_l2=False) -> float:
def time_linearizer(lin:Linearizer, rawbufs:List[Buffer], allow_test_size=True, max_global_size=65536, cnt=3, disable_cache=False, clear_l2=False) -> float:
key = {"ast": str(lin.ast), "opts": str(lin.applied_opts), "allow_test_size": allow_test_size, "max_global_size": max_global_size, "clear_l2": clear_l2, "device": Device.DEFAULT}
if not disable_cache and CACHELEVEL >= 2 and (val:=diskcache_get("time_linearizer", key)) is not None: return min(val)
@ -62,7 +61,7 @@ def time_linearizer(lin:Linearizer, rawbufs:List[RawBuffer], allow_test_size=Tru
if clear_l2:
# TODO: this is too small for many L2 caches
with Context(DEBUG=0): Tensor.rand(1024,1024).realize()
tms.append(prg.clprg(*rawbufs, *var_vals.values(), **lra, wait=True)*factor)
tms.append(prg.clprg(*[x._buf for x in rawbufs], *var_vals.values(), **lra, wait=True)*factor)
except Exception:
if DEBUG >= 4:
import traceback
@ -75,14 +74,14 @@ def time_linearizer(lin:Linearizer, rawbufs:List[RawBuffer], allow_test_size=Tru
return min(tms)
# get (scrap) buffers for timing the linearizer
def bufs_from_lin(lin:Linearizer) -> List[RawBuffer]:
def bufs_from_lin(lin:Linearizer) -> List[Buffer]:
bufsts:DefaultDict[int, List[MemBuffer]] = defaultdict(list)
for x in lin.membufs: bufsts[x.idx].append(x)
rawbufs:List[Optional[RawBuffer]] = [None]*len(bufsts)
rawbufs:List[Optional[Buffer]] = [None]*len(bufsts)
for k,lx in bufsts.items():
rawbufs[k] = cast(Compiled, Device[Device.DEFAULT]).buffer(prod(lx[0].dtype.shape) if isinstance(lx[0].dtype, ImageDType) else max(y.st.size() for y in lx), lx[0].dtype)
rawbufs[k] = Buffer(Device.DEFAULT, prod(lx[0].dtype.shape) if isinstance(lx[0].dtype, ImageDType) else max(y.st.size() for y in lx), lx[0].dtype)
assert all(r is not None for r in rawbufs)
return cast(List[RawBuffer], rawbufs)
return cast(List[Buffer], rawbufs)
# get dictionary of all possible actions
def get_linearizer_actions(lin:Linearizer, include_0=True) -> Dict[int, Linearizer]:
@ -148,14 +147,14 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea
if DEBUG >= 3: print(beam[0][0].applied_opts)
return beam[0][0]
def optimize_local_size(clprg:Callable, global_size:List[int], rawbufs:List[RawBuffer]) -> List[int]:
test_rawbuffers = [type(rawbufs[0])(rawbufs[0].size, rawbufs[0].dtype), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs
def optimize_local_size(clprg:Callable, global_size:List[int], rawbufs:List[Buffer]) -> List[int]:
test_rawbuffers = [Buffer(rawbufs[0].device, rawbufs[0].size, rawbufs[0].dtype), *rawbufs[1:]] if rawbufs[0] in rawbufs[1:] else rawbufs
MAX_WORKGROUP = clprg.max_work_group_size() if hasattr(clprg, 'max_work_group_size') else 1024
local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in global_size]
local_sizes = [list(x) for x in itertools.product(*local_dims) if prod(x) <= MAX_WORKGROUP] * 2 # try each valid size twice
def try_exec(local_size):
try:
return clprg(*test_rawbuffers, global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)], local_size=local_size, wait=True)
return clprg(*[x._buf for x in test_rawbuffers], global_size=[g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)], local_size=local_size, wait=True)
except Exception:
return float('inf')
ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])

View File

@ -1,5 +1,5 @@
from __future__ import annotations
import os, functools, platform, time, re, contextlib, operator, hashlib, pickle, sqlite3, cProfile, pstats, tempfile, pathlib, string
import os, functools, platform, time, re, contextlib, operator, hashlib, pickle, sqlite3, cProfile, pstats, tempfile, pathlib, string, ctypes
import numpy as np
from urllib import request
from tqdm import tqdm
@ -40,6 +40,10 @@ def partition(lst:List[T], fxn:Callable[[T],bool]):
def unwrap(x:Optional[T]) -> T:
assert x is not None
return x
def unwrap2(x):
ret, err = x
assert err is None, str(err)
return ret
def get_child(obj, key):
for k in key.split('.'):
if k.isnumeric(): obj = obj[int(k)]
@ -52,6 +56,7 @@ def to_function_name(s:str): return ''.join([c if c in (string.ascii_letters+str
@functools.lru_cache(maxsize=None)
def getenv(key:str, default=0): return type(default)(os.getenv(key, default))
def temp(x:str) -> str: return (pathlib.Path(tempfile.gettempdir()) / x).as_posix()
def from_mv(mv, to_type=ctypes.c_char): return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type))
class Context(contextlib.ContextDecorator):
stack: ClassVar[List[dict[str, int]]] = [{}]
@ -251,3 +256,15 @@ def fetch(url:str, name:Optional[Union[pathlib.Path, str]]=None, allow_caching=n
if (file_size:=os.stat(f.name).st_size) < total_length: raise RuntimeError(f"fetch size incomplete, {file_size} < {total_length}")
pathlib.Path(f.name).rename(fp)
return fp
# *** pretty PTX printer
def pretty_ptx(s):
# all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers
s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers
s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
return s
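# from_mv above turns a writable memoryview into a ctypes pointer so raw host memory can be handed
# to C APIs (the HIP allocator below uses it for hipMemcpy). a quick standalone check of what it does:
import ctypes
def from_mv(mv, to_type=ctypes.c_char): return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type))
mv = memoryview(bytearray(b"hello"))
ptr = from_mv(mv)
assert ptr[0] == b"h"         # the pointer aliases the same bytes
ptr[0] = b"J"
assert bytes(mv) == b"Jello"  # writes through the pointer are visible in the memoryview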

View File

@ -2,8 +2,7 @@ from __future__ import annotations
from typing import Callable, List, Tuple, Dict, cast, Union, Optional, TypeVar, Generic
import functools, itertools, operator
from tinygrad.helpers import DEBUG, DType, merge_dicts, getenv, all_int
from tinygrad.device import Device, JITRunner, CompiledASTRunner
from tinygrad.runtime.lib import RawBuffer
from tinygrad.device import Device, JITRunner, CompiledASTRunner, Buffer
from tinygrad.tensor import Tensor
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.symbolic import Variable, NumNode, Node
@ -13,11 +12,11 @@ from dataclasses import dataclass
@dataclass(frozen=True)
class JitItem:
prg: JITRunner # or a graph executor like MetalGraph
rawbufs: List[Optional[RawBuffer]]
rawbufs: List[Optional[Buffer]]
def get_jit_stats(jit_cache: List[JitItem]) -> Tuple[Node, Node]:
return functools.reduce(operator.__add__, [ji.prg.op_estimate for ji in jit_cache], NumNode(0)), functools.reduce(operator.__add__, [ji.prg.mem_estimate for ji in jit_cache], NumNode(0))
def get_input_replace(jit_cache: List[JitItem], input_rawbuffers:List[RawBuffer]) -> Dict[Tuple[int, int], int]:
def get_input_replace(jit_cache: List[JitItem], input_rawbuffers:List[Buffer]) -> Dict[Tuple[int, int], int]:
input_replace: Dict[Tuple[int, int], int] = {}
for j,ji in enumerate(jit_cache):
for i,a in enumerate(ji.rawbufs):
@ -55,7 +54,7 @@ class TinyJit(Generic[ReturnType]):
expected_name_sts_dtype = tuple([(k, v.lazydata.st.unbind(), v.dtype) for k,v in input_tensors.items()])
# get rawbuffers
input_rawbuffers: List[RawBuffer] = [cast(RawBuffer, v.lazydata.realized) for v in input_tensors.values()]
input_rawbuffers: List[Buffer] = [cast(Buffer, v.lazydata.realized) for v in input_tensors.values()]
assert len(set(input_rawbuffers)) == len(input_rawbuffers), "duplicate inputs to JIT"
# get variables: they can either be in Tensors or passed in as arguments, and all must be bound. these are all global
@ -67,7 +66,7 @@ class TinyJit(Generic[ReturnType]):
assert self.expected_vals == expected_vals, "mismatch of var_vals"
assert self.expected_name_sts_dtype == expected_name_sts_dtype, f"mismatch of sts, expected {self.expected_name_sts_dtype} got {expected_name_sts_dtype}"
for (j,i),input_idx in self.input_replace.items(): self.jit_cache[j].rawbufs[i] = input_rawbuffers[input_idx]
for ji in self.jit_cache: ji.prg(cast(List[RawBuffer], ji.rawbufs), var_vals, wait=DEBUG>=2, jit=True)
for ji in self.jit_cache: ji.prg(cast(List[Buffer], ji.rawbufs), var_vals, wait=DEBUG>=2, jit=True)
elif self.cnt == 1:
# jit capture
self.expected_vals, self.expected_name_sts_dtype = expected_vals, expected_name_sts_dtype
@ -80,7 +79,7 @@ class TinyJit(Generic[ReturnType]):
# if your Device supports it, condense the items into a graph executor
if (make_graph := Device[Device.DEFAULT].graph) and getenv("JIT") != 2:
try:
self.jit_cache = [JitItem(make_graph(self.jit_cache, input_rawbuffers, var_vals), cast(List[Optional[RawBuffer]], input_rawbuffers))]
self.jit_cache = [JitItem(make_graph(self.jit_cache, input_rawbuffers, var_vals), cast(List[Optional[Buffer]], input_rawbuffers))]
except GraphException as e:
if DEBUG >= 1: print(f"graph create failed {e}")
@ -96,34 +95,34 @@ class TinyJit(Generic[ReturnType]):
return cast(ReturnType, self.ret)
class PlaceHolder:
def __init__(self, buf:RawBuffer): self.size, self.dtype, self._device, self.ref, self.buftype, self.bufid = buf.size, buf.dtype, getattr(buf, '_device', None), ref(buf), type(buf), id(buf._buf)
def to_tuple(self): return (self.size, self.dtype, self._device, self.buftype, self.bufid)
def __init__(self, buf:Buffer): self.size, self.dtype, self.device, self.ref, self.bufid = buf.size, buf.dtype, buf.device, ref(buf), id(buf._buf)
def to_tuple(self): return (self.size, self.dtype, self.device, self.bufid)
def __hash__(self): return hash(self.to_tuple())
def __eq__(self, x): return isinstance(x, PlaceHolder) and self.to_tuple() == x.to_tuple()
def alloc_if_needed(self, buffer_cache: Dict[PlaceHolder, RawBuffer]) -> RawBuffer:
def alloc_if_needed(self, buffer_cache: Dict[PlaceHolder, Buffer]) -> Buffer:
ret = self.ref()
if ret: return ret
if self not in buffer_cache: buffer_cache[self] = self.buftype(self.size, self.dtype, **({'device':self._device} if self._device is not None else dict()))
if self not in buffer_cache: buffer_cache[self] = Buffer(self.device, self.size, self.dtype)
return buffer_cache[self]
class _CacheCollector:
def __init__(self):
self.cache: Optional[List[Tuple[JITRunner, List[Union[RawBuffer, PlaceHolder]]]]] = None
self.cache: Optional[List[Tuple[JITRunner, List[Union[Buffer, PlaceHolder]]]]] = None
def start(self, var_vals:Optional[Dict[Variable, int]]=None):
self.cache = []
self.placeholders: WeakKeyDictionary[RawBuffer, PlaceHolder] = WeakKeyDictionary()
self.placeholders: WeakKeyDictionary[Buffer, PlaceHolder] = WeakKeyDictionary()
self.var_vals = var_vals if var_vals is not None else {}
def add(self, prg, rawbufs, var_vals):
if self.cache is None: return
for k,v in var_vals.items(): assert k in self.var_vals and self.var_vals[k] == v, f"var_vals {k} mismatch {v} != {self.var_vals.get(k)}"
self.placeholders[rawbufs[0]] = PlaceHolder(rawbufs[0]) # NOTE: this is making an assumption that 0 is special
self.cache.append((prg, [self.placeholders.get(x, x) if isinstance(x, RawBuffer) else x for x in rawbufs]))
self.cache.append((prg, [self.placeholders.get(x, x) if isinstance(x, Buffer) else x for x in rawbufs]))
def finish(self) -> List[JitItem]:
if self.cache is None: return []
buffer_cache: Dict[PlaceHolder, RawBuffer] = {}
buffer_cache: Dict[PlaceHolder, Buffer] = {}
saved_cache, self.cache = self.cache, None
return [JitItem(prg, [x.alloc_if_needed(buffer_cache) if isinstance(x, PlaceHolder) else x for x in pl]) for prg, pl in saved_cache]
CacheCollector = _CacheCollector()
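# PlaceHolder keeps only a weakref to each buffer captured during tracing and allocates a
# replacement at finish() time only if the original has been garbage collected. a toy version of
# that idea (ToyBuf and ToyPlaceHolder are illustrative names, not tinygrad classes):
import weakref
class ToyBuf:
  def __init__(self, size): self.size, self.data = size, bytearray(size)
class ToyPlaceHolder:
  def __init__(self, buf): self.size, self.ref, self.bufid = buf.size, weakref.ref(buf), id(buf)
  def alloc_if_needed(self, cache):
    if (live := self.ref()) is not None: return live        # original buffer still alive: reuse it
    if self.bufid not in cache: cache[self.bufid] = ToyBuf(self.size)
    return cache[self.bufid]
buf = ToyBuf(16)
ph = ToyPlaceHolder(buf)
assert ph.alloc_if_needed({}) is buf                        # no fresh allocation while buf lives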

View File

@ -8,9 +8,7 @@ from tinygrad.helpers import prod, getenv, DType, dtypes, flatten, dedup, merge_
from tinygrad.ops import ScheduleItem, UnaryOps, BinaryOps, TernaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, MemBuffer, ConstBuffer, BufferOps, get_lazyop_info
from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
from tinygrad.shape.symbolic import Variable, sint
from tinygrad.runtime.lib import RawBuffer
from tinygrad.runtime.ops_cpu import RawNumpyBuffer
from tinygrad.device import Buffer
# lazy can recurse a lot
sys.setrecursionlimit(10000)
@ -100,11 +98,11 @@ UNSAFE_PAD_OPS = {BinaryOps.DIV, BinaryOps.CMPLT, UnaryOps.LOG2, UnaryOps.EXP2,
class LazyBuffer:
__deletable__ = ('op',)
def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:Optional[LazyOp], dtype:DType, src:Optional[RawBuffer]=None, base:Optional[LazyBuffer]=None):
def __init__(self, device:str, st:ShapeTracker, optype:OpType, op:Optional[LazyOp], dtype:DType, src:Optional[Buffer]=None, base:Optional[LazyBuffer]=None):
self.st: ShapeTracker = st
self.device, self.shape, self.optype, self._dtype = device, self.st.shape, optype, dtype
self._realized: Optional[RawBuffer] = src
self.output_buffer: Optional[RawBuffer] = None # TODO: do we really need this? or can we just use realized
self._realized: Optional[Buffer] = src
self.output_buffer: Optional[Buffer] = None # TODO: do we really need this? or can we just use realized
# TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
self.children: WeakSet[LazyBuffer] = WeakSet()
self.views: WeakSet[LazyBuffer] = WeakSet()
@ -211,7 +209,7 @@ class LazyBuffer:
@staticmethod
def fromCPU(x: np.ndarray) -> LazyBuffer:
return LazyBuffer("CPU", ShapeTracker.from_shape(x.shape), LoadOps, None, dtypes.from_np(x.dtype), RawNumpyBuffer.fromCPU(x))
return LazyBuffer("CPU", ShapeTracker.from_shape(x.shape), LoadOps, None, dtypes.from_np(x.dtype), Buffer("CPU", prod(x.shape), dtypes.from_np(x.dtype), x.flatten()))
def cast(self, dtype:DType, bitcast:bool=False):
return self.e(UnaryOps.CAST, arg=(dtype, bitcast))
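# fromCPU above now just hands the flattened numpy array to a Buffer as its opaque backing store.
# a toy stand-in for that CPU path using only numpy (ToyBuffer is an illustrative name, not tinygrad's Buffer):
import numpy as np
class ToyBuffer:
  def __init__(self, device, size, dtype, opaque=None):
    self.device, self.size, self.dtype = device, size, dtype
    self._buf = opaque if opaque is not None else np.empty(size, dtype)
  def toCPU(self): return self._buf
x = np.arange(6, dtype=np.float32).reshape(2, 3)
buf = ToyBuffer("CPU", x.size, x.dtype, x.flatten())
assert buf.toCPU()[4] == 4.0   # the realized buffer is just flat storage plus metadata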

View File

@ -96,7 +96,7 @@ class FlopCounter:
InterpretedFlopCounter: Dict[Op, Callable] = {
BufferOps.LOAD: lambda arg: FlopCounter(arg.st.shape, arg.dtype, 0, {arg.idx: arg.dtype.itemsize*arg.st.size()}), BufferOps.CONST: lambda arg: FlopCounter(arg.st.shape, arg.dtype, 0, {}),
BufferOps.STORE: lambda self,arg: FlopCounter(arg.st.shape, arg.dtype, self.consume_flops(), {arg.idx: arg.dtype.itemsize*arg.st.size()}), UnaryOps.CAST: lambda self,arg: FlopCounter(self.shape, arg[0], self.consume_flops(), self.mem), # cast uses no flops
BufferOps.STORE: lambda self,arg: FlopCounter(arg.st.shape, arg.dtype, self.consume_flops(), {**self.mem, arg.idx: arg.dtype.itemsize*arg.st.size()}), UnaryOps.CAST: lambda self,arg: FlopCounter(self.shape, arg[0], self.consume_flops(), self.mem), # cast uses no flops
**{op:lambda self: FlopCounter(self.shape, self.dtype, self.consume_flops() + prod(self.shape), self.mem) for op in UnaryOps if op != UnaryOps.CAST},
**{op:lambda self,y: FlopCounter(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape), {**self.mem, **y.mem}) for op in BinaryOps},
**{op:lambda self,new_shape: FlopCounter(new_shape, self.dtype, self.consume_flops() + prod(self.shape), self.mem) for op in ReduceOps},

View File

@ -1,10 +1,9 @@
from typing import List, cast, Dict, Callable
import numpy as np
from tinygrad.ops import ScheduleItem, LazyOp, LoadOps, BufferOps
from tinygrad.device import Device
from tinygrad.device import Device, Buffer
from tinygrad.graph import log_schedule_item, print_tree
from tinygrad.lazy import LazyBuffer
from tinygrad.helpers import DEBUG, prod, all_int, getenv
from tinygrad.helpers import DEBUG, prod
def run_schedule(schedule:List[ScheduleItem], disable_logging=False):
# NOTE: if you for loop the schedule it's slow because nothing frees
@ -27,53 +26,55 @@ def run_schedule(schedule:List[ScheduleItem], disable_logging=False):
break
# we don't have an output buffer, we have to create it, and create to max size if it has symbolic shape
si.out.realized = si.out.output_buffer if si.out.output_buffer is not None else \
Device[si.out.device].buffer(prod((s if isinstance(s, int) else s.max for s in si.out.shape)), si.out.dtype, **si.out._device_extra_args())
if si.ast.op in LoadOps:
# confirm the LoadOps are contiguous and in order
for i,s in enumerate(si.ast.src): assert isinstance(s, LazyOp) and s.op == BufferOps.LOAD and s.arg.idx == i+1 and s.arg.st.contiguous, f"bad LoadOps src {i}: {s}"
LOAD_OPS_DISPATCHER[cast(LoadOps, si.ast.op)](si.out, *si.inputs)
else:
# TODO: should this be handled here? it probably just shouldn't be in the schedule
if not hasattr(si.out.realized, 'size') or si.out.realized.size != 0:
Buffer(si.out.device, prod((s if isinstance(s, int) else s.max for s in si.out.shape)), si.out.dtype)
#Device[si.out.device].buffer(prod((s if isinstance(s, int) else s.max for s in si.out.shape)), si.out.dtype, **si.out._device_extra_args())
# TODO: size 0 should be removed from the schedule
if si.out.realized.size != 0:
if si.ast.op in LoadOps:
# confirm the LoadOps are contiguous and in order
for i,s in enumerate(si.ast.src): assert isinstance(s, LazyOp) and s.op == BufferOps.LOAD and s.arg.idx == i+1 and s.arg.st.contiguous, f"bad LoadOps src {i}: {s}"
kwargs = {"arg": si.ast.arg} if si.ast.arg is not None else {}
LOAD_OPS_DISPATCHER[cast(LoadOps, si.ast.op)](si.out.realized, *[x.realized for x in si.inputs], **kwargs)
else:
Device[si.out.device].get_runner(si.ast).exec([si.out.realized] + [x.realized for x in si.inputs], si.var_vals)
del si.out.op
for v in si.out.views: del v.op
assert si.out.realized and isinstance(si.out.realized, Device[si.out.device].buffer), f"device mismatch on realized got {type(si.out.realized)} expected {si.out.device}"
#assert si.out.realized and isinstance(si.out.realized, Device[si.out.device].buffer), f"device mismatch on realized got {type(si.out.realized)} expected {si.out.device}"
assert si.out.realized.dtype == si.out.dtype, f"realized dtype is incorrect, {si.out.realized.dtype} != {si.out.dtype}"
# *** zero op LoadOps ***
def _realize_empty(buffer: LazyBuffer) -> None:
if DEBUG >= 2: print(f"*** empty {buffer.device} shape {str(buffer.shape):23s} dtype {buffer.dtype}")
def _realize_empty(buffer: Buffer) -> None:
if DEBUG >= 2: print(f"*** empty {buffer.device} shape {buffer.size:5d} dtype {buffer.dtype}")
# TODO: remove this and write the RNG in tinygrad
def _realize_rand(buffer: LazyBuffer) -> None:
assert all_int(buffer.shape), "rand doesn't support symbolic shape"
if DEBUG >= 2: print(f"*** rand {buffer.device} seed {buffer.op.arg:<10d} shape {str(buffer.shape):23s} dtype {buffer.dtype}")
rng = np.random.default_rng(buffer.op.arg)
buffer.realized._copyin(rng.random(size=prod(buffer.shape), dtype=np.float32).astype(dtype=buffer.dtype.np, copy=False), **buffer._device_extra_args())
def _realize_rand(buffer: Buffer, arg) -> None:
if DEBUG >= 2: print(f"*** rand {buffer.device} seed {arg:<10d} shape {buffer.size:5d} dtype {buffer.dtype}")
rng = np.random.default_rng(arg)
rng_np_buffer = rng.random(size=buffer.size, dtype=np.float32).astype(dtype=buffer.dtype.np, copy=False)
buffer.copyin(rng_np_buffer.data)
# *** one op LoadOps ***
from tinygrad.runtime.lib import RawBufferMapped, RawBufferTransfer
from tinygrad.runtime.ops_disk import RawDiskBuffer
def _realize_from(buffer: LazyBuffer, src: LazyBuffer) -> None:
assert src.realized.size == buffer.realized.size, f"size mismatch on FROM {src.realized.size=} != {buffer.realized.size=}"
assert src.st.contiguous and buffer.st.contiguous, "all must be contiguous for from"
if DEBUG >= 2: print(f"*** copy {buffer.device} <- {src.device} size {src.realized.size:<16d} shape {str(buffer.shape):23s} dtype {src.realized.dtype}")
#from tinygrad.runtime.lib import RawBufferMapped, RawBufferTransfer
#from tinygrad.runtime.ops_disk import RawDiskBuffer
def _realize_from(buffer: Buffer, src: Buffer) -> None:
assert src.size == buffer.size, f"size mismatch on FROM {src.size=} != {buffer.size=}"
if DEBUG >= 2: print(f"*** copy {buffer.device} <- {src.device} size {src.size:<16d} shape {buffer.size:5d} dtype {src.dtype}")
buffer.copyin(src.toCPU().data)
# TODO: make this generic
if isinstance(src.realized, RawDiskBuffer) and isinstance(buffer.realized, RawBufferMapped):
src.realized.readinto(buffer.realized._buffer())
elif isinstance(src.realized, RawBufferTransfer) and isinstance(buffer.realized, RawBufferTransfer) and getenv("P2P", 0) >= 1:
buffer.realized._transfer(src.realized)
else:
buffer.realized._copyin(src.realized.toCPU())
#if isinstance(src.realized, RawDiskBuffer) and isinstance(buffer.realized, RawBufferMapped):
# src.realized.readinto(buffer.realized._buffer())
#elif isinstance(src.realized, RawBufferTransfer) and isinstance(buffer.realized, RawBufferTransfer) and getenv("P2P", 0) >= 1:
# buffer.realized._transfer(src.realized)
#else:
#buffer.realized._copyin(src.realized.toCPU())
# *** n op LoadOps ***
def _realize_custom(buffer: LazyBuffer, *inputs: LazyBuffer) -> None:
if DEBUG >= 2: print(f"*** custom {buffer.device} shape {str(buffer.shape):23s} dtype {buffer.dtype}")
buffer.op.arg(buffer, *inputs)
def _realize_custom(buffer: Buffer, *inputs: Buffer, arg) -> None:
if DEBUG >= 2: print(f"*** custom {buffer.device} shape {buffer.size:5d} dtype {buffer.dtype}")
arg(buffer, *inputs)
LOAD_OPS_DISPATCHER: Dict[LoadOps, Callable] = {
LoadOps.EMPTY: _realize_empty,

View File

@ -0,0 +1,78 @@
from typing import List, Any, Dict, cast, Optional
import numpy as np
import Metal
from tinygrad.helpers import dtypes, dedup, unwrap2
from tinygrad.device import Buffer, CompiledASTRunner, update_stats
from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, GraphException
from tinygrad.shape.symbolic import Variable
from tinygrad.runtime.ops_metal import MetalDevice
class MetalGraph:
def __init__(self, device:MetalDevice, jit_cache: List[JitItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
self.jit_cache = jit_cache
self.input_replace = get_input_replace(jit_cache, input_rawbuffers)
self.op_estimate, self.mem_estimate = get_jit_stats(jit_cache)
self.jc_idx_with_updatable_launch_dims = get_jc_idxs_with_updatable_launch_dims(jit_cache)
self.device: MetalDevice = device
# create metal batch exec
icb_descriptor = Metal.MTLIndirectCommandBufferDescriptor.new()
icb_descriptor.setCommandTypes_(Metal.MTLIndirectCommandType(Metal.MTLIndirectCommandTypeConcurrentDispatch))
icb_descriptor.setInheritBuffers_(False)
icb_descriptor.setInheritPipelineState_(False)
icb_descriptor.setMaxKernelBufferBindCount_(31)
self.icb = self.device.device.newIndirectCommandBufferWithDescriptor_maxCommandCount_options_(icb_descriptor, len(self.jit_cache), Metal.MTLResourceOptions(0))
if self.icb is None: raise GraphException("create indirect command buffer failed, does your system support this?")
if len(var_vals): self.int_buf = self.device.allocator.alloc(len(var_vals), dtypes.int32)
read_resources, write_resources = [], []
for j,ji in enumerate(self.jit_cache):
prg: CompiledASTRunner = cast(CompiledASTRunner, ji.prg)
descriptor = Metal.MTLComputePipelineDescriptor.new()
descriptor.setComputeFunction_(prg.clprg.fxn)
descriptor.setSupportIndirectCommandBuffers_(True)
pipeline_state = unwrap2(self.device.device.newComputePipelineStateWithDescriptor_options_reflection_error_(descriptor, Metal.MTLPipelineOption(0), None, None))
icb_command = self.icb.indirectComputeCommandAtIndex_(j)
icb_command.setComputePipelineState_(pipeline_state)
for i,b in enumerate(ji.rawbufs):
if b is not None:
icb_command.setKernelBuffer_offset_atIndex_(b._buf, 0, i)
if i == 0: write_resources.append(b._buf)
else: read_resources.append(b._buf)
var_vals_keys = list(var_vals.keys())
for i,v in enumerate(prg.vars):
icb_command.setKernelBuffer_offset_atIndex_(self.int_buf, var_vals_keys.index(v)*4, len(ji.rawbufs)+i)
if j not in self.jc_idx_with_updatable_launch_dims:
global_size, local_size = prg.launch_dims(var_vals)
icb_command.concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
icb_command.setBarrier()
self.read_resources, self.write_resources = dedup(read_resources), dedup(write_resources)
self.command_buffer: Any = None
if len(var_vals): self.int_buf_view = np.frombuffer(self.int_buf.contents().as_buffer(self.int_buf.length()), np.int32)
def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
# NOTE: you at least can't update the ints if this is running
if self.command_buffer is not None and self.command_buffer in self.device.mtl_buffers_in_flight: self.command_buffer.waitUntilCompleted()
all_read_resources = self.read_resources + [x._buf for x in input_rawbuffers]
for (j,i),input_idx in self.input_replace.items():
self.icb.indirectComputeCommandAtIndex_(j).setKernelBuffer_offset_atIndex_(input_rawbuffers[input_idx]._buf, 0, i)
for j in self.jc_idx_with_updatable_launch_dims:
global_size, local_size = cast(CompiledASTRunner, self.jit_cache[j].prg).launch_dims(var_vals)
self.icb.indirectComputeCommandAtIndex_(j).concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
if len(var_vals): self.int_buf_view[:] = list(var_vals.values())
command_buffer = self.device.mtl_queue.commandBuffer()
encoder = command_buffer.computeCommandEncoder()
encoder.executeCommandsInBuffer_withRange_(self.icb, Metal.MTLIndirectCommandBufferExecutionRangeMake(0,len(self.jit_cache)))
encoder.useResources_count_usage_(all_read_resources, len(all_read_resources), Metal.MTLResourceUsageRead)
encoder.useResources_count_usage_(self.write_resources, len(self.write_resources), Metal.MTLResourceUsageWrite)
encoder.endEncoding()
command_buffer.commit()
self.command_buffer = command_buffer
if wait:
command_buffer.waitUntilCompleted()
et = command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
else:
self.device.mtl_buffers_in_flight.append(command_buffer)
et = None
update_stats(f"<batched {len(self.jit_cache)}>", self.op_estimate, self.mem_estimate, var_vals, et, buf_count=len(input_rawbuffers), jit=jit, num_kernels=len(self.jit_cache))
return et

View File

@ -1,105 +0,0 @@
from __future__ import annotations
import ctypes
import numpy as np
from collections import defaultdict, deque
from typing import Any, Dict, Deque, Tuple
from tinygrad.helpers import DType, dtypes, prod, GlobalCounters, ImageDType
class RawBuffer: # pylint: disable=abstract-method
def __init__(self, size:int, dtype:DType, buf:Any=None, allocator:Any=None, **kwargs):
self.size: int = size
self.dtype: DType = dtype
self.offset: int = 0 # TODO: this is very unsupported, only in disk
self._buf = buf if buf is not None else (allocator(size, dtype, **kwargs) if allocator else None) # If buf is provided, use it. Otherwise try to allocate from the allocator.
self._memsz: int = size*dtype.itemsize
self._allocator = allocator
self._device = kwargs.get('device', None)
GlobalCounters.mem_used += self._memsz
def __del__(self): # NOTE: if it fails on init (bad dtype), it won't have a _memsz
if hasattr(self, '_memsz'): GlobalCounters.mem_used -= self._memsz
if hasattr(self, '_allocator') and self._allocator: self._allocator.free(self._buf)
def __repr__(self): return f"buffer<{self.size}, {self.dtype}, {id(self)}>"
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs):
ret = cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
if x.size > 0: ret._copyin(x)
return ret
def _copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")
def toCPU(self) -> np.ndarray: raise NotImplementedError("must be implemented")
class RawBufferMapped(RawBuffer):
def _buffer(self) -> memoryview: raise NotImplementedError("must be implemented")
# NOTE: this metadata prevents the backing buffer from being freed. hack can be removed with PEP688
def toCPU(self) -> np.ndarray: return np.frombuffer(self._buffer(), dtype=np.dtype(self.dtype.np, metadata={"backing": self}), count=self.size)
def _copyin(self, x:np.ndarray) -> None: np.copyto(self.toCPU(), x.reshape(-1))
# this one is simple enough that i moved it out of the runtimes
ctypes_map = {dtypes.float64:ctypes.c_double, dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16, dtypes.bfloat16: ctypes.c_int16, dtypes.int8: ctypes.c_int8, dtypes.uint8: ctypes.c_uint8, dtypes.bool: ctypes.c_uint8, dtypes.int32: ctypes.c_int32, dtypes.uint32: ctypes.c_uint32, dtypes.int64: ctypes.c_int64, dtypes.uint64: ctypes.c_uint64, dtypes.int16: ctypes.c_int16, dtypes.uint16: ctypes.c_uint16}
class RawMallocBuffer(RawBufferMapped):
def __init__(self, size, dtype: DType): super().__init__(size, dtype, (ctypes_map[dtype] * size)())
def _buffer(self): return memoryview(self._buf)
class RawBufferCopyInOut(RawBuffer):
def _copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")
def toCPU(self) -> np.ndarray:
x: np.ndarray = np.empty(self.size, dtype=self.dtype.np)
if x.size > 0: self._copyout(x)
return x
class RawBufferTransfer(RawBuffer):
def _transfer(self, x:RawBuffer) -> None: raise NotImplementedError("must be implemented")
class LRUAllocator:
def __init__(self, dev_memsz=(4<<30)):
self.epoch = 0
self.free_space: Dict[Any, int] = defaultdict(lambda: dev_memsz)
self.buffer_info: Dict[Any, Tuple[int, DType, str]] = dict()
self.cached_buffers: Dict[Tuple[int, ...], Deque[Tuple[Any, int]]] = defaultdict(deque) # Cached buffer storage, split by type and size, newest first.
self.aging_order: Dict[Any, Deque[Tuple[Tuple[int, ...], int]]] = defaultdict(deque) # Keys of cached_buffers, ordered from oldest to newest updates.
def _cache_reuse_buffer(self, rawbufs: Deque[Tuple[Any, int]]): # The newest cached buffer is reused.
GlobalCounters.mem_cached -= self._underlying_buf_memsz(rawbufs[0][0])
return rawbufs.popleft()[0]
def ensure_has_free_space(self, space_to_free, device):
while len(self.aging_order[device]) and self._get_cur_free_space(device) < space_to_free: # When OOM, remove LRU buffers.
bucket, epoch = self.aging_order[device].popleft()
if self.cached_buffers[bucket] and self.cached_buffers[bucket][-1][1] == epoch: self._free_buffer(self.cached_buffers[bucket].pop()[0]) # Free cached buffer if it is still in cache.
assert (curr_free := self._get_cur_free_space(device)) > space_to_free, f"out of memory - requested: {space_to_free/1e9:5.2f} GB, available: {curr_free/1e9:5.2f} GB"
def _alloc_buffer(self, size, dtype, device, **kwargs):
self.ensure_has_free_space(size*dtype.itemsize, device)
while True:
try:
newbuf = self._do_alloc(max(1, size), dtype, device, **kwargs)
break
except Exception:
if len(self.aging_order[device]) == 0: raise
self.ensure_has_free_space(1.1*self._get_cur_free_space(device), device) # increase free space by 10% and try again.
self.free_space[device] -= size*dtype.itemsize
self.buffer_info[newbuf] = (size, dtype, device)
return newbuf
def _free_buffer(self, buf_to_free):
self.free_space[self.buffer_info[buf_to_free][2]] += self._underlying_buf_memsz(buf_to_free)
GlobalCounters.mem_cached -= self._underlying_buf_memsz(buf_to_free)
self.buffer_info.pop(buf_to_free)
self._do_free(buf_to_free)
def __call__(self, size, dtype, device='0', **kwargs): # allocate
rawbufs = self.cached_buffers.get(self._cached_bufkey(size, dtype, device), None)
return self._cache_reuse_buffer(rawbufs) if rawbufs else self._alloc_buffer(size, dtype, device, **kwargs)
def free(self, buf): # free() just caches buffer. It might be freed later when OOM during allocation.
self.epoch += 1
size, dtype, device = self.buffer_info[buf]
self.cached_buffers[self._cached_bufkey(size, dtype, device)].appendleft((buf, self.epoch))
self.aging_order[device].append((self._cached_bufkey(size, dtype, device), self.epoch))
GlobalCounters.mem_cached += self._underlying_buf_memsz(buf)
def _underlying_buf_memsz(self, buf): return self.buffer_info[buf][0] * self.buffer_info[buf][1].itemsize
def _cached_bufkey(self, size, dtype, device) -> Tuple[int, ...]: return (device, size, dtype, dtype.shape) if isinstance(dtype, ImageDType) else (device, size, dtype) # Provides a key for reusing device buffers with identical keys.
def _do_alloc(self, size, dtype, device, **kwargs): raise NotImplementedError("must be implemented")
def _do_free(self, buf): pass
def _get_cur_free_space(self, device): return self.free_space[device]
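# the allocator above (like its replacement in tinygrad.device) never frees eagerly: free() parks
# the buffer in a per-size bucket and the next same-sized allocation reuses it. a toy sketch of
# that free-as-cache idea (ToyLRU is illustrative, not the real LRUAllocator API):
from collections import defaultdict, deque
class ToyLRU:
  def __init__(self): self.cache = defaultdict(deque)
  def alloc(self, size):
    bucket = self.cache[size]
    return bucket.popleft() if bucket else bytearray(size)   # newest cached buffer first
  def free(self, buf): self.cache[len(buf)].appendleft(buf)  # cache instead of really freeing
lru = ToyLRU()
a = lru.alloc(256); lru.free(a)
assert lru.alloc(256) is a   # the same backing buffer comes back from the cache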

View File

@ -1,8 +1,7 @@
import time, ctypes, subprocess, platform, functools, pathlib, tempfile
from typing import Any
from tinygrad.device import Compiled
from tinygrad.device import Compiled, MallocAllocator
from tinygrad.helpers import diskcache
from tinygrad.runtime.lib import RawMallocBuffer
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle, CStyleLanguage
@ -21,7 +20,7 @@ def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
return pathlib.Path(output_file.name).read_bytes()
class ClangProgram:
def __init__(self, name:str, prg:bytes):
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0):
# write to disk so we can load it
with tempfile.NamedTemporaryFile(delete=True) as cached_file_path:
pathlib.Path(cached_file_path.name).write_bytes(prg)
@ -29,8 +28,8 @@ class ClangProgram:
def __call__(self, *args, wait=False):
if wait: st = time.perf_counter()
self.fxn(*[x._buf if isinstance(x, RawMallocBuffer) else x for x in args])
self.fxn(*args)
if wait: return time.perf_counter()-st
renderer = functools.partial(uops_to_cstyle, CStyleLanguage(buffer_suffix=" restrict", arg_int_prefix="const int"))
ClangDevice = Compiled(RawMallocBuffer, LinearizerOptions(supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram)
ClangDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False), renderer, compile_clang, ClangProgram)
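# ClangProgram now receives plain ctypes-compatible args (MallocAllocator hands out ctypes arrays),
# so __call__ can just do self.fxn(*args). a minimal end-to-end sketch of that flow, assuming a
# working clang on PATH and generic -shared/-fPIC flags (not necessarily compile_clang's exact flags):
import ctypes, subprocess, tempfile, pathlib
src = "void add1(float* restrict out, const float* restrict in) { for (int i = 0; i < 4; i++) out[i] = in[i] + 1.0f; }"
lib_path = pathlib.Path(tempfile.gettempdir()) / "toy_add1.so"
subprocess.check_output(["clang", "-shared", "-O2", "-fPIC", "-x", "c", "-", "-o", str(lib_path)], input=src.encode())
fxn = ctypes.CDLL(str(lib_path))["add1"]
out, inp = (ctypes.c_float * 4)(), (ctypes.c_float * 4)(1, 2, 3, 4)   # like MallocAllocator buffers
fxn(out, inp)
assert list(out) == [2.0, 3.0, 4.0, 5.0]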

View File

@ -1,14 +1,8 @@
import numpy as np
from typing import Callable, Dict, Tuple, Optional
from typing import Callable, Dict, Tuple
from tinygrad.helpers import dtypes, DType
from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, MovementOps, ReduceOps, TernaryOps, Op
from tinygrad.device import Interpreted
from tinygrad.runtime.lib import RawBuffer
class RawNumpyBuffer(RawBuffer):
def __init__(self, size:int, dtype:DType, buf:Optional[np.ndarray]=None): super().__init__(size, dtype, buf)
def _copyin(self, x): self.size, self.dtype, self._buf = x.size, dtypes.from_np(x.dtype), x
def toCPU(self): return self._buf if self._buf is not None else np.empty([self.size], self.dtype.np)
from tinygrad.device import Interpreted, Allocator
def shape_to_axis(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Tuple[int, ...]:
assert len(old_shape) == len(new_shape), "reduce shapes must have same dimensions"
@ -31,7 +25,7 @@ def einsum_mulacc(einsum, get_strides, expand):
return mulacc
numpy_fxn_for_op: Dict[Op, Callable] = {
BufferOps.LOAD: lambda x: x.toCPU(), BufferOps.CONST: lambda val, dtype: np.array(val, dtype=dtype.np), BufferOps.STORE: RawNumpyBuffer.fromCPU,
BufferOps.CONST: lambda val, dtype: np.array(val, dtype=dtype.np),
UnaryOps.NOOP: lambda x: np.require(x, requirements='C'), UnaryOps.EXP2: np.exp2, UnaryOps.LOG2: np.log2, UnaryOps.SIN: np.sin,
UnaryOps.CAST: lambda x,y: x.view(y[0].np) if y[1] else x.astype(y[0].np, copy=False), UnaryOps.NEG: lambda x: np.logical_not(x) if x.dtype == np.bool_ else np.negative(x),
BinaryOps.MAX: np.maximum, BinaryOps.CMPLT: lambda x,y: (x<y).astype(output_type(x,y)), BinaryOps.ADD: lambda x, y: np.add(*match_types(x, y)),
@ -45,4 +39,9 @@ numpy_fxn_for_op: Dict[Op, Callable] = {
TernaryOps.WHERE: np.where,
}
CPUDevice = Interpreted(RawNumpyBuffer, numpy_fxn_for_op)
class NumpyAllocator(Allocator):
def _alloc(self, size:int, dtype:DType): return np.empty(size, dtype.np)
def copyin(self, dest:np.ndarray, src:memoryview): np.copyto(dest, np.frombuffer(src, dest.dtype).reshape(dest.shape))
def copyout(self, dest:memoryview, src:np.ndarray): np.copyto(np.frombuffer(dest, src.dtype).reshape(src.shape), src)
CPUDevice = Interpreted(NumpyAllocator(), numpy_fxn_for_op)
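# NumpyAllocator's copyin/copyout above just reinterpret the host memoryview with np.frombuffer and
# copy with np.copyto. the same roundtrip standalone, using a plain numpy dtype instead of a tinygrad DType:
import numpy as np
dest = np.empty(4, np.float32)                                        # what _alloc returns
src = memoryview(np.arange(4, dtype=np.float32))                      # host data as a memoryview
np.copyto(dest, np.frombuffer(src, dest.dtype).reshape(dest.shape))   # copyin
out = memoryview(bytearray(dest.nbytes))                              # host destination
np.copyto(np.frombuffer(out, dest.dtype).reshape(dest.shape), dest)   # copyout
assert np.frombuffer(out, np.float32)[2] == 2.0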

View File

@ -1,26 +1,17 @@
import subprocess, time, re, hashlib, tempfile
import subprocess, time, hashlib, tempfile
from pathlib import Path
from typing import Optional, Tuple
from typing import Tuple
import numpy as np
from pycuda.compiler import compile as cuda_compile
from tinygrad.helpers import DEBUG, getenv, colored, diskcache
from tinygrad.device import Compiled
from tinygrad.runtime.lib import RawBufferCopyInOut, RawMallocBuffer, LRUAllocator
from tinygrad.helpers import DEBUG, getenv, pretty_ptx, diskcache
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cuda import CUDARenderer
def pretty_ptx(s):
# all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
s = re.sub(r'([!@<\[\s,\+\-;\n])((?:[_%$][\w%\$_]+(?:\.[xyz])?\:?)|(?:buf\d+))([<>\]\s,\+\-;\n\)])', lambda m:m[1]+colored(m[2], "blue")+m[3], s, flags=re.M) # identifiers
s = re.sub(r'(.)((?:b|s|u|f)(?:8|16|32|64)|pred)([\.\s])', lambda m:m[1]+colored(m[2], "green")+m[3], s, flags=re.M) # types
s = re.sub(r'^(\s*)([\w]+)(.*?;$)', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # instructions
s = re.sub(r'([<>\[\]\s,\+\-;])((?:0[fF][0-9a-fA-F]{8})|(?:[0-9]+)|(?:0[xX][0-9a-fA-F]+))([<>\[\]\s,\+\-;])', lambda m:m[1]+colored(m[2], "yellow")+m[3], s, flags=re.M) # numbers
s = re.sub(r'(\.)(param|reg|global)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # space
s = re.sub(r'(\.)(version|target|address_size|visible|entry)', lambda m:m[1]+colored(m[2], "magenta"), s, flags=re.M) # derivatives
return s
def arch(): return "sm_" + "".join([str(x) for x in pycuda.driver.Context.get_device().compute_capability()])
CUDACPU = getenv("CUDACPU") == 1
if getenv("CUDACPU", 0) == 1:
if CUDACPU:
import ctypes, ctypes.util
lib = ctypes.CDLL(ctypes.util.find_library("gpuocelot"))
lib.ptx_run.argtypes = [ctypes.c_char_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int]
@ -44,26 +35,21 @@ if getenv("CUDACPU", 0) == 1:
get_device = lambda: context.device # pylint: disable=unnecessary-lambda # noqa: E731
import pycuda.driver
pycuda.driver.Context = context
RawCUDABuffer = RawMallocBuffer
else:
import pycuda.autoprimaryctx # pylint: disable=unused-import # noqa: F401
import pycuda.driver as cuda # type: ignore
class CUDAAllocator(LRUAllocator):
def __init__(self): super().__init__(self._get_cur_free_space(None))
def _do_alloc(self, size, dtype, device, **kwargs): return cuda.mem_alloc(size * dtype.itemsize) # type: ignore
def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
def _get_cur_free_space(self, device): return cuda.mem_get_info()[0] # type: ignore
CUDAAlloc = CUDAAllocator()
class RawCUDABuffer(RawBufferCopyInOut): # type: ignore
def __init__(self, size, dtype): super().__init__(size, dtype, allocator=CUDAAlloc)
def _copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._buf, x.ravel(), stream) # type: ignore
def _copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._buf) # type: ignore
def _alloc(self, size, dtype):
if size == 0: return None
return cuda.mem_alloc(size * dtype.itemsize) # type: ignore
def copyin(self, dest, src:memoryview): cuda.memcpy_htod_async(dest, src) # type: ignore
def copyout(self, dest:memoryview, src): cuda.memcpy_dtoh(dest, src) # type: ignore
@diskcache
def compile_cuda(prg) -> bytes: return cuda_compile(prg, target="ptx", no_extern_c=True, options=['-Wno-deprecated-gpu-targets'])
class CUDAProgram:
def __init__(self, name:str, _prg:bytes):
def __init__(self, name:str, _prg:bytes, bufs:int, vars:int=0):
prg = _prg.decode('utf-8')
if DEBUG >= 5: print(pretty_ptx(prg))
if DEBUG >= 6:
@ -80,11 +66,15 @@ class CUDAProgram:
if wait:
start, end = cuda.Event(), cuda.Event()
start.record()
self.prg(*[x._buf if isinstance(x, RawCUDABuffer) else np.int32(x) if (isinstance(x, int) and not getenv("CUDACPU")) else x for x in args], block=tuple(local_size), grid=tuple(global_size), shared=shared)
self.prg(*[np.int32(x) if (isinstance(x, int) and not CUDACPU) else x for x in args], block=tuple(local_size), grid=tuple(global_size), shared=shared)
if wait:
end.record()
end.synchronize()
return start.time_till(end)*1e-3
CUDADevice = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4=False if getenv("PTX") else True, supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]),
CUDARenderer, compile_cuda, CUDAProgram, cuda.Context.synchronize)
class CUDADevice(Compiled):
def __init__(self, device:str):
super().__init__(MallocAllocator if CUDACPU else CUDAAllocator(),
LinearizerOptions(supports_float4_alu=False, global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024]),
CUDARenderer, compile_cuda, CUDAProgram)
def synchronize(self): return cuda.Context.synchronize()

View File

@ -1,54 +1,56 @@
import os, mmap
try: import _posixshmem
except Exception: pass
from typing import Optional
from typing import Callable, Dict, Tuple
from tinygrad.helpers import prod, DType, OSX
from tinygrad.runtime.lib import RawBufferMapped
from tinygrad.device import Interpreted
from tinygrad.ops import Op, MovementOps, UnaryOps, BufferOps
from tinygrad.device import Interpreted, Allocator
from tinygrad.ops import Op, MovementOps, UnaryOps
from tinygrad.shape.view import strides_for_shape
MAP_LOCKED, MAP_POPULATE = 0x2000, 0x008000
class UnderlyingDiskBuffer:
def __init__(self, fd, mem): self.fd, self.mem = fd, mem
def __del__(self):
if self.fd: self.fd.close()
class RawDiskBuffer(RawBufferMapped):
def __init__(self, size, dtype:DType, buf=None, device:Optional[str]=None, offset:int=0): # pylint: disable=super-init-not-called
assert device is not None or buf is not None, "disk tensor needs a path or a buf"
if device is not None:
if str(device).startswith("shm:"):
if OSX:
with open(f"/tmp/shm_{device[4:]}", "w+b") as f:
f.truncate(size * dtype.itemsize)
shm = mmap.mmap(f.fileno(), size * dtype.itemsize, flags=mmap.MAP_SHARED)
else:
fd = _posixshmem.shm_open(device[4:], os.O_RDWR, 0o600)
# TODO: these flags are somewhat platform specific, but python doesn't expose the ones we need
shm = mmap.mmap(fd, size * dtype.itemsize, flags=mmap.MAP_SHARED | MAP_LOCKED | MAP_POPULATE)
shm.madvise(mmap.MADV_HUGEPAGE) # type: ignore # not on OSX
os.close(fd)
buf = UnderlyingDiskBuffer(None, shm)
else:
f = open(device, "a+b")
if os.path.getsize(device) < size * dtype.itemsize: os.ftruncate(f.fileno(), size * dtype.itemsize)
buf = UnderlyingDiskBuffer(f, mmap.mmap(f.fileno(), size * dtype.itemsize))
# NOTE: we don't call super since disk tensors don't use RAM
self.size, self.dtype, self._buf, self.offset = size, dtype, buf, offset
def cast(self, arg:Tuple[DType, bool]):
return RawDiskBuffer(self.size, arg[0], self._buf, offset=self.offset)
class DiskBuffer:
def __init__(self, ud:UnderlyingDiskBuffer, size:int, dtype:DType, offset=0): self.ud, self.size, self.dtype, self.offset = ud, size, dtype, offset
def __repr__(self): return f"<DiskBuffer size={self.size} dtype={self.dtype} offset={self.offset}>"
def cast(self, arg:Tuple[DType, bool]): return DiskBuffer(self.ud, self.size, arg[0], offset=self.offset)
def as_strided(self, arg):
assert strides_for_shape(arg[0]) == arg[1], "disk tensors don't support strides"
return RawDiskBuffer(prod(arg[0]), self.dtype, self._buf, offset=self.offset+arg[2]*self.dtype.itemsize)
def _buffer(self): return memoryview(self._buf.mem)[self.offset:self.offset+self.size*self.dtype.itemsize]
def readinto(self, buf:memoryview):
if self._buf.fd is not None:
self._buf.fd.seek(self.offset)
self._buf.fd.readinto(buf)
else:
buf.cast('B')[:] = self._buffer()
return DiskBuffer(self.ud, prod(arg[0]), self.dtype, offset=self.offset+arg[2]*self.dtype.itemsize)
def _buf(self) -> memoryview: return memoryview(self.ud.mem).cast("B")[self.offset:self.offset+self.size*self.dtype.itemsize]
disk_fxn_for_op: Dict[Op, Callable] = { BufferOps.LOAD: lambda x: x, BufferOps.STORE: lambda x: x, UnaryOps.NOOP: lambda x: x, UnaryOps.CAST: RawDiskBuffer.cast, MovementOps.AS_STRIDED: RawDiskBuffer.as_strided }
DiskDevice = Interpreted(RawDiskBuffer, disk_fxn_for_op)
disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.NOOP: lambda x: x, UnaryOps.CAST: DiskBuffer.cast, MovementOps.AS_STRIDED: DiskBuffer.as_strided }
MAP_LOCKED, MAP_POPULATE = 0x2000, 0x008000
class DiskAllocator(Allocator):
def __init__(self, device): self.device = device
def _alloc(self, size, dtype):
if str(self.device).startswith("shm:"):
if OSX:
with open(f"/tmp/shm_{self.device[4:]}", "w+b") as f:
f.truncate(size * dtype.itemsize)
shm = mmap.mmap(f.fileno(), size * dtype.itemsize, flags=mmap.MAP_SHARED)
else:
fd = _posixshmem.shm_open(self.device[4:], os.O_RDWR, 0o600)
# TODO: these flags are somewhat platform specific, but python doesn't expose the ones we need
shm = mmap.mmap(fd, size * dtype.itemsize, flags=mmap.MAP_SHARED | MAP_LOCKED | MAP_POPULATE)
shm.madvise(mmap.MADV_HUGEPAGE) # type: ignore # not on OSX
os.close(fd)
buf = UnderlyingDiskBuffer(None, shm)
else:
f = open(self.device, "a+b")
if os.path.getsize(self.device) < size * dtype.itemsize: os.ftruncate(f.fileno(), size * dtype.itemsize)
buf = UnderlyingDiskBuffer(f, mmap.mmap(f.fileno(), size * dtype.itemsize))
return DiskBuffer(buf, size, dtype)
def copyin(self, dest:DiskBuffer, src:memoryview): dest._buf()[:] = src
def copyout(self, dest:memoryview, src:DiskBuffer):
if src.ud.fd is not None:
src.ud.fd.seek(src.offset)
src.ud.fd.readinto(dest)
else:
dest[:] = src._buf()
class DiskDevice(Interpreted):
def __init__(self, device): super().__init__(DiskAllocator(device[5:]), disk_fxn_for_op)
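# the plain-file branch of DiskAllocator._alloc mmaps the file, and DiskBuffer._buf() exposes an
# offset+size window into that mapping. a standalone sketch of the same mechanics on a temp file
# (the 64-byte size and the 16-byte window are arbitrary; POSIX-style, as elsewhere in this diff):
import mmap, tempfile
f = tempfile.NamedTemporaryFile(delete=False)
f.truncate(64)                              # like the os.ftruncate to the requested size
mem = mmap.mmap(f.fileno(), 64)
view = memoryview(mem).cast("B")[16:32]     # the offset + size window, as _buf() returns
view[:] = b"x" * 16                         # copyin writes straight through the mapping
mem.flush()
assert open(f.name, "rb").read()[16:32] == b"x" * 16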

View File

@ -1,105 +1,50 @@
from __future__ import annotations
import os
os.environ['PYOPENCL_NO_CACHE'] = '1'
import pathlib
import pathlib, functools
import numpy as np
import pyopencl as cl
from typing import Optional, List, Tuple
from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport, diskcache
from tinygrad.device import Compiled
from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport, diskcache, DType
from tinygrad.device import Compiled, LRUAllocator
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer
from tinygrad.codegen.kernel import LinearizerOptions
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
ROCM_LLVM_PATH = pathlib.Path("/opt/rocm/llvm/bin")
if DEBUG >= 5:
if DEBUG >= 6:
early_exec = fromimport("extra.helpers", "enable_early_exec")()
class CLAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs):
if isinstance(dtype, ImageDType):
# NOTE: the memory is a bit off here due to padding, it's buf.row_pitch * buf.height * 4 * dtype.itemsize
assert size == prod(dtype.shape), f"image size mismatch {size} != {dtype.shape}"
fmt = cl.ImageFormat(cl.channel_order.RGBA, {2: cl.channel_type.HALF_FLOAT, 4: cl.channel_type.FLOAT}[dtype.itemsize])
buf = cl.Image(CL.cl_ctxs[int(device)], cl.mem_flags.READ_WRITE, fmt, shape=(dtype.shape[1], dtype.shape[0]))
else:
buf = cl.Buffer(CL.cl_ctxs[int(device)], cl.mem_flags.READ_WRITE, size * dtype.itemsize)
setattr(buf, 'device', int(device)) # device is tracked on the underlying buffer
return buf
class _CL:
def __init__(self):
cl_platforms = cl.get_platforms()
platform_devices: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl_platforms] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl_platforms]) if y]
self.devices = [device for device in platform_devices[getenv('CL_PLATFORM', 0)] if device.name not in getenv('CL_EXCLUDE', "").split(",")]
self.cl_platform = self.devices[0].platform
def post_init(self, device=None):
self.cl_ctxs: List[cl.Context] = [cl.Context(devices=[x]) for x in self.devices] if device is None else [cl.Context(devices=[self.devices[device]])]
if DEBUG >= 1: print(f"using devices: {[ctx.devices[0].hashable_model_and_version_identifier for ctx in self.cl_ctxs]}")
self.cl_queue: List[cl.CommandQueue] = [cl.CommandQueue(ctx, device=ctx.devices[0], properties=cl.command_queue_properties.PROFILING_ENABLE) for ctx in self.cl_ctxs]
self.cl_allocator = CLAllocator(CL.cl_ctxs[0].devices[0].get_info(cl.device_info.GLOBAL_MEM_SIZE))
def synchronize(self):
for q in self.cl_queue: q.finish()
CL = _CL()
if not getenv("DELAYED_RUNTIME_INIT", False): CL.post_init()
class CLBuffer(RawBufferCopyInOut, RawBufferTransfer):
def __init__(self, size, dtype, device='0'): super().__init__(size, dtype, allocator=CL.cl_allocator, **{'device': device})
def _clear_event(self, _): del self.event
def _copyin(self, x:np.ndarray):
assert not self.dtype.name.startswith("image"), f"can't copyin images {self.dtype}"
self.event = cl.enqueue_copy(CL.cl_queue[self._buf.device], self._buf, np.require(x, requirements=['C', 'A']), is_blocking=False)
self.event.set_callback(cl.command_execution_status.COMPLETE, self._clear_event)
def _copyout(self, x:np.ndarray):
assert not self.dtype.name.startswith("image"), f"can't copyout images {self.dtype}"
CL.cl_allocator.ensure_has_free_space(self.size*self.dtype.itemsize, self._device)
buf = cl.Buffer(CL.cl_ctxs[self._buf.device], cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR, 0, hostbuf=x.data)
mapped, event = cl.enqueue_map_buffer(CL.cl_queue[self._buf.device], buf, cl.map_flags.WRITE, 0, self.size, dtype=self.dtype.np, is_blocking=False)
with mapped.base: cl.enqueue_copy(CL.cl_queue[self._buf.device], mapped, self._buf, is_blocking=True, wait_for=[event] + ([evt] if (evt:=getattr(self, "event", None)) else []))
def _transfer(self, x):
if "gfx" in CL.cl_ctxs[x._buf.device].devices[0].name:
cl.enqueue_copy_buffer_p2p_amd(CL.cl_platform, CL.cl_queue[x._buf.device], x._buf, self._buf, x.size * x.dtype.itemsize).wait()
else: raise NotImplementedError("p2p transfer between devices not implemented on non-amd")
@diskcache
def compile_gpu(prg:str) -> bytes:
clprg = cl.Program(CL.cl_ctxs[0], prg)
clprg = cl.Program(GPUDevice.compile_context, prg)
clprg.build()
return clprg.get_info(cl.program_info.BINARIES)[0]
class CLProgram:
def __init__(self, name:str, prg:bytes, argdtypes=None, options=None):
self.name, self.clprograms = name, [cl.Program(ctx, ctx.devices, [prg]*len(ctx.devices)) for ctx in CL.cl_ctxs]
self._clprgs = [clprogram.build(options=options) for clprogram in self.clprograms]
self.clprgs = [clprg.__getattr__(name) for clprg in self._clprgs]
def __init__(self, device:GPUDevice, name:str, prg:bytes, bufs:int=0, vars:int=0):
self.device, self.name, self.clprogram = device, name, cl.Program(device.ctx, [device.ctx.devices[0]], [prg])
self.clprogram.build()
self.clprg = self.clprogram.__getattr__(name)
if DEBUG >= 5 and not OSX:
if 'Adreno' in CL.cl_ctxs[0].devices[0].name:
device_name = self.device.ctx.devices[0].name
if 'Adreno' in device_name:
fromimport('disassemblers.adreno', 'disasm')(prg)
elif CL.cl_ctxs[0].devices[0].name.startswith('gfx'):
elif device_name.startswith('gfx'):
asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], prg))
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
elif "NVIDIA" in CL.cl_ctxs[0].devices[0].name:
elif "NVIDIA" in device_name:
# print the PTX for NVIDIA.
print(prg.decode('utf-8'))
if argdtypes is not None: self.set_argdtypes(argdtypes)
def set_argdtypes(self, argdtypes): self.argdtypes, _ = argdtypes, [clprg.set_scalar_arg_dtypes(argdtypes) for clprg in self.clprgs]
if vars > 0: self.clprg.set_scalar_arg_dtypes([None]*bufs + [np.int32]*vars)
@staticmethod
def max_work_group_size(): return CL.cl_ctxs[0].devices[0].max_work_group_size
def max_work_group_size(): return GPUDevice.compile_context.devices[0].max_work_group_size if GPUDevice.compile_context is not None else 1024
def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Optional[Tuple[int,int,int]]=None, wait=False) -> Optional[float]:
if not hasattr(self, 'argdtypes'): self.set_argdtypes(tuple(None if x.__class__ is CLBuffer else np.int32 for x in bufs))
cl_bufs, wait_for = [], []
for x in bufs:
if x.__class__ is CLBuffer:
cl_bufs.append(x._buf)
if (event:=getattr(x, "event",None)): wait_for.append(event)
else: cl_bufs.append(x)
e = self.clprgs[cl_bufs[0].device](CL.cl_queue[cl_bufs[0].device], [int(g*l) for g,l in zip(global_size, local_size)] if local_size is not None else global_size, local_size, *cl_bufs, wait_for=wait_for)
e = self.clprg(self.device.queue, [int(g*l) for g,l in zip(global_size, local_size)] if local_size is not None else global_size, local_size, *bufs)
if wait:
e.wait()
try:
@ -108,4 +53,38 @@ class CLProgram:
return None
return None
GPUDevice = Compiled(CLBuffer, LinearizerOptions(), OpenCLRenderer, compile_gpu, CLProgram, CL.synchronize)
class CLAllocator(LRUAllocator):
def __init__(self, device:GPUDevice):
self.events: List[cl.Event] = []
self.device = device
super().__init__()
def _alloc(self, size:int, dtype:DType):
if size == 0: return None
if isinstance(dtype, ImageDType):
# NOTE: the memory is a bit off here due to padding, it's buf.row_pitch * buf.height * 4 * dtype.itemsize
assert size == prod(dtype.shape), f"image size mismatch {size} != {dtype.shape}"
fmt = cl.ImageFormat(cl.channel_order.RGBA, {2: cl.channel_type.HALF_FLOAT, 4: cl.channel_type.FLOAT}[dtype.itemsize])
buf = cl.Image(self.device.ctx, cl.mem_flags.READ_WRITE, fmt, shape=(dtype.shape[1], dtype.shape[0]))
else:
buf = cl.Buffer(self.device.ctx, cl.mem_flags.READ_WRITE, size * dtype.itemsize)
return buf
def copyin(self, dest:cl.Buffer, src:memoryview): self.events.append(cl.enqueue_copy(self.device.queue, dest, src, is_blocking=False))
def copyout(self, dest:memoryview, src:cl.Buffer):
self.events.clear()
cl.enqueue_copy(self.device.queue, dest, src, is_blocking=True)
class GPUDevice(Compiled):
devices = None
compile_context = None
def __init__(self, device:str):
if GPUDevice.devices is None:
cl_platforms = cl.get_platforms()
platform_devices: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl_platforms] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl_platforms]) if y]
GPUDevice.devices = [device for device in platform_devices[getenv('CL_PLATFORM', 0)] if device.name not in getenv('CL_EXCLUDE', "").split(",")]
if DEBUG >= 1: print(f"using devices: {[device.hashable_model_and_version_identifier for device in GPUDevice.devices]}")
self.device = int(device.split(":")[1]) if ":" in device else 0
self.ctx = cl.Context(devices=[GPUDevice.devices[self.device]])
if GPUDevice.compile_context is None: GPUDevice.compile_context = self.ctx
self.queue = cl.CommandQueue(self.ctx, device=self.ctx.devices[0], properties=cl.command_queue_properties.PROFILING_ENABLE)
super().__init__(CLAllocator(self), LinearizerOptions(), OpenCLRenderer, compile_gpu, functools.partial(CLProgram, self))
def synchronize(self): self.queue.finish()

View File

@ -1,14 +1,10 @@
import numpy as np
import ctypes
import ctypes, functools
import extra.hip_wrapper as hip
from typing import Tuple, List, Any, Dict, cast, Optional, Callable
from tinygrad.helpers import DEBUG, getenv, diskcache
from tinygrad.device import Compiled, CompiledASTRunner, update_stats
from typing import Tuple, cast, Callable, TypeVar
from tinygrad.helpers import DEBUG, DType, getenv, diskcache, from_mv
from tinygrad.device import Compiled, LRUAllocator, MallocAllocator
from tinygrad.renderer.hip import HIPRenderer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer, RawMallocBuffer
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, get_jc_idxs_with_updatable_var_vals, GraphException
# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
if DEBUG >= 6:
@ -16,41 +12,12 @@ if DEBUG >= 6:
early_exec = enable_early_exec()
# The default HIP stream is used for everything.
class HIPAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs):
hip.hipSetDevice(device)
return hip.hipMalloc(size * dtype.itemsize)
def _do_free(self, buf): hip.hipFree(buf)
def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
class _HIP:
def __init__(self, device=None):
self.default_device = device or getenv("HIP_DEFAULT_DEVICE")
self.device_count = 0 if MOCKHIP else hip.hipGetDeviceCount()
if not MOCKHIP: hip.hipSetDevice(self.default_device)
self.allocator = None if MOCKHIP else HIPAllocator(hip.hipGetDeviceProperties(self.default_device).totalGlobalMem)
HIP = _HIP()
class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
def __init__(self, size, dtype, device=HIP.default_device, buf=None, allocator=HIP.allocator): super().__init__(size, dtype, buf=buf, allocator=allocator, **{'device': int(device)})
def _copyin(self, x:np.ndarray):
hip.hipSetDevice(self._device)
hip.hipMemcpyAsync(self._buf, np.require(x, requirements='C').ctypes.data_as(ctypes.c_void_p), self.size * self.dtype.itemsize, hip.hipMemcpyHostToDevice, 0)
def _copyout(self, x:np.ndarray):
hip.hipSetDevice(self._device)
hip.hipMemcpy(x.ctypes.data, self._buf, self.size * self.dtype.itemsize, hip.hipMemcpyDeviceToHost)
def _transfer(self, x:RawBuffer):
hip.hipSetDevice(x._device)
hip.hipMemcpy(self._buf, x._buf, self.size * self.dtype.itemsize, hip.hipMemcpyDeviceToDevice)
@diskcache
def compile_hip(prg) -> bytes:
prog = hip.hiprtcCreateProgram(prg, "<null>", [], [])
arch = "gfx1100" if MOCKHIP else hip.hipGetDeviceProperties(HIP.default_device).gcnArchName
hip.hiprtcCompileProgram(prog, [f'--offload-arch={arch}'])
hip.hiprtcCompileProgram(prog, [f'--offload-arch={HIPDevice.default_arch_name}'])
return hip.hiprtcGetCode(prog)
def time_execution(cb, enable=False):
@ -67,78 +34,53 @@ def time_execution(cb, enable=False):
return ret
class HIPProgram:
def __init__(self, name:str, prg:bytes):
self.modules, self.prgs, self.c_struct_t = [], [], None
def __init__(self, device:int, name:str, prg:bytes, bufs:int, vars:int=0):
self.device, self.c_struct_t = device, None
if DEBUG >= 6:
asm = early_exec((["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], prg))
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
for i in range(HIP.device_count):
hip.hipSetDevice(i)
self.modules.append(hip.hipModuleLoadData(prg))
self.prgs.append(hip.hipModuleGetFunction(self.modules[-1], name))
if MOCKHIP: return
hip.hipSetDevice(self.device)
self.module = hip.hipModuleLoadData(prg)
self.prg = hip.hipModuleGetFunction(self.module, name)
self.c_struct_t = hip.getCStructForType([ctypes.c_void_p]*bufs + [ctypes.c_int]*vars)
def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
if MOCKHIP: return
hip.hipSetDevice(args[0]._device)
if self.c_struct_t is None: self.c_struct_t = hip.getCStructForType([(ctypes.c_void_p if not isinstance(x, int) else ctypes.c_int) for x in args])
c_params = cast(Callable, self.c_struct_t)(*[x._buf if not isinstance(x, int) else x for x in args])
return time_execution(lambda: hip.hipModuleLaunchKernel(self.prgs[args[0]._device], *global_size, *local_size, 0, 0, c_params), enable=wait)
hip.hipSetDevice(self.device)
c_params = cast(Callable, self.c_struct_t)(*args)
return time_execution(lambda: hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, 0, c_params), enable=wait)
def __del__(self):
for module in self.modules: hip.hipModuleUnload(module)
if MOCKHIP: return
hip.hipModuleUnload(self.module)
class HIPGraph:
def __init__(self, jit_cache: List[JitItem], input_rawbuffers: List[RawBuffer], var_vals: Dict[Variable, int]):
# TODO: Only HIPProgram can be captured for now.
if not all(isinstance(ji.prg, CompiledASTRunner) and isinstance(ji.prg.clprg, HIPProgram) for ji in jit_cache): raise GraphException
T = TypeVar("T")
class HIPAllocator(LRUAllocator):
def __init__(self, device):
self.device = device
super().__init__()
def _alloc(self, size: int, dtype: DType):
if size == 0: return None
hip.hipSetDevice(self.device)
return hip.hipMalloc(size * dtype.itemsize)
def _free(self, opaque:T): hip.hipFree(opaque)
def copyin(self, dest:T, src: memoryview):
hip.hipSetDevice(self.device)
hip.hipMemcpyAsync(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice, 0)
def copyout(self, dest:memoryview, src:T):
hip.hipSetDevice(self.device)
hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost)
def transfer(self, dest:T, src:T, sz:int):
hip.hipSetDevice(self.device)
hip.hipMemcpy(dest, src, sz, hip.hipMemcpyDeviceToDevice)
self.jit_cache = jit_cache
self.input_replace = get_input_replace(jit_cache, input_rawbuffers)
self.op_estimate, self.mem_estimate = get_jit_stats(jit_cache)
self.jc_idxs_with_updatable_launch_dims = get_jc_idxs_with_updatable_launch_dims(jit_cache)
self.jc_idxs_with_updatable_var_vals = get_jc_idxs_with_updatable_var_vals(jit_cache)
self.jc_idxs_with_updatable_rawbufs = list(set([x[0] for x in self.input_replace.keys()]))
self.graph, graph_node = hip.hipGraphCreate(), None
self.updatable_nodes: Dict[int, Tuple[Any, hip.kernelNodeParamsWrapper]] = {} # Dict[jc index] = tuple(graph_node, node_params)
for (j,i),input_name in self.input_replace.items(): self.jit_cache[j].rawbufs[i] = input_rawbuffers[input_name]
for j,ji in enumerate(self.jit_cache):
prg: CompiledASTRunner = cast(CompiledASTRunner, ji.prg)
assert all(x is not None for x in ji.rawbufs) and ji.rawbufs[0] is not None, "buffers could not be None" # for linters
args = [cast(RawBuffer, x)._buf for x in ji.rawbufs] + [var_vals[x] for x in prg.vars]
types = [ctypes.c_void_p] * len(ji.rawbufs) + [ctypes.c_int] * len(prg.vars)
c_params = hip.buildKernelNodeParams(args, types, prg.clprg.prgs[ji.rawbufs[0]._device], *prg.launch_dims(var_vals))
graph_node = hip.hipGraphAddKernelNode(self.graph, [graph_node] if graph_node else [], c_params)
if j in self.jc_idxs_with_updatable_launch_dims or j in self.jc_idxs_with_updatable_var_vals or j in self.jc_idxs_with_updatable_rawbufs:
self.updatable_nodes[j] = (graph_node, c_params)
self.instance = hip.hipGraphInstantiate(self.graph)
def __del__(self):
hip.hipGraphExecDestroy(self.instance)
hip.hipGraphDestroy(self.graph)
def __call__(self, input_rawbuffers: List[RawBuffer], var_vals: Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
# Update cached params structs with the new values.
for (j,i),input_idx in self.input_replace.items():
hip.setKernelNodeParams(self.updatable_nodes[j][1], [input_rawbuffers[input_idx]._buf], [i])
for j in self.jc_idxs_with_updatable_launch_dims:
hip.setKernelNodeLaunchDims(self.updatable_nodes[j][1], *cast(CompiledASTRunner, self.jit_cache[j].prg).launch_dims(var_vals))
for j in self.jc_idxs_with_updatable_var_vals:
prg: CompiledASTRunner = cast(CompiledASTRunner, self.jit_cache[j].prg)
hip.setKernelNodeParams(self.updatable_nodes[j][1], [var_vals[x] for x in prg.vars], list(range(len(self.jit_cache[j].rawbufs), len(self.jit_cache[j].rawbufs) + len(prg.vars))))
# Update graph nodes with the updated structs.
for node, params in self.updatable_nodes.values():
hip.hipGraphExecKernelNodeSetParams(self.instance, node, params)
et = time_execution(lambda: hip.hipGraphLaunch(self.instance), enable=wait)
update_stats(f"<batched {len(self.jit_cache)}>", self.op_estimate, self.mem_estimate, var_vals, et, buf_count=len(input_rawbuffers), jit=jit, num_kernels=len(self.jit_cache))
return et
HIPDevice = Compiled(RawHIPBuffer if not MOCKHIP else RawMallocBuffer, LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, HIPProgram, hip.hipDeviceSynchronize, graph=HIPGraph)
class HIPDevice(Compiled):
default_arch_name = "gfx1100"
def __init__(self, device:str):
self.device = int(device.split(":")[1]) if ":" in device else 0
if self.device == 0 and not MOCKHIP: HIPDevice.default_arch_name = hip.hipGetDeviceProperties(self.device).gcnArchName
super().__init__(MallocAllocator if MOCKHIP else HIPAllocator(self.device), LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, functools.partial(HIPProgram, self.device))
def synchronize(self): hip.hipDeviceSynchronize()
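The headline change for Compiled devices is the shared LRUAllocator. A minimal sketch of the idea, assuming only the caching behaviour matters here (the names and the bytearray backing are illustrative, not the real HIP or Metal allocator):
from collections import defaultdict

class ToyLRUAllocator:
  def __init__(self):
    self.cache = defaultdict(list)               # (size, itemsize) -> parked buffers
  def _alloc(self, size, itemsize):              # backend hook, e.g. hipMalloc above
    return bytearray(size * itemsize)
  def alloc(self, size, itemsize):
    key = (size, itemsize)
    # reuse a previously freed buffer of the same size before asking the driver
    return self.cache[key].pop() if self.cache[key] else self._alloc(size, itemsize)
  def free(self, opaque, size, itemsize):
    self.cache[(size, itemsize)].append(opaque)  # park it instead of really freeing
Freed buffers come back on the next same-sized request, which is what keeps JIT loops from hammering the driver allocator.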

View File

@@ -1,11 +1,10 @@
import time, ctypes
from typing import ClassVar
from tinygrad.device import Compiled
from tinygrad.device import Compiled, MallocAllocator
from tinygrad.helpers import getenv, DEBUG, diskcache
from ctypes import CFUNCTYPE
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.llvmir import uops_to_llvm_ir
from tinygrad.runtime.lib import RawMallocBuffer
import llvmlite.binding as llvm
@@ -55,14 +54,14 @@ def compile_llvm(prg, llvmopt=LLVMOPT) -> bytes:
return LLVM.target_machine.emit_object(mod)
class LLVMProgram:
def __init__(self, name:str, lib:bytes):
def __init__(self, name:str, lib:bytes, bufs:int, vars:int=0):
LLVM().engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
self.fxn = LLVM.engine.get_function_address(name)
self.cfunc = CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*bufs), *([ctypes.c_int]*vars))(self.fxn)
def __call__(self, *bufs, wait=False):
cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_void_p for _ in bufs])(self.fxn)
if wait: st = time.perf_counter()
cfunc(*[x._buf if not isinstance(x, int) else x for x in bufs])
self.cfunc(*bufs)
if wait: return time.perf_counter()-st
LLVMDevice = Compiled(RawMallocBuffer, LinearizerOptions(supports_float4=False, has_local=False, has_shared=False), uops_to_llvm_ir, compile_llvm, LLVMProgram)
LLVMDevice = Compiled(MallocAllocator, LinearizerOptions(supports_float4=False, has_local=False, has_shared=False), uops_to_llvm_ir, compile_llvm, LLVMProgram)
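The LLVM change above moves the ctypes prototype out of __call__ and into __init__, since the argument layout (bufs pointer args plus vars int args) is now known when the program is built. A standalone sketch of that prototype construction; the address in the comment is a placeholder, not a real symbol:
import ctypes

def make_prototype(bufs: int, vars: int = 0):
  # int kernel(void* buf0, ..., int var0, ...)
  return ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*bufs), *([ctypes.c_int]*vars))

proto = make_prototype(bufs=2, vars=1)
# with a real function address from the JIT engine this becomes the cached callable:
#   cfunc = proto(addr); cfunc(out_ptr, in_ptr, var_value)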

View File

@@ -1,155 +1,77 @@
import os, subprocess, pathlib, ctypes, tempfile
from __future__ import annotations
import os, subprocess, pathlib, ctypes, tempfile, functools
import Metal, libdispatch
from typing import List, Any, Tuple, Dict, cast, Optional
from typing import List, Any, Tuple
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.helpers import prod, getenv, DEBUG, DType, dtypes, diskcache, dedup
from tinygrad.device import Compiled, CompiledASTRunner, update_stats
from tinygrad.helpers import prod, getenv, DEBUG, DType, diskcache, unwrap2
from tinygrad.device import Compiled, LRUAllocator
from tinygrad.renderer.metal import MetalRenderer
from tinygrad.runtime.lib import RawBufferMapped, RawBuffer, LRUAllocator
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, GraphException
class MetalAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs):
buf_len, max_buf_len = size*dtype.itemsize, METAL.device.maxBufferLength()
assert buf_len < max_buf_len, f"Buffer length of {buf_len/1e9:5.2f} GB exceeds Metal's max buffer length of {max_buf_len/1e9:5.2f} GB."
buf = METAL.device.newBufferWithLength_options_(buf_len, Metal.MTLResourceStorageModeShared)
assert buf, f"Metal buffer allocation failed with {buf}."
return buf
def _do_free(self, buf): buf.release()
def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
class _METAL:
def __init__(self):
self.mtl_buffers_in_flight: List[Any] = []
self.device = Metal.MTLCreateSystemDefaultDevice()
self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
self.allocator = MetalAllocator(self.device.dedicatedMemorySize() or self.device.sharedMemorySize())
# TODO: is there a better way to do this?
def synchronize(self):
for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
self.mtl_buffers_in_flight.clear()
METAL = _METAL()
class RawMetalBuffer(RawBufferMapped):
def __init__(self, size:int, dtype:DType):
assert dtype != dtypes.double, f"METAL does not support {dtype.name}"
super().__init__(size, dtype, allocator=METAL.allocator)
def _buffer(self):
METAL.synchronize()
return self._buf.contents().as_buffer(self._buf.length())
def unwrap(x):
ret, err = x
assert err is None, str(err)
return ret
@diskcache
def compile_metal(prg, use_xcode=bool(getenv("METAL_XCODE"))) -> bytes:
assert MetalDevice.compiler_device, "metal device creation is required for metal compile"
if use_xcode:
# NOTE: if you run llvm-dis on "air" you can see the llvm bytecode
air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8'))
return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air)
options = Metal.MTLCompileOptions.new()
library = unwrap(METAL.device.newLibraryWithSource_options_error_(prg, options, None))
library = unwrap2(MetalDevice.compiler_device.newLibraryWithSource_options_error_(prg, options, None))
return library.libraryDataContents().bytes().tobytes()
class MetalProgram:
def __init__(self, name:str, lib:bytes):
def __init__(self, device:MetalDevice, name:str, lib:bytes, bufs:int, vars:int=0):
self.device = device
data = libdispatch.dispatch_data_create(lib, len(lib), None, None)
self.library = unwrap(METAL.device.newLibraryWithData_error_(data, None))
self.library = unwrap2(self.device.device.newLibraryWithData_error_(data, None))
self.fxn = self.library.newFunctionWithName_(name)
if DEBUG >= 6:
with tempfile.NamedTemporaryFile(delete=True) as shader:
shader.write(lib)
shader.flush()
os.system(f"cd {pathlib.Path(__file__).parents[2]}/disassemblers/applegpu && python3 compiler_explorer.py {shader.name}")
self.pipeline_state = unwrap(METAL.device.newComputePipelineStateWithFunction_error_(self.fxn, None))
self.pipeline_state = unwrap2(self.device.device.newComputePipelineStateWithFunction_error_(self.fxn, None))
def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(), f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}"
command_buffer = METAL.mtl_queue.commandBuffer()
command_buffer = self.device.mtl_queue.commandBuffer()
encoder = command_buffer.computeCommandEncoder()
encoder.setComputePipelineState_(self.pipeline_state)
for i,a in enumerate(bufs):
if isinstance(a, RawMetalBuffer): encoder.setBuffer_offset_atIndex_(a._buf, 0, i)
elif isinstance(a, int): encoder.setBytes_length_atIndex_((arg:=ctypes.c_int32(a)), ctypes.sizeof(arg), i)
else: raise RuntimeError(f"arg at index {i} has unsupported type {type(a)}")
if isinstance(a, int): encoder.setBytes_length_atIndex_((arg:=ctypes.c_int32(a)), ctypes.sizeof(arg), i)
else: encoder.setBuffer_offset_atIndex_(a, 0, i)
encoder.dispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
encoder.endEncoding()
command_buffer.commit()
if wait:
command_buffer.waitUntilCompleted()
return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
METAL.mtl_buffers_in_flight.append(command_buffer)
self.device.mtl_buffers_in_flight.append(command_buffer)
class MetalGraph:
def __init__(self, jit_cache: List[JitItem], input_rawbuffers: List[RawBuffer], var_vals: Dict[Variable, int]):
self.jit_cache = jit_cache
self.input_replace = get_input_replace(jit_cache, input_rawbuffers)
self.op_estimate, self.mem_estimate = get_jit_stats(jit_cache)
self.jc_idx_with_updatable_launch_dims = get_jc_idxs_with_updatable_launch_dims(jit_cache)
class MetalAllocator(LRUAllocator):
def __init__(self, device:MetalDevice):
self.device:MetalDevice = device
super().__init__()
def _alloc(self, size:int, dtype:DType):
if size == 0: return None
ret = self.device.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared)
if ret is None: raise MemoryError(f"Metal OOM while allocating {size=} {dtype=}")
return ret
def _free(self, opaque): opaque.release()
def _buffer(self, src):
self.device.synchronize()
return src.contents().as_buffer(src.length())
def copyin(self, dest, src:memoryview): self._buffer(dest)[:] = src
def copyout(self, dest:memoryview, src): dest[:] = self._buffer(src)
# create metal batch exec
icb_descriptor = Metal.MTLIndirectCommandBufferDescriptor.new()
icb_descriptor.setCommandTypes_(Metal.MTLIndirectCommandType(Metal.MTLIndirectCommandTypeConcurrentDispatch))
icb_descriptor.setInheritBuffers_(False)
icb_descriptor.setInheritPipelineState_(False)
icb_descriptor.setMaxKernelBufferBindCount_(31)
self.icb = METAL.device.newIndirectCommandBufferWithDescriptor_maxCommandCount_options_(icb_descriptor, len(self.jit_cache), Metal.MTLResourceOptions(0))
if self.icb is None: raise GraphException("create indirect command buffer failed, does your system support this?")
self.int_buf = RawMetalBuffer(len(var_vals), dtypes.int32)
read_resources, write_resources = [], []
for j,ji in enumerate(self.jit_cache):
prg: CompiledASTRunner = cast(CompiledASTRunner, ji.prg)
descriptor = Metal.MTLComputePipelineDescriptor.new()
descriptor.setComputeFunction_(prg.clprg.fxn)
descriptor.setSupportIndirectCommandBuffers_(True)
pipeline_state = unwrap(METAL.device.newComputePipelineStateWithDescriptor_options_reflection_error_(descriptor, Metal.MTLPipelineOption(0), None, None))
icb_command = self.icb.indirectComputeCommandAtIndex_(j)
icb_command.setComputePipelineState_(pipeline_state)
for i,b in enumerate(ji.rawbufs):
if b is not None:
icb_command.setKernelBuffer_offset_atIndex_(b._buf, 0, i)
if i == 0: write_resources.append(b._buf)
else: read_resources.append(b._buf)
var_vals_keys = list(var_vals.keys())
for i,v in enumerate(prg.vars):
icb_command.setKernelBuffer_offset_atIndex_(self.int_buf._buf, var_vals_keys.index(v)*4, len(ji.rawbufs)+i)
if j not in self.jc_idx_with_updatable_launch_dims:
global_size, local_size = prg.launch_dims(var_vals)
icb_command.concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
icb_command.setBarrier()
self.read_resources, self.write_resources = dedup(read_resources), dedup(write_resources)
self.command_buffer: Any = None
self.int_buf_view = self.int_buf.toCPU() # TODO: this is metal syncing when it doesn't need to
def __call__(self, input_rawbuffers: List[RawBuffer], var_vals: Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
# NOTE: you at least can't update the ints if this is running
if self.command_buffer is not None and self.command_buffer in METAL.mtl_buffers_in_flight: self.command_buffer.waitUntilCompleted()
all_read_resources = self.read_resources + [x._buf for x in input_rawbuffers]
for (j,i),input_idx in self.input_replace.items():
self.icb.indirectComputeCommandAtIndex_(j).setKernelBuffer_offset_atIndex_(input_rawbuffers[input_idx]._buf, 0, i)
for j in self.jc_idx_with_updatable_launch_dims:
global_size, local_size = cast(CompiledASTRunner, self.jit_cache[j].prg).launch_dims(var_vals)
self.icb.indirectComputeCommandAtIndex_(j).concurrentDispatchThreadgroups_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size))
self.int_buf_view[:] = list(var_vals.values())
command_buffer = METAL.mtl_queue.commandBuffer()
encoder = command_buffer.computeCommandEncoder()
encoder.executeCommandsInBuffer_withRange_(self.icb, Metal.MTLIndirectCommandBufferExecutionRangeMake(0,len(self.jit_cache)))
encoder.useResources_count_usage_(all_read_resources, len(all_read_resources), Metal.MTLResourceUsageRead)
encoder.useResources_count_usage_(self.write_resources, len(self.write_resources), Metal.MTLResourceUsageWrite)
encoder.endEncoding()
command_buffer.commit()
self.command_buffer = command_buffer
if wait:
command_buffer.waitUntilCompleted()
et = command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
else:
METAL.mtl_buffers_in_flight.append(command_buffer)
et = None
update_stats(f"<batched {len(self.jit_cache)}>", self.op_estimate, self.mem_estimate, var_vals, et, buf_count=len(input_rawbuffers), jit=jit, num_kernels=len(self.jit_cache))
return et
MetalDevice = Compiled(RawMetalBuffer, LinearizerOptions(device="METAL"), MetalRenderer, compile_metal, MetalProgram, METAL.synchronize, graph=MetalGraph)
class MetalDevice(Compiled):
compiler_device = None
def __init__(self, device:str):
self.device = Metal.MTLCreateSystemDefaultDevice()
if MetalDevice.compiler_device is None: MetalDevice.compiler_device = self.device
self.mtl_queue = self.device.newCommandQueueWithMaxCommandBufferCount_(1024)
self.mtl_buffers_in_flight: List[Any] = []
from tinygrad.runtime.graph.metal import MetalGraph
super().__init__(MetalAllocator(self), LinearizerOptions(device="METAL"), MetalRenderer, compile_metal, functools.partial(MetalProgram, self), functools.partial(MetalGraph, self))
def synchronize(self):
for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
self.mtl_buffers_in_flight.clear()
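A minimal round trip through a shared-storage MTLBuffer, the same mechanism MetalAllocator's copyin/copyout use above (macOS with the pyobjc Metal bindings assumed; synchronization is omitted because nothing is in flight):
import Metal

dev = Metal.MTLCreateSystemDefaultDevice()
buf = dev.newBufferWithLength_options_(16, Metal.MTLResourceStorageModeShared)
view = buf.contents().as_buffer(buf.length())   # host-visible view of the MTLBuffer
view[:4] = b"\x01\x02\x03\x04"                  # what copyin does
assert bytes(view[:4]) == b"\x01\x02\x03\x04"   # what copyout reads back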

View File

@@ -1,24 +1,15 @@
import torch
import numpy as np
from typing import Dict, Callable, Optional
from typing import Dict, Callable
from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, MovementOps, TernaryOps, ReduceOps, Op
from tinygrad.device import Interpreted
from tinygrad.helpers import getenv, dtypes, prod, DType
from tinygrad.device import Interpreted, Allocator
from tinygrad.helpers import getenv, dtypes, DType
from tinygrad.runtime.ops_cpu import einsum_mulacc, shape_to_axis
from tinygrad.runtime.lib import RawBuffer
device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
type_map = {torch.float64: dtypes.float64, torch.float16: dtypes.float16, torch.float32: dtypes.float32, torch.int8: dtypes.int8, torch.int32: dtypes.int32, torch.int64: dtypes.int64, torch.uint8: dtypes.uint8, torch.bool: dtypes.bool, torch.int16: dtypes.int16}
inverse_type_map = {v:k for k,v in type_map.items()}
class RawTorchBuffer(RawBuffer):
def __init__(self, size:int, dtype:DType, buf:Optional[torch.Tensor]=None): super().__init__(size, dtype, buf)
def _copyin(self, x):
buf = torch.from_numpy(x if all(s>=0 for s in x.strides) else x.copy()).requires_grad_(False).to(device)
self.size, self.dtype, self._buf = prod(x.shape), type_map[buf.dtype], buf
def _get_buf(self): return self._buf if self._buf is not None else torch.empty([self.size], device=device, dtype=inverse_type_map[self.dtype])
def toCPU(self): return self._get_buf().cpu().numpy()
def output_type(x, y): return x.dtype if type_map[x.dtype].priority > type_map[y.dtype].priority else y.dtype
def match_types(x, y, disallow_bool=False):
up = output_type(x, y)
@@ -35,7 +26,6 @@ torch_fxn_for_op: Dict[Op, Callable] = {
# TODO: torch.tensor should work here. it doesn't due to "overflow" in uint8
#BufferOps.CONST: lambda val, dtype: torch.tensor(val, device=device, dtype=inverse_type_map[dtype]),
BufferOps.CONST: lambda val, dtype: torch.from_numpy(np.array(val, dtype=dtype.np)).to(device),
BufferOps.LOAD: lambda x: x._get_buf(), BufferOps.STORE: lambda x: RawTorchBuffer(prod(x.shape), type_map[x.dtype], x),
UnaryOps.NOOP: lambda x: x.contiguous(), UnaryOps.SQRT: lambda x: x.sqrt(), UnaryOps.EXP2: lambda x: x.exp2(), UnaryOps.LOG2: lambda x: x.log2(), UnaryOps.SIN: torch.sin,
UnaryOps.CAST: lambda x,y: (x.view if y[1] else x.type)(next(k for k,v in type_map.items() if v==y[0])), UnaryOps.NEG: lambda x: torch.logical_not(x) if x.dtype is torch.bool else torch.neg(x),
BinaryOps.MAX: torch.maximum, BinaryOps.CMPLT: lambda x,y: (x<y).type(torch.promote_types(x.dtype, y.dtype)),
@@ -51,4 +41,9 @@ torch_fxn_for_op: Dict[Op, Callable] = {
TernaryOps.WHERE: lambda x, y, z: torch.where(x != 0, y, z),
}
TorchDevice = Interpreted(RawTorchBuffer, torch_fxn_for_op)
class TorchAllocator(Allocator):
def _alloc(self, size:int, dtype:DType): return torch.empty([size], device=device, dtype=inverse_type_map[dtype])
def copyin(self, dest:torch.Tensor, src:memoryview): dest.copy_(torch.frombuffer(src, dtype=dest.dtype))
def copyout(self, dest:memoryview, src:torch.Tensor): torch.frombuffer(dest, dtype=src.dtype).copy_(src.flatten())
TorchDevice = Interpreted(TorchAllocator(), torch_fxn_for_op)
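The TorchAllocator copy path above can be exercised on its own with CPU torch: host memory is viewed as a tensor with torch.frombuffer and moved with copy_, no numpy round trip needed. Buffer sizes here are arbitrary and the snippet is only a sketch of the same copy direction:
import torch

dest = torch.empty([4], dtype=torch.float32)               # the "_alloc" result
src = memoryview(bytearray(4 * 4))                         # 16 bytes of host memory
torch.frombuffer(src, dtype=torch.float32).fill_(3.0)      # stage some host data in place
dest.copy_(torch.frombuffer(src, dtype=torch.float32))     # copyin
out = memoryview(bytearray(4 * 4))
torch.frombuffer(out, dtype=torch.float32).copy_(dest.flatten())  # copyout
assert bytes(out) == bytes(src)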

View File

@@ -1,9 +1,7 @@
import numpy as np
import functools
from wgpu.utils.device import get_default_device
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, DType
from tinygrad.device import Compiled
from tinygrad.device import Compiled, Allocator
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import uops_to_cstyle
from tinygrad.renderer.wgsl import WGSLLanguage
@@ -12,11 +10,11 @@ import wgpu
wgpu_device = get_default_device()
class WebGPUProgram:
def __init__(self, name: str, prg: str): self.name,self.prg = name,wgpu_device.create_shader_module(code=prg)
def __init__(self, name: str, prg: str, bufs:int=0, vars:int=0): self.name,self.prg = name,wgpu_device.create_shader_module(code=prg)
def __call__(self, *bufs, global_size, local_size, wait=False):
assert len(bufs) <= 8, "WEBGPU only supports 8 buffers"
binding_layouts = [{"binding": i, "visibility": wgpu.ShaderStage.COMPUTE, "buffer": {"type": wgpu.BufferBindingType.storage}} for i in range(len(bufs))]
bindings = [{"binding": i, "resource": {"buffer": x._buf, "offset": 0, "size": x._buf.size}} for i, x in enumerate(bufs)]
bindings = [{"binding": i, "resource": {"buffer": x, "offset": 0, "size": x.size}} for i, x in enumerate(bufs)]
bind_group_layout = wgpu_device.create_bind_group_layout(entries=binding_layouts)
pipeline_layout = wgpu_device.create_pipeline_layout(bind_group_layouts=[bind_group_layout])
bind_group = wgpu_device.create_bind_group(layout=bind_group_layout, entries=bindings)
@@ -29,17 +27,14 @@ class WebGPUProgram:
compute_pass.end()
wgpu_device.queue.submit([command_encoder.finish()])
class RawWebGPUAllocator(LRUAllocator):
def _do_alloc(self, size, dtype, device, **kwargs): return wgpu_device.create_buffer(size=size*dtype.itemsize, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
WebGPUAlloc = RawWebGPUAllocator(wgpu_device.limits['max_buffer_size'])
class RawWebGPUBuffer(RawBuffer):
def __init__(self, size:int, dtype:DType):
class WebGpuAllocator(Allocator):
def _alloc(self, size: int, dtype: DType):
assert dtype not in [dtypes.int8,dtypes.uint8,dtypes.int64,dtypes.uint64,dtypes.double], f"dtype {dtype} not supported on WEBGPU"
super().__init__(size, dtype, allocator=WebGPUAlloc)
def _copyin(self, x:np.ndarray): wgpu_device.queue.write_buffer(self._buf, 0, np.ascontiguousarray(x))
def toCPU(self) -> np.ndarray: return np.frombuffer(wgpu_device.queue.read_buffer(self._buf, 0), dtype=np.dtype(self.dtype.np, metadata={"backing": self})) # type: ignore
return wgpu_device.create_buffer(size=size*dtype.itemsize, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
def copyin(self, dest, src: memoryview): wgpu_device.queue.write_buffer(dest, 0, src)
def copyout(self, dest, src: memoryview): dest[:] = wgpu_device.queue.read_buffer(src, 0) # TODO: remove this copy
renderer = functools.partial(uops_to_cstyle, WGSLLanguage())
WebGpuDevice = Compiled(RawWebGPUBuffer, LinearizerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64], global_max=[65535, 65535, 65535]), renderer, lambda x: x, WebGPUProgram)
class WebGpuDevice(Compiled):
def __init__(self, device:str):
super().__init__(WebGpuAllocator(), LinearizerOptions(device="WEBGPU", supports_float4=False, local_max=[256, 256, 64], global_max=[65535, 65535, 65535]),
functools.partial(uops_to_cstyle, WGSLLanguage()), lambda x: x, WebGPUProgram)
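WebGpuAllocator above is a thin wrapper over wgpu-py's queue copies; assuming a working WebGPU adapter is available, the round trip it performs looks roughly like this:
import wgpu
from wgpu.utils.device import get_default_device

dev = get_default_device()
buf = dev.create_buffer(size=16, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.COPY_SRC)
dev.queue.write_buffer(buf, 0, memoryview(b"\x07" * 16))   # copyin
readback = dev.queue.read_buffer(buf, 0)                   # copyout returns a memoryview
assert bytes(readback) == b"\x07" * 16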

View File

@@ -106,10 +106,10 @@ class Tensor:
return self
def assign(self, x) -> Tensor:
# TODO: this is a hack for writing to DISK
# TODO: this is a hack for writing to DISK. remove with working assign
if self.device.startswith("DISK"):
if x.__class__ is not Tensor: x = Tensor(x, device="CPU", dtype=self.dtype)
self.contiguous().realize().lazydata.realized._copyin(x.numpy())
self.contiguous().realize().lazydata.realized.copyin(x.numpy().data)
return self
if x.__class__ is not Tensor: x = Tensor(x, device=self.device, dtype=self.dtype)
assert self.shape == x.shape and self.device == x.device, f"assign shape mismatch {self.shape} != {x.shape} or device mismatch {self.device} != {x.device}"
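The DISK assign path now hands x.numpy().data to copyin, i.e. a plain C-contiguous memoryview over the host array rather than a RawBuffer. A quick illustration of what that object is:
import numpy as np

x = np.arange(4, dtype=np.float32)
mv = x.data                                   # the memoryview Buffer.copyin receives
assert isinstance(mv, memoryview) and mv.nbytes == 16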