dtypes nice and clean (#673)

* add dtype class * dtypes * buffers are lazy * dtype is tracked by lazybuffer and GenericShape * fix types in llvm * llvm store * dtype tests * fix tests maybe * fix flop counter * fix CI * CI fix and check format * fix dtype and dtype check * fix custom test * fix test graph
2023-03-10 16:56:07 -08:00 · 2023-03-10 16:56:07 -08:00 · 1826ff6b89
parent d26345595d
commit 1826ff6b89
20 changed files with 215 additions and 141 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -164,6 +164,7 @@ jobs:
    - name: Test openpilot model
      run: |
        ALLOWED_KERNEL_COUNT=197 FLOAT16=1 VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
+        python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000'
        DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py
        VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py

--- a/extra/thneed.py
+++ b/extra/thneed.py
@ -1,4 +1,4 @@
-# this can be constructed from a cl_cache or loaded from a thneed file 
+# this can be constructed from a cl_cache or loaded from a thneed file
 import time
 import struct
 import json
@ -26,7 +26,7 @@ class Thneed:
      for a in args[3:]:
        nodes[a]['out_edges'].append(args[2])
        nodes[args[2]]['in_edges'].append(a)
-    
+
    # get buffers to save
    self.buffers_to_save = set()
    self.outputs = []
@ -35,7 +35,7 @@ class Thneed:
        self.buffers_to_save.add(n)
      if len(nodes[n]['out_edges']) == 0:
        self.outputs.append(n)
-  
+
    fake_inputs = []
    for k,n in self.inputs.items():
      if n in self.buffers_to_save:
@ -97,7 +97,7 @@ class Thneed:
        else:
          # zero out buffers
          buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size'])
-        
+
      bufs[o['id']] = buf
      bufs_loaded[o['id']] = 'data' in o
      # if it's loaded, it's saved
@ -114,13 +114,13 @@ class Thneed:
        print("FAILED", k)
        traceback.print_exc()
        exit(0)
-    
+
    # load binaries
    for o in jdat['binaries']:
      nptr = ptr + o['length']
      prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], binary=True)
      ptr = nptr
-  
+
    # populate the cl_cache
    for i,k in enumerate(jdat['kernels']):
      kernel = prgs[k['name']]
@ -166,7 +166,7 @@ class Thneed:
        jdat['binaries'].append({"name":prg.name, "length":len(binary[0])})
        binaries.append(binary[0])
        saved_binaries.add(prg.name)
-      
+
      # get the args from the kernel, some need the data saved
      targs, args_size = [], []
      argdtypes = prg.argdtypes if prg.argdtypes is not None else [None]*(len(args)-2)
@ -196,6 +196,7 @@ class Thneed:
                cl.enqueue_copy(CL.cl_queue, data, a, is_blocking=True)
                weights.append(data.tobytes())
            elif isinstance(a, cl.Image):
+              assert a.format == cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT), "wrong type"
              needs_load = a in self.buffers_to_save
              row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64
              size = row_pitch * a.shape[1]
@ -244,7 +245,7 @@ class Thneed:
        "local_work_size": [1 for _ in args[0]] if args[1] is None else args[1],
        "num_args": len(args)-2,
        "args": targs,
-        "args_size": args_size 
+        "args_size": args_size
      })

    jdat['outputs'] = [{
--- a/run_multibackend.sh
+++ b/run_multibackend.sh
@ -0,0 +1,13 @@
+#!/bin/bash -e
+echo "********* CPU *********"
+CPU=1 python3 $@
+echo "********* GPU *********"
+GPU=1 python3 $@
+echo "********* METAL *********"
+METAL=1 python3 $@
+echo "********* CLANG *********"
+CLANG=1 python3 $@
+echo "********* LLVM *********"
+LLVM=1 python3 $@
+echo "********* TORCH *********"
+TORCH=1 python3 $@
--- a/test/test_custom_function.py
+++ b/test/test_custom_function.py
@ -4,7 +4,7 @@
 import unittest
 import numpy as np
 from typing import Optional, Tuple
-from tinygrad.helpers import prod
+from tinygrad.helpers import prod, dtypes

 # *** first, we implement the atan2 op at the lowest level ***
 # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
@ -16,6 +16,7 @@ from tinygrad.runtime.ops_cpu import CPUBuffer
 def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer:
  from tinygrad.runtime.ops_gpu import GPUBuffer
  assert type(a) == GPUBuffer and type(b) == GPUBuffer, "gpu function requires GPUBuffers"
+  assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
  ret = GPUBuffer(a.shape)
  ASTRunner("atan2", """
    __kernel void atan2(global float *c, global float *a, global float *b) {
@ -40,7 +41,7 @@ class ATan2(Function):
    assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
    self.a, self.b = a, b
    ast = LazyOp(LoadOps.CUSTOM, (a, b), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device])
-    return LazyBuffer(a.device, a.shape, LoadOps, ast)
+    return LazyBuffer(a.device, a.shape, LoadOps, ast, max(a.dtype, b.dtype))
  def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
    denom = (self.a.binary_op(BinaryOps.MUL, self.a)).binary_op(BinaryOps.ADD, self.b.binary_op(BinaryOps.MUL, self.b))
    return grad_output.binary_op(BinaryOps.MUL, self.b.binary_op(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@ -0,0 +1,41 @@
+import unittest
+import numpy as np
+from tinygrad.helpers import getenv
+from tinygrad.lazy import Device
+from tinygrad.tensor import Tensor, dtypes
+
+# for GPU, cl_khr_fp16 isn't supported
+# for LLVM, it segfaults because it can't link to the casting function
+@unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["GPU", "LLVM"], "float16 broken in some CI backends")
+class TestDtype(unittest.TestCase):
+  def test_half_to_np(self):
+    a = Tensor([1,2,3,4], dtype=dtypes.float16)
+    print(a)
+    na = a.numpy()
+    print(na, na.dtype, a.lazydata.realized)
+    assert na.dtype == np.float16
+
+  def test_half_add(self):
+    a = Tensor([1,2,3,4], dtype=dtypes.float16)
+    b = Tensor([1,2,3,4], dtype=dtypes.float16)
+    c = a+b
+    print(c.numpy())
+    assert c.dtype == dtypes.float16
+
+  def test_upcast_float(self):
+    # NOTE: there's no downcasting support
+    a = Tensor([1,2,3,4], dtype=dtypes.float16).float()
+    print(a)
+    na = a.numpy()
+    print(na, na.dtype)
+    assert na.dtype == np.float32
+
+  def test_half_add_upcast(self):
+    a = Tensor([1,2,3,4], dtype=dtypes.float16)
+    b = Tensor([1,2,3,4], dtype=dtypes.float32)
+    c = a+b
+    print(c.numpy())
+    assert c.dtype == dtypes.float32
+
+if __name__ == '__main__':
+  unittest.main()
--- a/test/unit/test_graph.py
+++ b/test/unit/test_graph.py
@ -14,10 +14,10 @@ class TestGraph(unittest.TestCase):
    assert nx.is_isomorphic(G, RG, node_match=lambda x,y: x["label"] == y["label"], edge_match=lambda x,y: x["label"] == y["label"] if "label" in y else True)

  def test_add_graph(self):
-    a = CPUBuffer.fromCPU(np.ones((4,4)))
-    b = CPUBuffer.fromCPU(np.ones((4,4)))
+    a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
+    b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
    ast = LazyOp(BinaryOps.ADD, (a,b))
-    ret = CPUBuffer(np.ones((4,4)))
+    ret = CPUBuffer(np.ones((4,4), dtype=np.float32))

    RG = nx.DiGraph()
    RG.add_node(0, label="(4, 4)")
@ -30,12 +30,12 @@ class TestGraph(unittest.TestCase):
    self.helper_compare_graph(RG)

  def test_add_sum_graph(self):
-    a = CPUBuffer.fromCPU(np.ones((4,4)))
-    b = CPUBuffer.fromCPU(np.ones((1,1)))
+    a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
+    b = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32))
    op0 = LazyOp(MovementOps.RESHAPE, (b,), (4, 4))
    op1 = LazyOp(BinaryOps.ADD, (a,op0))
    ast = LazyOp(ReduceOps.SUM, (op1,), (1,1))
-    ret = CPUBuffer(np.ones((1,1)))
+    ret = CPUBuffer(np.ones((1,1), dtype=np.float32))

    RG = nx.DiGraph()
    RG.add_node(0, label="(4, 4)")
@ -48,14 +48,14 @@ class TestGraph(unittest.TestCase):
    self.helper_compare_graph(RG)

  def test_add_graph_prune(self):
-    a = CPUBuffer.fromCPU(np.ones((1,1)))
+    a = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32))
    ast = LazyOp(MovementOps.RESHAPE, (a,), (4, 4))
-    ret = CPUBuffer(np.ones((4,4)))
+    ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
    log_op(ret, ast, show_graph=True)

-    b = CPUBuffer.fromCPU(np.ones((4,4)))
+    b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32))
    ast = LazyOp(BinaryOps.ADD, (ret,b))
-    ret = CPUBuffer(np.ones((4,4)))
+    ret = CPUBuffer(np.ones((4,4), dtype=np.float32))
    log_op(ret, ast, show_graph=True)
    prune_graph()

--- a/tinygrad/codegen/ast.py
+++ b/tinygrad/codegen/ast.py
@ -1,7 +1,7 @@
 import itertools
 from enum import Enum, auto
 from typing import List, Tuple
-from tinygrad.helpers import prod, dedup, all_same, colored
+from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
 from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers
 from tinygrad.shape import ShapeTracker, View, strides_for_shape

@ -26,7 +26,7 @@ class Token:
    if len(self.axis) == 0: return [0]
    acc_strides = [x*(1-self.axis[::-1][i][2]) for i,x in enumerate(strides_for_shape(tuple(1 if r else s for s,_,r in self.axis[::-1])))]
    return [sum(t) for t in itertools.product(*[[y*acc_strides[i] for y in range(x[0])] for i,x in enumerate(self.axis[::-1])])]
-  def decltype(self): return ('float' if self.typ == Types.FLOAT else 'float4') + ('*' if self.ptr else str())
+  def decltype(self, dtype=dtypes.float32): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str())
  def __repr__(self): return f"<{self.typ}{'*' if self.ptr else str()} {self.tok}{f'[{self.axis}]' if len(self.axis) else str()}>"

 # ast kernel can contain one ReduceOp with arbitrary Binary/Unary ops
@ -55,8 +55,9 @@ class ASTKernel:
          break

    # create the buffer we are returning (as the same type as the input buffers) and add it as the first buffer
-    self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True)
-    self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret)] if output_shape else [self.ret]) + self.bufs
+    self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True, dtype=self.info.dtype)
+    assert self.ret.dtype == self.info.dtype, f"return dtype {self.ret.dtype} != {self.info.dtype}"
+    self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret, dtype=self.info.dtype)] if output_shape else [self.ret]) + self.bufs

    # key for lookup in cache (can change, str might not be right)
    # bufs are needed because kernels like f(x) = x + x and f(x, y) = x + y have the same str(ast), but are different kernels.
@ -121,7 +122,7 @@ class ASTKernel:
    if print_shapetrackers:
      for st in self.sts: print(st)
    for i in range(len(self.sts)):
-      print(prefix, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")
+      print(prefix, self.bufs[i].dtype, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE")

  @property
  def shape_len(self) -> int: return len(self.sts[0].shape)
--- a/tinygrad/codegen/gpu.py
+++ b/tinygrad/codegen/gpu.py
@ -25,6 +25,7 @@ class GPULanguage(NamedTuple):
  lid : List[str] = []
  extra_args : List[str] = []
  float4 : Optional[str] = None
+  half_prekernel : Optional[str] = None

 def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]:
  idy = (idxy//(4*base_shape[1]))
@ -266,6 +267,8 @@ class GPUCodegen(ASTKernel):
    self.prekernel : Set[str] = set()
    self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs if buf is not None) else []

+    if self.lang.half_prekernel: self.prekernel.add(self.lang.half_prekernel+"\n")
+
    if len(self.lang.gid) == 0:
      self.kernel += [f"for (int idx{i} = 0; idx{i} < {self.output_shape[i]}; idx{i}++) {{\n" for i in range(0, len(self.output_shape))]
    else:
@ -320,7 +323,7 @@ class GPUCodegen(ASTKernel):
    self.kernel.append("\n}")

    # concat kernel into prg
-    buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype()+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None]
+    buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype(self.bufs[i].dtype)+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None]
    prg = ' '.join(list(self.prekernel) + [f"{self.lang.kernel_prefix} void KERNEL_NAME_PLACEHOLDER(",] +
      [', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + self.lang.extra_args)] +
      [") {\n"] + self.kernel)
--- a/tinygrad/codegen/llvm.py
+++ b/tinygrad/codegen/llvm.py
@ -3,7 +3,7 @@ from typing import ClassVar, List
 from llvmlite import ir  # type: ignore
 from tinygrad.codegen.ast import ASTKernel
 from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, ASTRunner
-from tinygrad.helpers import DEBUG, prod
+from tinygrad.helpers import DEBUG, prod, dtypes

 from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, GeNode, LtNode, SumNode, AndNode
 def int_const(x): return ir.Constant(ir.IntType(64), x)
@ -105,7 +105,8 @@ class LLVMCodegen(ASTKernel):

    # create llvm function
    module = ir.Module(name=__file__)
-    func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(self.bufs))), name='exec')
+    func_dtypes = [{dtypes.float16:ir.HalfType(), dtypes.float32:ir.FloatType()}[buf.dtype] for buf in self.bufs]
+    func = ir.Function(module, ir.FunctionType(ir.VoidType(), [x.as_pointer() for x in func_dtypes]), name='exec')

    # force llvmlite to allow us to add function attribute then add the attribute
    func.attributes._known = func.attributes._known.union(frozenset(['"no-nans-fp-math"="true"']))
@ -143,9 +144,11 @@ class LLVMCodegen(ASTKernel):
            # this always does the load, so we have it load *0 if the arg won't be used
            # TODO: would control flow be faster?
            aug_idx = builder.select(valid, idx, int_const(0))
-            element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(ir.FloatType(), 0))
+            element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(func_dtypes[buf_index], 0))
          else:
            element = builder.load(builder.gep(func.args[buf_index], [idx], inbounds=True))
+          # upcast
+          if func_dtypes[buf_index] != ir.FloatType(): element = builder.fpext(element, ir.FloatType())
          m = element if kernel_output_dim == 1 else builder.insert_element(m, element, int_const(i))
        return m
      if isinstance(x.op, ReduceOps):
@ -194,6 +197,7 @@ class LLVMCodegen(ASTKernel):
    builder = loop_exit[store_loop]
    for i, idx in enumerate(get_idxs(builder, idx_level[0][store_loop], 0)):
      element = result if kernel_output_dim == 1 else builder.extract_element(result, int_const(i))
+      if func_dtypes[0] != ir.FloatType(): element = builder.fptrunc(element, func_dtypes[0])
      builder.store(element, builder.gep(func.args[0], [idx], inbounds=True))

    # add the looping
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@ -1,5 +1,6 @@
 import os, math, functools
-from typing import Tuple, Union, List
+import numpy as np
+from typing import Tuple, Union, List, NamedTuple

 def dedup(x): return list(dict.fromkeys(x))   # retains list order
 def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x)
@ -16,3 +17,17 @@ def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"
 def getenv(key, default=0): return type(default)(os.getenv(key, default))

 DEBUG, IMAGE = getenv("DEBUG", 0), getenv("IMAGE", 0)
+
+# **** tinygrad now supports dtypes! *****
+
+class DType(NamedTuple):
+  itemsize : int
+  name : str
+  np : type  # TODO: someday this will be removed with the "remove numpy" project
+  def __repr__(self): return f"dtypes.{self.name}"
+
+class dtypes:
+  float16 = half = DType(2, "half", np.float16)
+  float32 = float = DType(4, "float", np.float32)
+  @staticmethod
+  def from_np(x:np.ndarray) -> DType: return {np.dtype(np.float16): dtypes.float16, np.dtype(np.float32): dtypes.float32}[np.dtype(x.dtype)]
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type
 import os, sys, weakref, importlib, inspect, functools
 from weakref import WeakValueDictionary
-from tinygrad.helpers import prod, getenv
+from tinygrad.helpers import prod, getenv, DType, dtypes
 from tinygrad.shape import ShapeTracker, get_contraction
 from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers
 from tinygrad.graph import log_op
@ -13,21 +13,12 @@ sys.setrecursionlimit(10000)
 OPT = getenv("OPT", 2)
 LAZY = getenv("LAZY", 1)

-def get_buffer(name, base='tinygrad.runtime'):
-  try:
-    return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0]
-  except Exception as e:  # NOTE: this can't be put on one line due to mypy issue
-    print(name, "backend not available", e, file=sys.stderr)
-
 class _Device:
  def __init__(self) -> None:
-    # TODO: make this dynamic to when you try to access the _buffers
-    self._buffers : Dict[str, Type[DeviceBuffer]] = {x.upper():get_buffer(x) for x in
-      [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")] if x is not None}
-    self.DEFAULT : str = "CPU"
-    for name in self._buffers:
-      if getenv(name) == 1: self.DEFAULT = name  # note: DEFAULT can be a Device that can't be imported. better than silent use of a different device
-      if self._buffers[name] is not None: self.__setattr__(name, name)
+    self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]}
+    self.DEFAULT : str = functools.reduce(lambda val, ele: val if getenv(val) == 1 else ele, self._buffers, "CPU")
+  @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
+  def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0]
 Device = _Device()

 # TODO: movement ops that only change shape are really nops. treat them as such
@ -81,9 +72,9 @@ def replace_with_movement_op(y:Union[LazyOp, LazyBuffer], op:MovementOps, arg:Tu
  return elementwise_op(y.op, *[replace_with_movement_op(z, op, arg) for z in y.src])   # type: ignore

 class LazyNumpyArray:
-  def __init__(self, fxn, shape): self.fxn, self.shape = fxn, shape
-  def __call__(self): return self.fxn(self.shape)
-  def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape)
+  def __init__(self, fxn, shape, dtype): self.fxn, self.shape, self.dtype = fxn, shape, dtype
+  def __call__(self): return self.fxn(self.shape, self.dtype)
+  def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype)
  def copy(self): return self
  def astype(self, typ): return self

@ -91,32 +82,32 @@ def support_weakref(x): return x
@support_weakref  # needed for mypyc, this prevents LazyBuffer from becoming a native class
 class LazyBuffer:
  __deletable__ = ('op',)
-  lazycache : ClassVar[WeakValueDictionary[Tuple[str, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary()
-  def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp):
+  lazycache : ClassVar[WeakValueDictionary[Tuple[str, DType, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary()
+  def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType):
    # fromcpu aren't cached
    if optype == LoadOps and op.op == LoadOps.FROMCPU:
      return super().__new__(cls)
-    wop = (device, optype, get_weakop(op))   # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker
+    wop = (device, dtype, optype, get_weakop(op))   # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker
    # NOTE: we need "ret" to prevent the new buffer from being immediately deleted
    if wop not in LazyBuffer.lazycache: LazyBuffer.lazycache[wop] = ret = super().__new__(cls)
    else: ret = LazyBuffer.lazycache[wop]
    return ret

-  def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp):
+  def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType):
    if hasattr(self, 'device'):
      return  # cache hit, we return and don't reinit
    self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
-    self.shape, self.optype, self.op = self.st.shape, optype, op
+    self.shape, self.optype, self.op, self.dtype = self.st.shape, optype, op, dtype
    self.realized : Optional[DeviceBuffer] = None
    self.output_buffer : Optional[DeviceBuffer] = None
-    self.device, self.dbuffer = device, Device._buffers[device]
+    self.device, self.dbuffer = device, Device[device]
    # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
    self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
    # NOTE: op should be read only after construction of LazyBuffer
    for x in get_buffers(op): x.children.add(self)
    if not LAZY: self.realize()

-  def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>"
+  def __repr__(self): return f"<LB {self.shape} {self.dtype} op:{self.op.op if self.realized is None else 'realized'}>"

  # this produces a device buffer
  def realize(self:LazyBuffer, required_device=None) -> DeviceBuffer:
@ -124,7 +115,7 @@ class LazyBuffer:
    if self.realized is None:
      # get real ops first
      if self.op.op == LoadOps.FROMCPU:
-        self.realized = Device._buffers[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg)
+        self.realized = Device[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg)
        ast = LazyOp(self.op.op, tuple())
      elif self.op.op == LoadOps.CONTIGUOUS:
        real_src = self.op.src[0].realize(self.device)
@ -160,12 +151,13 @@ class LazyBuffer:
      log_op(self.realized, ast)

    assert self.realized.shape == self.shape, f"shape mismatch on realize got {self.realized.shape} expected {self.shape}"
-    assert isinstance(self.realized, Device._buffers[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
+    assert isinstance(self.realized, Device[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}"
+    assert self.realized.dtype == self.dtype, f"dtype mismatch on realize got {self.realized.dtype} expected {self.dtype}"
    return self.realized

  # NOTE: we have to make a copy of the numpy array here in case the user changes it. expose this?
  @staticmethod
-  def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()))
+  def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()), dtypes.from_np(x))
  def toCPU(self):
    ret = self.realize().toCPU()
    log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None))
@ -173,11 +165,11 @@ class LazyBuffer:

  def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self)
  def binary_op(self:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, self, y)
-  def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)))
+  def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)), self.dtype)

  def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
    if self.shape == tuple(new_shape): return self
-    return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape))
+    return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape), self.dtype)

  def movement_op(self:LazyBuffer, op:MovementOps, arg:Tuple[Any, ...]) -> LazyBuffer:
    # very instant nop
@ -228,7 +220,7 @@ class LazyBuffer:
      return replace_with_movement_op(self.op, op, arg)

    # create the buffer
-    ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg))
+    ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg), self.dtype)

    # if the ShapeTracker becomes contiguous, replace the whole thing with a reshape (or nothing if shapes match)
    # NOTE: if ret is in the cache, it can already be realized
@ -241,7 +233,7 @@ class LazyBuffer:
    return ret

 def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffer:
-  out_device, out_shape = srcs[0].device, srcs[0].shape
+  out_device, out_shape, out_dtype = srcs[0].device, srcs[0].shape, max(x.dtype for x in srcs)

  # push all contiguous to the end of BinaryOps. kernels 198 -> 196
  if PUSH_CONTIGUOUS and any(x.realized is None and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1 for x in srcs):
@ -258,4 +250,4 @@ def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffe
    # remove the buffers from any (childless) BinaryOps that feed into this
    srcs = tuple(x.op if x.optype == BinaryOps and len(x.children) == 0 and x.realized is None else x for x in srcs)  # type: ignore

-  return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs))
+  return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs), out_dtype)
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@ -3,7 +3,7 @@ import functools, itertools, operator, random
 import numpy as np
 from enum import Enum, auto
 from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict, TypeVar, Set
-from tinygrad.helpers import prod, DEBUG, getenv
+from tinygrad.helpers import prod, DEBUG, getenv, DType, dtypes
 from tinygrad.shape import ShapeTracker

 # these are the llops your accelerator must implement, along with toCpu
@ -39,17 +39,19 @@ class Copyable:
  def toCPU(self:Copyable) -> np.ndarray: raise NotImplementedError("must be implemented")

 class RawBuffer(Copyable):  # pylint: disable=abstract-method
-  def __init__(self, size:int):
+  def __init__(self, size:int, dtype:DType):
    self.size : int = size
-    GlobalCounters.mem_used += self.size
-  def __del__(self): GlobalCounters.mem_used -= self.size
+    self.dtype : DType = dtype
+    self._memsz : int = size*dtype.itemsize
+    GlobalCounters.mem_used += self._memsz
+  def __del__(self): GlobalCounters.mem_used -= self.size*self._memsz

 class RawBufferCopyIn(RawBuffer):
  def copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")

  @classmethod
  def fromCPU(cls, x:np.ndarray):
-    ret = cls(4*prod(x.shape))
+    ret = cls(prod(x.shape), dtypes.from_np(x))
    ret.copyin(x)
    return ret

@ -57,7 +59,7 @@ class RawBufferCopyInOut(RawBufferCopyIn):
  def copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented")

  def toCPU(self) -> np.ndarray:
-    x = np.empty((self.size//4), dtype=np.float32)
+    x: np.ndarray = np.empty(self.size, dtype=self.dtype.np)
    self.copyout(x)
    return x

@ -65,26 +67,27 @@ class RawBufferCopyInOut(RawBufferCopyIn):
 class DeviceBuffer(Copyable):
  _buf: Any                # underlying buffer
  shape: Tuple[int, ...]
+  dtype: DType
  @classmethod
  def exec_ast(cls, ast:LazyOp, output_buffer=None): raise NotImplementedError("must be implemented")

 # this is a quick "buffer" class for flop tracking and getting the output shape
 class GenericShape:
-  def __init__(self, shape:Tuple[int, ...], flops:int=0): self.shape, self.flops = shape, flops
+  def __init__(self, shape:Tuple[int, ...], dtype:DType=dtypes.float32, flops:int=0): self.shape, self.dtype, self.flops = shape, dtype, flops
  def consume_flops(self):
    self.flops, ret = 0, self.flops
    return ret
 shape_fxn_for_op : Dict[Op, Callable] = {
-  **{op:lambda self: GenericShape(self.shape, self.consume_flops() + prod(self.shape)) for op in UnaryOps},
-  **{op:lambda self,y: GenericShape(self.shape, self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
-  **{op:lambda self,new_shape: GenericShape(new_shape, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
-  **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.consume_flops()), op) for op in MovementOps}}
+  **{op:lambda self: GenericShape(self.shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in UnaryOps},
+  **{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps},
+  **{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps},
+  **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}}
+def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf

 # used in CPUBuffer and TorchBuffer
 class InterpretedBuffer(DeviceBuffer):  # pylint: disable=abstract-method
  fxn_for_op : ClassVar = shape_fxn_for_op
-  # TODO: use generic types here to remove __init__ in specialized classes
-  def __init__(self, lbuf:Any): self._buf, self.shape = lbuf, tuple(lbuf.shape)
+  def __init__(self, lbuf:Any): self._buf, self.shape, self.dtype = lbuf, tuple(lbuf.shape), self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype
  def contiguous(self): return type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,)))
  def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self._buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self._buf, op.name.lower())(arg))
  @classmethod
@ -101,12 +104,11 @@ class InterpretedBuffer(DeviceBuffer):  # pylint: disable=abstract-method
    else: ret = cls(cls.fxn_for_op[ast.op](*([x._buf for x in srcs] + ([ast.arg] if ast.arg else []))))
    context[ast] = ret
    if output_buffer is not None:
-      assert output_buffer.shape == ret.shape
+      assert output_buffer.shape == ret.shape, output_buffer.dtype == ret.dtype
      output_buffer._buf = ret._buf
      return output_buffer
    else:
      return ret
-def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape)) for x in get_buffers(ast)}, ast))._buf

 class ASTRunner:
  def __init__(self, name, prg, bufs_to_delete:Optional[Set[int]]=None, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0):
@ -139,7 +141,7 @@ class ASTRunner:
  def optimize_local_size(self, rawbufs:List[RawBuffer]) -> List[int]:
    assert self.global_size is not None, "needs a global size to optimize local size"
    if any(x == rawbufs[0] for x in rawbufs[1:]):  # this is an assignment, replace the output buffer
-      output_replacement = type(rawbufs[0])(rawbufs[0].size)
+      output_replacement = type(rawbufs[0])(rawbufs[0].size, rawbufs[0].dtype)
      rawbufs = [output_replacement if x == rawbufs[0] else x for x in rawbufs]
    MAX_WORKGROUP = self.clprg.max_work_group_size() if hasattr(self.clprg, 'max_work_group_size') else 1024
    local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in self.global_size]
@ -149,32 +151,36 @@ class ASTRunner:
 # assumes you are using ShapeTracker
 # used in GPUBuffer and LLVMBuffer
 class CompiledBuffer(DeviceBuffer):  # pylint: disable=abstract-method
-  def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False):
+  def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32):
    self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
    self.shape = self.st.shape
+    self.dtype = dtype
+    assert hostbuf is None or hostbuf.dtype == dtype, f"hostbuf dtype {hostbuf.dtype} != {dtype}"
    self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape
    self._buf = hostbuf._buf if hostbuf is not None else None
    self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing
    if (self._backing is not None and self._backing.shape != (1,)) or force_create: self.raw()

  # TODO: not GPUBuffer, get name of class
-  def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))")
+  # TODO: needs dtype
+  def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__})))" if self._backing is not None else ", force_create=True))")

  raw_buffer_type : Type[RawBuffer]
  @classmethod
-  def create_raw_buffer(cls, shape, backing) -> RawBuffer:
+  def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer:
    assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape"
    assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching"
-    return cls.raw_buffer_type(4*prod(shape)) if backing is None else cls.raw_buffer_type.fromCPU(backing)
+    if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}")
+    return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing)
  def raw(self) -> RawBuffer:
    if self._buf is None:
      if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}")
-      self._buf = self.create_raw_buffer(self._base_shape, self._backing)
+      self._buf = self.create_raw_buffer(self._base_shape, self._backing, self.dtype)
      self._backing = None
    return self._buf

  @classmethod
-  def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
+  def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.ravel(), dtype=dtypes.from_np(x))
  def toCPU(self) -> np.ndarray:
    assert GlobalCounters.cache is None, f"can't copy out {self} while caching"
    if DEBUG >= 3: print(f"**** copy out {self.shape}")
@ -192,7 +198,7 @@ class CompiledBuffer(DeviceBuffer):  # pylint: disable=abstract-method
      prg = cls.method_cache[k.key]
    else:
      prg = k.codegen().build(cls.runtime_type)
-    if getenv("PRINT_AST", "") == prg.name:
+    if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1":
      k.print()
      print(prg.prg)
    prg.exec(k.bufs)
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@ -2,21 +2,22 @@ import os, time, ctypes, hashlib, subprocess, platform
 import numpy as np
 from collections import defaultdict
 from typing import Final, Dict
+from tinygrad.helpers import dtypes, DType
 from tinygrad.ops import CompiledBuffer, RawBufferCopyIn
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 class RawMallocBuffer(RawBufferCopyIn):
-  def __init__(self, size):
-    super().__init__(size)
-    self._buf = (ctypes.c_float * (size//4))()
+  def __init__(self, size, dtype : DType):
+    super().__init__(size, dtype)
+    self._buf = ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16}[dtype] * size)()
  def _buffer(self): return self._buf
-  def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*4)
-  def toCPU(self): return np.ctypeslib.as_array(self._buf)
+  def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*np.dtype(x.dtype).itemsize)
+  def toCPU(self): return np.frombuffer(self._buf, dtype=self.dtype.np)

 class ClangProgram:
  kernel_cnt : Final[Dict[str, int]] = defaultdict(int)
  def __init__(self, name:str, prg:str):
-    prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n" + prg
+    prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n" + prg
    # TODO: is there a way to not write this to disk?
    fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if platform.system() == 'Darwin' else 'so'}"
    if not os.path.exists(fn):
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@ -1,6 +1,7 @@
 import numpy as np
 import operator
 from typing import ClassVar, Callable, Dict, Tuple
+from tinygrad.helpers import dtypes
 from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, InterpretedBuffer, Op

 def shape_to_axis(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Tuple[int, ...]:
@ -35,6 +36,7 @@ numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{

 class CPUBuffer(InterpretedBuffer):
  fxn_for_op : ClassVar = numpy_fxn_for_op
+  to_tinygrad_dtype = staticmethod(dtypes.from_np)

  @staticmethod
  def fromCPU(x): return CPUBuffer(x)
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@ -8,9 +8,9 @@ from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 class RawCUDABuffer(RawBufferCopyInOut):
-  def __init__(self, size):
-    super().__init__(size)
-    self._cl = cuda.mem_alloc(size)
+  def __init__(self, size, dtype):
+    super().__init__(size, dtype)
+    self._cl = cuda.mem_alloc(self._memsz)
  def copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, x, stream)
  def copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._cl)

@ -42,6 +42,7 @@ class CUDAProgram:
 class CUDACodegen(GPUCodegen):
  lang = GPULanguage(
    kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
+    half_prekernel = "#include <cuda_fp16.h>",
    gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)],
    lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])

--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@ -2,15 +2,13 @@ from __future__ import annotations
 import platform, functools
 import numpy as np
 import pyopencl as cl  # type: ignore
-from typing import Dict, Optional, List, ClassVar, Final
-from collections import defaultdict
-from tinygrad.helpers import IMAGE, DEBUG, getenv
+from typing import Optional, List, Final
+from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes
 from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

 OSX = platform.system() == "Darwin"
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
-CLCACHE = getenv("CLCACHE", 1)
 FLOAT16 = getenv("FLOAT16", 0)

 class _CL:
@ -27,33 +25,18 @@ class _CL:
 CL = _CL()

 class CLBuffer(RawBufferCopyInOut):
-  # TODO: this can be in RawBuffer generically
-  BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)
-
-  def __init__(self, size):  # pylint: disable=super-init-not-called
-    self.size = size
-    if len(CLBuffer.BUFFER_CACHE[size]) > 0:
-      self._cl = CLBuffer.BUFFER_CACHE[size].pop()
-    else:
-      # TODO: on GPU OOM, clear the cache
-      self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)
-      GlobalCounters.mem_used += self._cl.size
-
-  def __del__(self):
-    if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)
-    else: GlobalCounters.mem_used -= self._cl.size
-
+  def __init__(self, size, dtype):
+    super().__init__(size, dtype)
+    self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, self._memsz)
  def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)
  def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)

 class CLImage(RawBuffer):  # pylint: disable=abstract-method
-  fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)
  IMAGE : Final = True
-
-  def __init__(self, shape):  # pylint: disable=super-init-not-called
-    self.size, self._cl = shape, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))
+  def __init__(self, shape, dtype=dtypes.float16 if getenv("FLOAT16") else dtypes.float32):  # pylint: disable=super-init-not-called
+    fmt = cl.ImageFormat(cl.channel_order.RGBA, {dtypes.float16: cl.channel_type.HALF_FLOAT, dtypes.float32: cl.channel_type.FLOAT}[dtype])
+    self.size, self.dtype, self._cl = shape, dtype, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=(shape[1], shape[0]))
    GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
-
  def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height

@functools.lru_cache(maxsize=None)
@ -89,6 +72,7 @@ class CLProgram:
 class CLCodegen(GPUCodegen):
  lang = GPULanguage(
    kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",
+    half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable",
    barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
    gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])

@ -96,8 +80,8 @@ class GPUBuffer(CompiledBuffer):
  raw_buffer_type = CLBuffer
  # override this method for image
  @classmethod
-  def create_raw_buffer(cls, shape, backing) -> RawBuffer:
-    if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)
-    else: return super().create_raw_buffer(shape, backing)
+  def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer:
+    if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)   # NOTE: this is a hack. we don't pass in the dtype here, it's controlled by the FLOAT16 env var
+    else: return super().create_raw_buffer(shape, backing, dtype)
  codegen_type = CLCodegen
  runtime_type = CLProgram
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@ -57,7 +57,7 @@ class LLVMProgram:
  def __del__(self): LLVM.engine.remove_module(self.mod)

  def __call__(self, unused_global_size, unused_local_size, *bufs, wait=False):
-    cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for _ in bufs])(self.fxn)
+    cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_void_p for _ in bufs])(self.fxn)
    if wait: st = time.monotonic()
    cfunc(*[x._buf for x in bufs])
    if wait: return time.monotonic()-st
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@ -4,7 +4,7 @@ import Metal, Cocoa, libdispatch # type: ignore
 import numpy as np
 from typing import List, Any
 from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
-from tinygrad.helpers import prod, getenv, DEBUG
+from tinygrad.helpers import prod, getenv, DEBUG, DType
 from tinygrad.ops import CompiledBuffer, RawBufferCopyIn

 METAL_XCODE = getenv("METAL_XCODE")
@ -20,14 +20,14 @@ class _METAL:
 METAL = _METAL()

 class RawMetalBuffer(RawBufferCopyIn):
-  def __init__(self, size):
-    super().__init__(size)
-    self._cl = METAL.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared)
+  def __init__(self, size:int, dtype:DType):
+    super().__init__(size, dtype)
+    self._cl = METAL.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared)
  def __del__(self):
    self._cl.release()
    super().__del__()
  def _buffer(self): return self._cl.contents().as_buffer(self._cl.length())
-  def _as_np(self, dtype=np.float32): return np.frombuffer(self._buffer(), dtype=dtype)
+  def _as_np(self): return np.frombuffer(self._buffer(), dtype=self.dtype.np)
  def copyin(self, x:np.ndarray): np.copyto(self._as_np(), x.reshape(-1).data)
  def toCPU(self) -> np.ndarray:
    for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
--- a/tinygrad/runtime/ops_torch.py
+++ b/tinygrad/runtime/ops_torch.py
@ -1,7 +1,7 @@
 import torch
 from typing import ClassVar, Dict, Callable
 from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, InterpretedBuffer, Op
-from tinygrad.helpers import getenv
+from tinygrad.helpers import getenv, dtypes
 from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc

 torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
@ -14,6 +14,7 @@ torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{
 device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu"))
 class TorchBuffer(InterpretedBuffer):
  fxn_for_op : ClassVar = torch_fxn_for_op
+  to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype])

  @staticmethod
  def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device))
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@ -3,7 +3,7 @@ from __future__ import annotations
 import math, functools, itertools
 import numpy as np
 from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence
-from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten
+from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten, DType, dtypes
 from tinygrad.lazy import Device, LazyBuffer, LazyNumpyArray
 from tinygrad.image import image_conv2d_decorator, image_dot_decorator

@ -32,18 +32,20 @@ class Tensor:
  __deletable__ = ('_ctx',)
  training : ClassVar[bool] = False
  no_grad : ClassVar[bool] = False
+  default_type : DType = dtypes.float32

-  def __init__(self, data, device=Device.DEFAULT, requires_grad:Optional[bool]=None):
+  def __init__(self, data, device=Device.DEFAULT, dtype:Optional[DType]=None, requires_grad:Optional[bool]=None):
    if isinstance(data, list):
-      data = np.array(data, dtype=np.float32)
+      data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np)
    elif isinstance(data, LazyBuffer) and data.device != device:
      # TODO: this has to realize, it shouldn't have to
      data = data.realize().toCPU()

    if isinstance(data, (np.ndarray, LazyNumpyArray)):
      data = data if data.shape else data.reshape((1,))
-      self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device)
+      self.lazydata = LazyBuffer.fromCPU(data.astype(dtype.np) if dtype is not None else data, device)
    elif isinstance(data, LazyBuffer):
+      assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported"
      self.lazydata = data
    else:
      raise RuntimeError(f"can't create Tensor from {data}")
@ -64,13 +66,12 @@ class Tensor:
  @property
  def shape(self) -> Tuple[int, ...]: return self.lazydata.shape

-  # dtype handling was very broken. it's always float32 now
-  @property
-  def dtype(self) -> type: return np.float32
-
  @property
  def device(self) -> str: return self.lazydata.device

+  @property
+  def dtype(self) -> DType: return self.lazydata.dtype
+
  # ***** data handlers ****

  def realize(self) -> Tensor:
@ -132,11 +133,11 @@ class Tensor:
  def manual_seed(seed=None): Tensor._rng = np.random.default_rng(seed=seed)

  @staticmethod
-  def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.random(size=shape, dtype=np.float32), shape), **kwargs)
+  def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.random(size=shape, dtype=dtype), shape, np.float32), **kwargs)

  # TODO: replace with a transformation from uniform -> gaussian
  @staticmethod
-  def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.standard_normal(size=shape, dtype=np.float32), shape), **kwargs)
+  def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.standard_normal(size=shape, dtype=dtype), shape, np.float32), **kwargs)

  # ***** rng hlops *****

@ -442,10 +443,16 @@ class Tensor:

  def dropout(self, p=0.5) -> Tensor:
    if not Tensor.training: return self
-    _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
+    # TODO: why is this going through numpy?
+    _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=np.float32)
    return self * Tensor(_mask, requires_grad=False, device=self.device) * (1/(1.0 - p))

+  # ***** cast ops *****
+
+  # TODO: this is a hack, but if we add float(0), it will become a float. need real casting support
+  def float(self) -> Tensor: return self.add(Tensor([0], device=self.device, dtype=dtypes.float32, requires_grad=self.requires_grad))
+
 # register functions to move between devices
-for device in [device for device in Device._buffers.keys() if device[0] != "_"]:
+for device in Device._buffers:
  setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, device))
  setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, device))