diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8802742e8..3b5e0922d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -164,6 +164,7 @@ jobs: - name: Test openpilot model run: | ALLOWED_KERNEL_COUNT=197 FLOAT16=1 VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py + python3 -c 'import os; assert os.path.getsize("/tmp/output.thneed") < 100_000_000' DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py VALIDHACKS=1 DEBUGCL=1 GPU=1 IMAGE=2 python3 openpilot/compile.py diff --git a/extra/thneed.py b/extra/thneed.py index b9b896ca6..32b1b2700 100644 --- a/extra/thneed.py +++ b/extra/thneed.py @@ -1,4 +1,4 @@ -# this can be constructed from a cl_cache or loaded from a thneed file +# this can be constructed from a cl_cache or loaded from a thneed file import time import struct import json @@ -26,7 +26,7 @@ class Thneed: for a in args[3:]: nodes[a]['out_edges'].append(args[2]) nodes[args[2]]['in_edges'].append(a) - + # get buffers to save self.buffers_to_save = set() self.outputs = [] @@ -35,7 +35,7 @@ class Thneed: self.buffers_to_save.add(n) if len(nodes[n]['out_edges']) == 0: self.outputs.append(n) - + fake_inputs = [] for k,n in self.inputs.items(): if n in self.buffers_to_save: @@ -97,7 +97,7 @@ class Thneed: else: # zero out buffers buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size']) - + bufs[o['id']] = buf bufs_loaded[o['id']] = 'data' in o # if it's loaded, it's saved @@ -114,13 +114,13 @@ class Thneed: print("FAILED", k) traceback.print_exc() exit(0) - + # load binaries for o in jdat['binaries']: nptr = ptr + o['length'] prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], binary=True) ptr = nptr - + # populate the cl_cache for i,k in enumerate(jdat['kernels']): kernel = prgs[k['name']] @@ -166,7 +166,7 @@ class Thneed: jdat['binaries'].append({"name":prg.name, "length":len(binary[0])}) binaries.append(binary[0]) saved_binaries.add(prg.name) - + # get the args from the kernel, some need the data saved targs, args_size = [], [] argdtypes = prg.argdtypes if prg.argdtypes is not None else [None]*(len(args)-2) @@ -196,6 +196,7 @@ class Thneed: cl.enqueue_copy(CL.cl_queue, data, a, is_blocking=True) weights.append(data.tobytes()) elif isinstance(a, cl.Image): + assert a.format == cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT), "wrong type" needs_load = a in self.buffers_to_save row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64 size = row_pitch * a.shape[1] @@ -244,7 +245,7 @@ class Thneed: "local_work_size": [1 for _ in args[0]] if args[1] is None else args[1], "num_args": len(args)-2, "args": targs, - "args_size": args_size + "args_size": args_size }) jdat['outputs'] = [{ diff --git a/run_multibackend.sh b/run_multibackend.sh new file mode 100755 index 000000000..c20ef3a33 --- /dev/null +++ b/run_multibackend.sh @@ -0,0 +1,13 @@ +#!/bin/bash -e +echo "********* CPU *********" +CPU=1 python3 $@ +echo "********* GPU *********" +GPU=1 python3 $@ +echo "********* METAL *********" +METAL=1 python3 $@ +echo "********* CLANG *********" +CLANG=1 python3 $@ +echo "********* LLVM *********" +LLVM=1 python3 $@ +echo "********* TORCH *********" +TORCH=1 python3 $@ diff --git a/test/test_custom_function.py b/test/test_custom_function.py index 4cccf8286..3c283a590 100644 --- a/test/test_custom_function.py +++ b/test/test_custom_function.py @@ -4,7 +4,7 @@ import unittest import numpy as np from typing import Optional, Tuple -from 
tinygrad.helpers import prod +from tinygrad.helpers import prod, dtypes # *** first, we implement the atan2 op at the lowest level *** # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers @@ -16,6 +16,7 @@ from tinygrad.runtime.ops_cpu import CPUBuffer def atan2_gpu(a:CompiledBuffer, b:CompiledBuffer) -> CompiledBuffer: from tinygrad.runtime.ops_gpu import GPUBuffer assert type(a) == GPUBuffer and type(b) == GPUBuffer, "gpu function requires GPUBuffers" + assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32" ret = GPUBuffer(a.shape) ASTRunner("atan2", """ __kernel void atan2(global float *c, global float *a, global float *b) { @@ -40,7 +41,7 @@ class ATan2(Function): assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch" self.a, self.b = a, b ast = LazyOp(LoadOps.CUSTOM, (a, b), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device]) - return LazyBuffer(a.device, a.shape, LoadOps, ast) + return LazyBuffer(a.device, a.shape, LoadOps, ast, max(a.dtype, b.dtype)) def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]: denom = (self.a.binary_op(BinaryOps.MUL, self.a)).binary_op(BinaryOps.ADD, self.b.binary_op(BinaryOps.MUL, self.b)) return grad_output.binary_op(BinaryOps.MUL, self.b.binary_op(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \ diff --git a/test/test_dtype.py b/test/test_dtype.py new file mode 100644 index 000000000..e1119b32f --- /dev/null +++ b/test/test_dtype.py @@ -0,0 +1,41 @@ +import unittest +import numpy as np +from tinygrad.helpers import getenv +from tinygrad.lazy import Device +from tinygrad.tensor import Tensor, dtypes + +# for GPU, cl_khr_fp16 isn't supported +# for LLVM, it segfaults because it can't link to the casting function +@unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["GPU", "LLVM"], "float16 broken in some CI backends") +class TestDtype(unittest.TestCase): + def test_half_to_np(self): + a = Tensor([1,2,3,4], dtype=dtypes.float16) + print(a) + na = a.numpy() + print(na, na.dtype, a.lazydata.realized) + assert na.dtype == np.float16 + + def test_half_add(self): + a = Tensor([1,2,3,4], dtype=dtypes.float16) + b = Tensor([1,2,3,4], dtype=dtypes.float16) + c = a+b + print(c.numpy()) + assert c.dtype == dtypes.float16 + + def test_upcast_float(self): + # NOTE: there's no downcasting support + a = Tensor([1,2,3,4], dtype=dtypes.float16).float() + print(a) + na = a.numpy() + print(na, na.dtype) + assert na.dtype == np.float32 + + def test_half_add_upcast(self): + a = Tensor([1,2,3,4], dtype=dtypes.float16) + b = Tensor([1,2,3,4], dtype=dtypes.float32) + c = a+b + print(c.numpy()) + assert c.dtype == dtypes.float32 + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/unit/test_graph.py b/test/unit/test_graph.py index 4f235c901..ca480efbf 100644 --- a/test/unit/test_graph.py +++ b/test/unit/test_graph.py @@ -14,10 +14,10 @@ class TestGraph(unittest.TestCase): assert nx.is_isomorphic(G, RG, node_match=lambda x,y: x["label"] == y["label"], edge_match=lambda x,y: x["label"] == y["label"] if "label" in y else True) def test_add_graph(self): - a = CPUBuffer.fromCPU(np.ones((4,4))) - b = CPUBuffer.fromCPU(np.ones((4,4))) + a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) + b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) ast = LazyOp(BinaryOps.ADD, (a,b)) - ret = CPUBuffer(np.ones((4,4))) + ret = CPUBuffer(np.ones((4,4), dtype=np.float32)) RG = nx.DiGraph() RG.add_node(0, 
label="(4, 4)") @@ -30,12 +30,12 @@ class TestGraph(unittest.TestCase): self.helper_compare_graph(RG) def test_add_sum_graph(self): - a = CPUBuffer.fromCPU(np.ones((4,4))) - b = CPUBuffer.fromCPU(np.ones((1,1))) + a = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) + b = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32)) op0 = LazyOp(MovementOps.RESHAPE, (b,), (4, 4)) op1 = LazyOp(BinaryOps.ADD, (a,op0)) ast = LazyOp(ReduceOps.SUM, (op1,), (1,1)) - ret = CPUBuffer(np.ones((1,1))) + ret = CPUBuffer(np.ones((1,1), dtype=np.float32)) RG = nx.DiGraph() RG.add_node(0, label="(4, 4)") @@ -48,14 +48,14 @@ class TestGraph(unittest.TestCase): self.helper_compare_graph(RG) def test_add_graph_prune(self): - a = CPUBuffer.fromCPU(np.ones((1,1))) + a = CPUBuffer.fromCPU(np.ones((1,1), dtype=np.float32)) ast = LazyOp(MovementOps.RESHAPE, (a,), (4, 4)) - ret = CPUBuffer(np.ones((4,4))) + ret = CPUBuffer(np.ones((4,4), dtype=np.float32)) log_op(ret, ast, show_graph=True) - b = CPUBuffer.fromCPU(np.ones((4,4))) + b = CPUBuffer.fromCPU(np.ones((4,4), dtype=np.float32)) ast = LazyOp(BinaryOps.ADD, (ret,b)) - ret = CPUBuffer(np.ones((4,4))) + ret = CPUBuffer(np.ones((4,4), dtype=np.float32)) log_op(ret, ast, show_graph=True) prune_graph() diff --git a/tinygrad/codegen/ast.py b/tinygrad/codegen/ast.py index 16a67ffc6..3f2162a0d 100644 --- a/tinygrad/codegen/ast.py +++ b/tinygrad/codegen/ast.py @@ -1,7 +1,7 @@ import itertools from enum import Enum, auto from typing import List, Tuple -from tinygrad.helpers import prod, dedup, all_same, colored +from tinygrad.helpers import prod, dedup, all_same, colored, dtypes from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers from tinygrad.shape import ShapeTracker, View, strides_for_shape @@ -26,7 +26,7 @@ class Token: if len(self.axis) == 0: return [0] acc_strides = [x*(1-self.axis[::-1][i][2]) for i,x in enumerate(strides_for_shape(tuple(1 if r else s for s,_,r in self.axis[::-1])))] return [sum(t) for t in itertools.product(*[[y*acc_strides[i] for y in range(x[0])] for i,x in enumerate(self.axis[::-1])])] - def decltype(self): return ('float' if self.typ == Types.FLOAT else 'float4') + ('*' if self.ptr else str()) + def decltype(self, dtype=dtypes.float32): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str()) def __repr__(self): return f"<{self.typ}{'*' if self.ptr else str()} {self.tok}{f'[{self.axis}]' if len(self.axis) else str()}>" # ast kernel can contain one ReduceOp with arbitrary Binary/Unary ops @@ -55,8 +55,9 @@ class ASTKernel: break # create the buffer we are returning (as the same type as the input buffers) and add it as the first buffer - self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True) - self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret)] if output_shape else [self.ret]) + self.bufs + self.ret = output_buffer if output_buffer else type(self.bufs[0])(output_shape if output_shape else self.info.shape, force_create=True, dtype=self.info.dtype) + assert self.ret.dtype == self.info.dtype, f"return dtype {self.ret.dtype} != {self.info.dtype}" + self.bufs = ([type(self.ret)(self.info.shape, hostbuf=self.ret, dtype=self.info.dtype)] if output_shape else [self.ret]) + self.bufs # key for lookup in cache (can change, str might not be right) # bufs are needed because kernels like f(x) = x + x and f(x, y) = x + y have the same str(ast), but are different 
kernels. @@ -121,7 +122,7 @@ class ASTKernel: if print_shapetrackers: for st in self.sts: print(st) for i in range(len(self.sts)): - print(prefix, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE") + print(prefix, self.bufs[i].dtype, self.buftokens[i], f"early:{'T' if i < len(self.bufs) and self.bufs[i] in self.earlybufs else 'F'}", self.sts[i].shape, self.sts[i].views[-1].strides, len(self.sts[i].views), type(self.bufs[i]._buf) if self.bufs[i] is not None else "FAKE") @property def shape_len(self) -> int: return len(self.sts[0].shape) diff --git a/tinygrad/codegen/gpu.py b/tinygrad/codegen/gpu.py index b3999f19d..9caaec2c9 100644 --- a/tinygrad/codegen/gpu.py +++ b/tinygrad/codegen/gpu.py @@ -25,6 +25,7 @@ class GPULanguage(NamedTuple): lid : List[str] = [] extra_args : List[str] = [] float4 : Optional[str] = None + half_prekernel : Optional[str] = None def to_image_idx(base_shape:Tuple[int, ...], idxy:Node, valid:Node, validhacks=False) -> Tuple[Node, Node]: idy = (idxy//(4*base_shape[1])) @@ -266,6 +267,8 @@ class GPUCodegen(ASTKernel): self.prekernel : Set[str] = set() self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs if buf is not None) else [] + if self.lang.half_prekernel: self.prekernel.add(self.lang.half_prekernel+"\n") + if len(self.lang.gid) == 0: self.kernel += [f"for (int idx{i} = 0; idx{i} < {self.output_shape[i]}; idx{i}++) {{\n" for i in range(0, len(self.output_shape))] else: @@ -320,7 +323,7 @@ class GPUCodegen(ASTKernel): self.kernel.append("\n}") # concat kernel into prg - buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype()+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None] + buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype(self.bufs[i].dtype)+self.lang.buffer_suffix for i,x in enumerate(self.bufs) if x is not None] prg = ' '.join(list(self.prekernel) + [f"{self.lang.kernel_prefix} void KERNEL_NAME_PLACEHOLDER(",] + [', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + self.lang.extra_args)] + [") {\n"] + self.kernel) diff --git a/tinygrad/codegen/llvm.py b/tinygrad/codegen/llvm.py index 4761e7a67..420c473d1 100644 --- a/tinygrad/codegen/llvm.py +++ b/tinygrad/codegen/llvm.py @@ -3,7 +3,7 @@ from typing import ClassVar, List from llvmlite import ir # type: ignore from tinygrad.codegen.ast import ASTKernel from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, ASTRunner -from tinygrad.helpers import DEBUG, prod +from tinygrad.helpers import DEBUG, prod, dtypes from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, GeNode, LtNode, SumNode, AndNode def int_const(x): return ir.Constant(ir.IntType(64), x) @@ -105,7 +105,8 @@ class LLVMCodegen(ASTKernel): # create llvm function module = ir.Module(name=__file__) - func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(self.bufs))), name='exec') + func_dtypes = [{dtypes.float16:ir.HalfType(), dtypes.float32:ir.FloatType()}[buf.dtype] for buf in self.bufs] + func = ir.Function(module, 
ir.FunctionType(ir.VoidType(), [x.as_pointer() for x in func_dtypes]), name='exec') # force llvmlite to allow us to add function attribute then add the attribute func.attributes._known = func.attributes._known.union(frozenset(['"no-nans-fp-math"="true"'])) @@ -143,9 +144,11 @@ class LLVMCodegen(ASTKernel): # this always does the load, so we have it load *0 if the arg won't be used # TODO: would control flow be faster? aug_idx = builder.select(valid, idx, int_const(0)) - element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(ir.FloatType(), 0)) + element = builder.select(valid, builder.load(builder.gep(func.args[buf_index], [aug_idx], inbounds=True)), ir.Constant(func_dtypes[buf_index], 0)) else: element = builder.load(builder.gep(func.args[buf_index], [idx], inbounds=True)) + # upcast + if func_dtypes[buf_index] != ir.FloatType(): element = builder.fpext(element, ir.FloatType()) m = element if kernel_output_dim == 1 else builder.insert_element(m, element, int_const(i)) return m if isinstance(x.op, ReduceOps): @@ -194,6 +197,7 @@ class LLVMCodegen(ASTKernel): builder = loop_exit[store_loop] for i, idx in enumerate(get_idxs(builder, idx_level[0][store_loop], 0)): element = result if kernel_output_dim == 1 else builder.extract_element(result, int_const(i)) + if func_dtypes[0] != ir.FloatType(): element = builder.fptrunc(element, func_dtypes[0]) builder.store(element, builder.gep(func.args[0], [idx], inbounds=True)) # add the looping diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 78cda40e3..f7abc7f2e 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -1,5 +1,6 @@ import os, math, functools -from typing import Tuple, Union, List +import numpy as np +from typing import Tuple, Union, List, NamedTuple def dedup(x): return list(dict.fromkeys(x)) # retains list order def prod(x:Union[List[int], Tuple[int, ...]]) -> int: return math.prod(x) @@ -16,3 +17,17 @@ def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}" def getenv(key, default=0): return type(default)(os.getenv(key, default)) DEBUG, IMAGE = getenv("DEBUG", 0), getenv("IMAGE", 0) + +# **** tinygrad now supports dtypes! 
***** + +class DType(NamedTuple): + itemsize : int + name : str + np : type # TODO: someday this will be removed with the "remove numpy" project + def __repr__(self): return f"dtypes.{self.name}" + +class dtypes: + float16 = half = DType(2, "half", np.float16) + float32 = float = DType(4, "float", np.float32) + @staticmethod + def from_np(x:np.ndarray) -> DType: return {np.dtype(np.float16): dtypes.float16, np.dtype(np.float32): dtypes.float32}[np.dtype(x.dtype)] \ No newline at end of file diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index d79a0aed9..c1c9fe0f4 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type import os, sys, weakref, importlib, inspect, functools from weakref import WeakValueDictionary -from tinygrad.helpers import prod, getenv +from tinygrad.helpers import prod, getenv, DType, dtypes from tinygrad.shape import ShapeTracker, get_contraction from tinygrad.ops import InterpretedBuffer, DeviceBuffer, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_buffers, get_lazyops, map_buffers from tinygrad.graph import log_op @@ -13,21 +13,12 @@ sys.setrecursionlimit(10000) OPT = getenv("OPT", 2) LAZY = getenv("LAZY", 1) -def get_buffer(name, base='tinygrad.runtime'): - try: - return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0] - except Exception as e: # NOTE: this can't be put on one line due to mypy issue - print(name, "backend not available", e, file=sys.stderr) - class _Device: def __init__(self) -> None: - # TODO: make this dynamic to when you try to access the _buffers - self._buffers : Dict[str, Type[DeviceBuffer]] = {x.upper():get_buffer(x) for x in - [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")] if x is not None} - self.DEFAULT : str = "CPU" - for name in self._buffers: - if getenv(name) == 1: self.DEFAULT = name # note: DEFAULT can be a Device that can't be imported. better than silent use of a different device - if self._buffers[name] is not None: self.__setattr__(name, name) + self._buffers = {y.upper():y for y in [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")]} + self.DEFAULT : str = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, "CPU") + @functools.lru_cache(maxsize=None) # this class is a singleton, pylint: disable=method-cache-max-size-none + def __getitem__(self, x:str) -> Type[DeviceBuffer]: return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{self._buffers[x]}'), inspect.isclass) if (cname.lower() == self._buffers[x] + "buffer")][0] Device = _Device() # TODO: movement ops that only change shape are really nops.
treat them as such @@ -81,9 +72,9 @@ def replace_with_movement_op(y:Union[LazyOp, LazyBuffer], op:MovementOps, arg:Tu return elementwise_op(y.op, *[replace_with_movement_op(z, op, arg) for z in y.src]) # type: ignore class LazyNumpyArray: - def __init__(self, fxn, shape): self.fxn, self.shape = fxn, shape - def __call__(self): return self.fxn(self.shape) - def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape) + def __init__(self, fxn, shape, dtype): self.fxn, self.shape, self.dtype = fxn, shape, dtype + def __call__(self): return self.fxn(self.shape, self.dtype) + def reshape(self, new_shape): return LazyNumpyArray(self.fxn, new_shape, self.dtype) def copy(self): return self def astype(self, typ): return self @@ -91,32 +82,32 @@ def support_weakref(x): return x @support_weakref # needed for mypyc, this prevents LazyBuffer from becoming a native class class LazyBuffer: __deletable__ = ('op',) - lazycache : ClassVar[WeakValueDictionary[Tuple[str, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary() - def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp): + lazycache : ClassVar[WeakValueDictionary[Tuple[str, DType, OpType, LazyOp], LazyBuffer]] = WeakValueDictionary() + def __new__(cls, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType): # fromcpu aren't cached if optype == LoadOps and op.op == LoadOps.FROMCPU: return super().__new__(cls) - wop = (device, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker + wop = (device, dtype, optype, get_weakop(op)) # NOTE: shape should be deterministic. annoying to cache with the ShapeTracker # NOTE: we need "ret" to prevent the new buffer from being immediately deleted if wop not in LazyBuffer.lazycache: LazyBuffer.lazycache[wop] = ret = super().__new__(cls) else: ret = LazyBuffer.lazycache[wop] return ret - def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp): + def __init__(self, device:str, shape:Union[ShapeTracker, Tuple[int, ...]], optype:OpType, op:LazyOp, dtype:DType): if hasattr(self, 'device'): return # cache hit, we return and don't reinit self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape)) - self.shape, self.optype, self.op = self.st.shape, optype, op + self.shape, self.optype, self.op, self.dtype = self.st.shape, optype, op, dtype self.realized : Optional[DeviceBuffer] = None self.output_buffer : Optional[DeviceBuffer] = None - self.device, self.dbuffer = device, Device._buffers[device] + self.device, self.dbuffer = device, Device[device] # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child? 
self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet() # NOTE: op should be read only after construction of LazyBuffer for x in get_buffers(op): x.children.add(self) if not LAZY: self.realize() - def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>" + def __repr__(self): return f"<LB {self.shape} {self.dtype} op:{self.op.op if self.realized is None else 'realized'}>" # this produces a device buffer def realize(self:LazyBuffer, required_device=None) -> DeviceBuffer: @@ -124,7 +115,7 @@ class LazyBuffer: if self.realized is None: # get real ops first if self.op.op == LoadOps.FROMCPU: - self.realized = Device._buffers[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg) + self.realized = Device[self.device].fromCPU(self.op.arg() if isinstance(self.op.arg, LazyNumpyArray) else self.op.arg) ast = LazyOp(self.op.op, tuple()) elif self.op.op == LoadOps.CONTIGUOUS: real_src = self.op.src[0].realize(self.device) @@ -160,12 +151,13 @@ class LazyBuffer: log_op(self.realized, ast) assert self.realized.shape == self.shape, f"shape mismatch on realize got {self.realized.shape} expected {self.shape}" - assert isinstance(self.realized, Device._buffers[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}" + assert isinstance(self.realized, Device[self.device]), f"device mismatch on realized got {type(self.realized)} expected {self.device}" + assert self.realized.dtype == self.dtype, f"dtype mismatch on realize got {self.realized.dtype} expected {self.dtype}" return self.realized # NOTE: we have to make a copy of the numpy array here in case the user changes it. expose this? @staticmethod - def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy())) + def fromCPU(x, device) -> LazyBuffer: return LazyBuffer(device, x.shape, LoadOps, LazyOp(LoadOps.FROMCPU, tuple(), x.copy()), dtypes.from_np(x)) def toCPU(self): ret = self.realize().toCPU() log_op(InterpretedBuffer(ret), LazyOp(LoadOps.TOCPU, (self.realized,), None)) @@ -173,11 +165,11 @@ class LazyBuffer: def unary_op(self:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, self) def binary_op(self:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, self, y) - def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,))) + def contiguous(self:LazyBuffer) -> LazyBuffer: return LazyBuffer(self.device, self.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,)), self.dtype) def reduce_op(self:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer: if self.shape == tuple(new_shape): return self - return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape)) + return LazyBuffer(self.device, new_shape, ReduceOps, LazyOp(op, (self,), new_shape), self.dtype) def movement_op(self:LazyBuffer, op:MovementOps, arg:Tuple[Any, ...]) -> LazyBuffer: # very instant nop @@ -228,7 +220,7 @@ class LazyBuffer: return replace_with_movement_op(self.op, op, arg) # create the buffer - ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg)) + ret = LazyBuffer(self.device, ShapeTracker(self.st).movement_op(op, arg), MovementOps, LazyOp(op, (self,), arg), self.dtype) # if the ShapeTracker becomes contiguous, replace the whole thing with a reshape (or nothing if shapes match) # NOTE: if ret is in the cache, it can already be realized @@ -241,7 +233,7 @@ class LazyBuffer: return ret def elementwise_op(op:Union[UnaryOps, BinaryOps], 
*srcs:LazyBuffer) -> LazyBuffer: - out_device, out_shape = srcs[0].device, srcs[0].shape + out_device, out_shape, out_dtype = srcs[0].device, srcs[0].shape, max(x.dtype for x in srcs) # push all contiguous to the end of BinaryOps. kernels 198 -> 196 if PUSH_CONTIGUOUS and any(x.realized is None and x.op.op == LoadOps.CONTIGUOUS and len(x.op.src[0].children) <= 1 for x in srcs): @@ -258,4 +250,4 @@ def elementwise_op(op:Union[UnaryOps, BinaryOps], *srcs:LazyBuffer) -> LazyBuffe # remove the buffers from any (childless) BinaryOps that feed into this srcs = tuple(x.op if x.optype == BinaryOps and len(x.children) == 0 and x.realized is None else x for x in srcs) # type: ignore - return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs)) + return LazyBuffer(out_device, out_shape, BinaryOps, LazyOp(op, srcs), out_dtype) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index b805c184b..34a84bdb0 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -3,7 +3,7 @@ import functools, itertools, operator, random import numpy as np from enum import Enum, auto from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict, TypeVar, Set -from tinygrad.helpers import prod, DEBUG, getenv +from tinygrad.helpers import prod, DEBUG, getenv, DType, dtypes from tinygrad.shape import ShapeTracker # these are the llops your accelerator must implement, along with toCpu @@ -39,17 +39,19 @@ class Copyable: def toCPU(self:Copyable) -> np.ndarray: raise NotImplementedError("must be implemented") class RawBuffer(Copyable): # pylint: disable=abstract-method - def __init__(self, size:int): + def __init__(self, size:int, dtype:DType): self.size : int = size - GlobalCounters.mem_used += self.size - def __del__(self): GlobalCounters.mem_used -= self.size + self.dtype : DType = dtype + self._memsz : int = size*dtype.itemsize + GlobalCounters.mem_used += self._memsz + def __del__(self): GlobalCounters.mem_used -= self._memsz class RawBufferCopyIn(RawBuffer): def copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented") @classmethod def fromCPU(cls, x:np.ndarray): - ret = cls(4*prod(x.shape)) + ret = cls(prod(x.shape), dtypes.from_np(x)) ret.copyin(x) return ret @@ -57,7 +59,7 @@ class RawBufferCopyInOut(RawBufferCopyIn): def copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented") def toCPU(self) -> np.ndarray: - x = np.empty((self.size//4), dtype=np.float32) + x: np.ndarray = np.empty(self.size, dtype=self.dtype.np) self.copyout(x) return x @@ -65,26 +67,27 @@ class RawBufferCopyInOut(RawBufferCopyIn): class DeviceBuffer(Copyable): _buf: Any # underlying buffer shape: Tuple[int, ...] 
+ dtype: DType @classmethod def exec_ast(cls, ast:LazyOp, output_buffer=None): raise NotImplementedError("must be implemented") # this is a quick "buffer" class for flop tracking and getting the output shape class GenericShape: - def __init__(self, shape:Tuple[int, ...], flops:int=0): self.shape, self.flops = shape, flops + def __init__(self, shape:Tuple[int, ...], dtype:DType=dtypes.float32, flops:int=0): self.shape, self.dtype, self.flops = shape, dtype, flops def consume_flops(self): self.flops, ret = 0, self.flops return ret shape_fxn_for_op : Dict[Op, Callable] = { - **{op:lambda self: GenericShape(self.shape, self.consume_flops() + prod(self.shape)) for op in UnaryOps}, - **{op:lambda self,y: GenericShape(self.shape, self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps}, - **{op:lambda self,new_shape: GenericShape(new_shape, self.consume_flops() + prod(self.shape)) for op in ReduceOps}, - **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.consume_flops()), op) for op in MovementOps}} + **{op:lambda self: GenericShape(self.shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in UnaryOps}, + **{op:lambda self,y: GenericShape(self.shape, max(self.dtype, y.dtype), self.consume_flops() + y.consume_flops() + prod(self.shape)) for op in BinaryOps}, + **{op:lambda self,new_shape: GenericShape(new_shape, self.dtype, self.consume_flops() + prod(self.shape)) for op in ReduceOps}, + **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.dtype, self.consume_flops()), op) for op in MovementOps}} +def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape, x.dtype)) for x in get_buffers(ast)}, ast))._buf # used in CPUBuffer and TorchBuffer class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method fxn_for_op : ClassVar = shape_fxn_for_op - # TODO: use generic types here to remove __init__ in specialized classes - def __init__(self, lbuf:Any): self._buf, self.shape = lbuf, tuple(lbuf.shape) + def __init__(self, lbuf:Any): self._buf, self.shape, self.dtype = lbuf, tuple(lbuf.shape), self.to_tinygrad_dtype(lbuf) if hasattr(self, 'to_tinygrad_dtype') else lbuf.dtype def contiguous(self): return type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,))) def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self._buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self._buf, op.name.lower())(arg)) @classmethod @@ -101,12 +104,11 @@ class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method else: ret = cls(cls.fxn_for_op[ast.op](*([x._buf for x in srcs] + ([ast.arg] if ast.arg else [])))) context[ast] = ret if output_buffer is not None: - assert output_buffer.shape == ret.shape + assert output_buffer.shape == ret.shape and output_buffer.dtype == ret.dtype output_buffer._buf = ret._buf return output_buffer else: return ret -def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape)) for x in get_buffers(ast)}, ast))._buf class ASTRunner: def __init__(self, name, prg, bufs_to_delete:Optional[Set[int]]=None, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0): @@ -139,7 +141,7 @@ class ASTRunner: def optimize_local_size(self, rawbufs:List[RawBuffer]) -> List[int]: assert self.global_size is not None, 
"needs a global size to optimize local size" if any(x == rawbufs[0] for x in rawbufs[1:]): # this is an assignment, replace the output buffer - output_replacement = type(rawbufs[0])(rawbufs[0].size) + output_replacement = type(rawbufs[0])(rawbufs[0].size, rawbufs[0].dtype) rawbufs = [output_replacement if x == rawbufs[0] else x for x in rawbufs] MAX_WORKGROUP = self.clprg.max_work_group_size() if hasattr(self.clprg, 'max_work_group_size') else 1024 local_dims = [[x for x in set([sz, 1, 2, 4, 8, 16, 32, 64, 128, 256, MAX_WORKGROUP]) if x<=sz] for sz in self.global_size] @@ -149,32 +151,36 @@ class ASTRunner: # assumes you are using ShapeTracker # used in GPUBuffer and LLVMBuffer class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method - def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False): + def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False, dtype:DType=dtypes.float32): self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape)) self.shape = self.st.shape + self.dtype = dtype + assert hostbuf is None or hostbuf.dtype == dtype, f"hostbuf dtype {hostbuf.dtype} != {dtype}" self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape self._buf = hostbuf._buf if hostbuf is not None else None self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing if (self._backing is not None and self._backing.shape != (1,)) or force_create: self.raw() # TODO: not GPUBuffer, get name of class - def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))") + # TODO: needs dtype + def __repr__(self): return f"{type(self).__name__}(shape={self.st}, hostbuf={type(self).__name__}(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.{self.dtype.np.__name__})))" if self._backing is not None else ", force_create=True))") raw_buffer_type : Type[RawBuffer] @classmethod - def create_raw_buffer(cls, shape, backing) -> RawBuffer: + def create_raw_buffer(cls, shape:Tuple[int, ...], backing:Optional[np.ndarray], dtype:DType) -> RawBuffer: assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape" assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching" - return cls.raw_buffer_type(4*prod(shape)) if backing is None else cls.raw_buffer_type.fromCPU(backing) + if DEBUG >= 4: print(f"create raw buffer {shape} {dtype} backed:{backing is not None}") + return cls.raw_buffer_type(prod(shape), dtype) if backing is None else cls.raw_buffer_type.fromCPU(backing) def raw(self) -> RawBuffer: if self._buf is None: if DEBUG >= 4 and self._backing is not None: print(f"**** copy in {self._backing.shape} to {type(self)}") - self._buf = self.create_raw_buffer(self._base_shape, self._backing) + self._buf = self.create_raw_buffer(self._base_shape, self._backing, self.dtype) self._backing = None return self._buf @classmethod - def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel()) + def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.ravel(), dtype=dtypes.from_np(x)) def toCPU(self) -> np.ndarray: 
assert GlobalCounters.cache is None, f"can't copy out {self} while caching" if DEBUG >= 3: print(f"**** copy out {self.shape}") @@ -192,7 +198,7 @@ class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method prg = cls.method_cache[k.key] else: prg = k.codegen().build(cls.runtime_type) - if getenv("PRINT_AST", "") == prg.name: + if getenv("PRINT_AST", "") == prg.name or getenv("PRINT_AST", "") == "1": k.print() print(prg.prg) prg.exec(k.bufs) diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index 23694d2a0..0fc6f287e 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -2,21 +2,22 @@ import os, time, ctypes, hashlib, subprocess, platform import numpy as np from collections import defaultdict from typing import Final, Dict +from tinygrad.helpers import dtypes, DType from tinygrad.ops import CompiledBuffer, RawBufferCopyIn from tinygrad.codegen.gpu import GPUCodegen, GPULanguage class RawMallocBuffer(RawBufferCopyIn): - def __init__(self, size): - super().__init__(size) - self._buf = (ctypes.c_float * (size//4))() + def __init__(self, size, dtype : DType): + super().__init__(size, dtype) + self._buf = ({dtypes.float32: ctypes.c_float, dtypes.float16: ctypes.c_int16}[dtype] * size)() def _buffer(self): return self._buf - def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*4) - def toCPU(self): return np.ctypeslib.as_array(self._buf) + def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*np.dtype(x.dtype).itemsize) + def toCPU(self): return np.frombuffer(self._buf, dtype=self.dtype.np) class ClangProgram: kernel_cnt : Final[Dict[str, int]] = defaultdict(int) def __init__(self, name:str, prg:str): - prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n" + prg + prg = "#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define half __fp16\n" + prg # TODO: is there a way to not write this to disk? 
fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if platform.system() == 'Darwin' else 'so'}" if not os.path.exists(fn): diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 0cec65a79..921b87140 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -1,6 +1,7 @@ import numpy as np import operator from typing import ClassVar, Callable, Dict, Tuple +from tinygrad.helpers import dtypes from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, InterpretedBuffer, Op def shape_to_axis(old_shape:Tuple[int, ...], new_shape:Tuple[int, ...]) -> Tuple[int, ...]: @@ -35,6 +36,7 @@ numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ class CPUBuffer(InterpretedBuffer): fxn_for_op : ClassVar = numpy_fxn_for_op + to_tinygrad_dtype = staticmethod(dtypes.from_np) @staticmethod def fromCPU(x): return CPUBuffer(x) diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index ef9d29e25..b7ba4c56b 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -8,9 +8,9 @@ from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut from tinygrad.codegen.gpu import GPUCodegen, GPULanguage class RawCUDABuffer(RawBufferCopyInOut): - def __init__(self, size): - super().__init__(size) - self._cl = cuda.mem_alloc(size) + def __init__(self, size, dtype): + super().__init__(size, dtype) + self._cl = cuda.mem_alloc(self._memsz) def copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, x, stream) def copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._cl) @@ -42,6 +42,7 @@ class CUDAProgram: class CUDACodegen(GPUCodegen): lang = GPULanguage( kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4", + half_prekernel = "#include ", gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)], lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index ae1821578..ec9f45829 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -2,15 +2,13 @@ from __future__ import annotations import platform, functools import numpy as np import pyopencl as cl # type: ignore -from typing import Dict, Optional, List, ClassVar, Final -from collections import defaultdict -from tinygrad.helpers import IMAGE, DEBUG, getenv +from typing import Optional, List, Final +from tinygrad.helpers import IMAGE, DEBUG, getenv, dtypes from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer from tinygrad.codegen.gpu import GPUCodegen, GPULanguage OSX = platform.system() == "Darwin" OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. 
it's in like GPU clocks or something -CLCACHE = getenv("CLCACHE", 1) FLOAT16 = getenv("FLOAT16", 0) class _CL: @@ -27,33 +25,18 @@ class _CL: CL = _CL() class CLBuffer(RawBufferCopyInOut): - # TODO: this can be in RawBuffer generically - BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list) - - def __init__(self, size): # pylint: disable=super-init-not-called - self.size = size - if len(CLBuffer.BUFFER_CACHE[size]) > 0: - self._cl = CLBuffer.BUFFER_CACHE[size].pop() - else: - # TODO: on GPU OOM, clear the cache - self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size) - GlobalCounters.mem_used += self._cl.size - - def __del__(self): - if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl) - else: GlobalCounters.mem_used -= self._cl.size - + def __init__(self, size, dtype): + super().__init__(size, dtype) + self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, self._memsz) def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False) def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True) class CLImage(RawBuffer): # pylint: disable=abstract-method - fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT) IMAGE : Final = True - - def __init__(self, shape): # pylint: disable=super-init-not-called - self.size, self._cl = shape, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0])) + def __init__(self, shape, dtype=dtypes.float16 if getenv("FLOAT16") else dtypes.float32): # pylint: disable=super-init-not-called + fmt = cl.ImageFormat(cl.channel_order.RGBA, {dtypes.float16: cl.channel_type.HALF_FLOAT, dtypes.float32: cl.channel_type.FLOAT}[dtype]) + self.size, self.dtype, self._cl = shape, dtype, cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=(shape[1], shape[0])) GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height - def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height @functools.lru_cache(maxsize=None) @@ -89,6 +72,7 @@ class CLProgram: class CLCodegen(GPUCodegen): lang = GPULanguage( kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ", + half_prekernel = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable", barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)", gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)]) @@ -96,8 +80,8 @@ class GPUBuffer(CompiledBuffer): raw_buffer_type = CLBuffer # override this method for image @classmethod - def create_raw_buffer(cls, shape, backing) -> RawBuffer: - if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) - else: return super().create_raw_buffer(shape, backing) + def create_raw_buffer(cls, shape, backing, dtype) -> RawBuffer: + if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape) # NOTE: this is a hack. 
we don't pass in the dtype here, it's controlled by the FLOAT16 env var + else: return super().create_raw_buffer(shape, backing, dtype) codegen_type = CLCodegen runtime_type = CLProgram diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index 2b156c9c7..b2e778ab4 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -57,7 +57,7 @@ class LLVMProgram: def __del__(self): LLVM.engine.remove_module(self.mod) def __call__(self, unused_global_size, unused_local_size, *bufs, wait=False): - cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for _ in bufs])(self.fxn) + cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.c_void_p for _ in bufs])(self.fxn) if wait: st = time.monotonic() cfunc(*[x._buf for x in bufs]) if wait: return time.monotonic()-st diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 5f1bdb4ae..f8fa1563a 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -4,7 +4,7 @@ import Metal, Cocoa, libdispatch # type: ignore import numpy as np from typing import List, Any from tinygrad.codegen.gpu import GPUCodegen, GPULanguage -from tinygrad.helpers import prod, getenv, DEBUG +from tinygrad.helpers import prod, getenv, DEBUG, DType from tinygrad.ops import CompiledBuffer, RawBufferCopyIn METAL_XCODE = getenv("METAL_XCODE") @@ -20,14 +20,14 @@ class _METAL: METAL = _METAL() class RawMetalBuffer(RawBufferCopyIn): - def __init__(self, size): - super().__init__(size) - self._cl = METAL.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared) + def __init__(self, size:int, dtype:DType): + super().__init__(size, dtype) + self._cl = METAL.device.newBufferWithLength_options_(size*dtype.itemsize, Metal.MTLResourceStorageModeShared) def __del__(self): self._cl.release() super().__del__() def _buffer(self): return self._cl.contents().as_buffer(self._cl.length()) - def _as_np(self, dtype=np.float32): return np.frombuffer(self._buffer(), dtype=dtype) + def _as_np(self): return np.frombuffer(self._buffer(), dtype=self.dtype.np) def copyin(self, x:np.ndarray): np.copyto(self._as_np(), x.reshape(-1).data) def toCPU(self) -> np.ndarray: for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted() diff --git a/tinygrad/runtime/ops_torch.py b/tinygrad/runtime/ops_torch.py index cd820152e..fdc2dd084 100644 --- a/tinygrad/runtime/ops_torch.py +++ b/tinygrad/runtime/ops_torch.py @@ -1,7 +1,7 @@ import torch from typing import ClassVar, Dict, Callable from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, InterpretedBuffer, Op -from tinygrad.helpers import getenv +from tinygrad.helpers import getenv, dtypes from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ @@ -14,6 +14,7 @@ torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu")) class TorchBuffer(InterpretedBuffer): fxn_for_op : ClassVar = torch_fxn_for_op + to_tinygrad_dtype = staticmethod(lambda lbuf: {torch.float16: dtypes.float16, torch.float32: dtypes.float32}[lbuf.dtype]) @staticmethod def fromCPU(x): return TorchBuffer(torch.from_numpy(x).requires_grad_(False).to(device)) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 9c91fa80b..a3aa08b55 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -3,7 +3,7 @@ from __future__ import annotations import math, functools, itertools import numpy as np from 
typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence -from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten +from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten, DType, dtypes from tinygrad.lazy import Device, LazyBuffer, LazyNumpyArray from tinygrad.image import image_conv2d_decorator, image_dot_decorator @@ -32,18 +32,20 @@ class Tensor: __deletable__ = ('_ctx',) training : ClassVar[bool] = False no_grad : ClassVar[bool] = False + default_type : DType = dtypes.float32 - def __init__(self, data, device=Device.DEFAULT, requires_grad:Optional[bool]=None): + def __init__(self, data, device=Device.DEFAULT, dtype:Optional[DType]=None, requires_grad:Optional[bool]=None): if isinstance(data, list): - data = np.array(data, dtype=np.float32) + data = np.array(data, dtype=(dtype if dtype is not None else Tensor.default_type).np) elif isinstance(data, LazyBuffer) and data.device != device: # TODO: this has to realize, it shouldn't have to data = data.realize().toCPU() if isinstance(data, (np.ndarray, LazyNumpyArray)): data = data if data.shape else data.reshape((1,)) - self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device) + self.lazydata = LazyBuffer.fromCPU(data.astype(dtype.np) if dtype is not None else data, device) elif isinstance(data, LazyBuffer): + assert dtype is None or dtype == data.dtype, "dtype doesn't match, and casting isn't supported" self.lazydata = data else: raise RuntimeError(f"can't create Tensor from {data}") @@ -64,13 +66,12 @@ class Tensor: @property def shape(self) -> Tuple[int, ...]: return self.lazydata.shape - # dtype handling was very broken. it's always float32 now - @property - def dtype(self) -> type: return np.float32 - @property def device(self) -> str: return self.lazydata.device + @property + def dtype(self) -> DType: return self.lazydata.dtype + # ***** data handlers **** def realize(self) -> Tensor: @@ -132,11 +133,11 @@ class Tensor: def manual_seed(seed=None): Tensor._rng = np.random.default_rng(seed=seed) @staticmethod - def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.random(size=shape, dtype=np.float32), shape), **kwargs) + def rand(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.random(size=shape, dtype=dtype), shape, np.float32), **kwargs) # TODO: replace with a transformation from uniform -> gaussian @staticmethod - def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape: Tensor._rng.standard_normal(size=shape, dtype=np.float32), shape), **kwargs) + def randn(*shape, **kwargs) -> Tensor: return Tensor(LazyNumpyArray(lambda shape, dtype: Tensor._rng.standard_normal(size=shape, dtype=dtype), shape, np.float32), **kwargs) # ***** rng hlops ***** @@ -442,10 +443,16 @@ class Tensor: def dropout(self, p=0.5) -> Tensor: if not Tensor.training: return self - _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype) + # TODO: why is this going through numpy? + _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=np.float32) return self * Tensor(_mask, requires_grad=False, device=self.device) * (1/(1.0 - p)) + # ***** cast ops ***** + + # TODO: this is a hack, but if we add float(0), it will become a float. 
need real casting support + def float(self) -> Tensor: return self.add(Tensor([0], device=self.device, dtype=dtypes.float32, requires_grad=self.requires_grad)) + # register functions to move between devices -for device in [device for device in Device._buffers.keys() if device[0] != "_"]: +for device in Device._buffers: setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, device)) setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, device))
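
A minimal usage sketch of the DType/dtypes helpers added in tinygrad/helpers.py above; it only exercises definitions introduced by this patch (plus numpy) and assumes the patched tree is on the path.

import numpy as np
from tinygrad.helpers import dtypes

print(dtypes.float16)                                    # dtypes.half (DType.__repr__ prints the name field)
print(dtypes.float16.itemsize, dtypes.float32.itemsize)  # 2 4
print(dtypes.from_np(np.zeros(4, dtype=np.float32)))     # dtypes.float

# DType is a NamedTuple ordered (itemsize, name, np), so max() picks the wider type
# by itemsize first; elementwise_op and the BinaryOps shape function rely on this to upcast.
print(max(dtypes.float16, dtypes.float32))               # dtypes.float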
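
A Tensor-level sketch mirroring what test/test_dtype.py checks; float16 support varies by backend (the test skips GPU and LLVM in CI), so this assumes a backend with working half support.

import numpy as np
from tinygrad.tensor import Tensor, dtypes

a = Tensor([1, 2, 3, 4], dtype=dtypes.float16)  # explicit dtype via the new keyword argument
b = Tensor([1, 2, 3, 4])                        # defaults to Tensor.default_type (float32)

c = a + b                                       # binary ops promote to the widest input dtype
assert c.dtype == dtypes.float32

d = a.float()                                   # the add-zero upcast hack from tensor.py above
assert d.numpy().dtype == np.float32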
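
The Device singleton in tinygrad/lazy.py now resolves backends lazily; a short sketch of the new lookup path, assuming a checkout with this diff applied and no backend env vars set.

from tinygrad.lazy import Device

print(Device.DEFAULT)        # "CPU" unless a backend env var like GPU=1 or TORCH=1 is set
CPUBuffer = Device["CPU"]    # first access imports tinygrad.runtime.ops_cpu and caches the class
print(CPUBuffer.__name__)    # CPUBuffer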