
stop wasting time with the compiler. tinygrad needs to just jit

pull/702/head
George Hotz 2023-03-12 12:08:46 -07:00
parent 46b49d50bd
commit dcac618515
4 changed files with 9 additions and 6 deletions

View File

@@ -1,6 +1,8 @@
 #!/bin/bash
+# note: if we compile tinygrad/nn/__init__.py __dict__ no longer works, and optimizers will silently fail
 mypyc --check-untyped-defs --explicit-package-bases --warn-unreachable tinygrad/shape/shapetracker.py tinygrad/shape/symbolic.py \
-  tinygrad/nn/__init__.py tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py
+  tinygrad/helpers.py tinygrad/mlops.py tinygrad/tensor.py tinygrad/graph.py \
+  #tinygrad/codegen/gpu.py tinygrad/runtime/ops_metal.py
-#tinygrad/codegen/ast.py
+#tinygrad/nn/__init__.py
 #tinygrad/ops.py tinygrad/runtime/ops_metal.py tinygrad/runtime/ops_gpu.py tinygrad/runtime/ops_cpu.py tinygrad/lazy.py
 #tinygrad/codegen/ast.py tinygrad/codegen/gpu.py
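
The note added in this script is the crux of the change: tinygrad's optimizers discover a model's Tensors by recursively walking instance __dict__s, and a mypyc-compiled class may stop exposing __dict__, so discovery comes back empty without raising an error. A minimal sketch of that pattern, with stand-in types rather than the exact tinygrad source:

from typing import Any, List

class Tensor:  # stand-in for tinygrad.tensor.Tensor
  pass

def get_parameters(obj: Any) -> List[Tensor]:
  # recursively collect Tensors from attributes, lists, and tuples
  if isinstance(obj, Tensor): return [obj]
  if isinstance(obj, (list, tuple)):
    return [p for x in obj for p in get_parameters(x)]
  if hasattr(obj, '__dict__'):  # a compiled class may not expose __dict__
    return [p for v in vars(obj).values() for p in get_parameters(v)]
  return []  # no __dict__ means no parameters found: the "silently fail" mode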

View File

@@ -1,7 +1,7 @@
 import itertools
 from enum import Enum, auto
 from typing import List, Tuple
-from tinygrad.helpers import prod, dedup, all_same, colored, dtypes
+from tinygrad.helpers import prod, dedup, all_same, colored, DType
 from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, map_buffers, GenericShape, ASTRunner
 from tinygrad.shape.shapetracker import ShapeTracker, View, strides_for_shape
 
@@ -26,7 +26,7 @@ class Token:
     if len(self.axis) == 0: return [0]
     acc_strides = [x*(1-self.axis[::-1][i][2]) for i,x in enumerate(strides_for_shape(tuple(1 if r else s for s,_,r in self.axis[::-1])))]
     return [sum(t) for t in itertools.product(*[[y*acc_strides[i] for y in range(x[0])] for i,x in enumerate(self.axis[::-1])])]
-  def decltype(self, dtype=dtypes.float32): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str())
+  def decltype(self, dtype:DType): return (dtype.name if self.typ == Types.FLOAT else f'{dtype.name}4') + ('*' if self.ptr else str())
   def __repr__(self): return f"<{self.typ}{'*' if self.ptr else str()} {self.tok}{f'[{self.axis}]' if len(self.axis) else str()}>"
 
 # ast kernel can contain one ReduceOp with arbitrary Binary/Unary ops
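
This hunk drops the implicit float32 default from decltype, forcing every caller to name a DType. A stand-alone sketch of the string it renders, assuming the behavior shown in the one-liner above (stand-in DType, not the tinygrad class):

from dataclasses import dataclass

@dataclass(frozen=True)
class DType:  # stand-in for tinygrad.helpers.DType
  name: str

float32 = DType("float")

def decltype(is_float4: bool, ptr: bool, dtype: DType) -> str:
  # scalar tokens render the dtype's C name, FLOAT4 tokens get a "4" suffix,
  # and pointer tokens append "*"
  return (f"{dtype.name}4" if is_float4 else dtype.name) + ("*" if ptr else "")

assert decltype(False, True, float32) == "float*"   # e.g. a buffer argument
assert decltype(True, False, float32) == "float4"   # e.g. an upcast value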

View File

@@ -129,7 +129,8 @@ class GPUCodegen(ASTKernel):
         if const is not None:
           self.loaded_keys[(buf_index,o)] = ldr
         else:
-          self.kernel.append(f"{ldr.decltype()} {key} = {ldr.tok};\n")
+          # NOTE: we always do compute in float32
+          self.kernel.append(f"{ldr.decltype(dtypes.float32)} {key} = {ldr.tok};\n")
         if should_upcast and can_merge:
           for j in range(4):
             self.loaded_keys[(buf_index,o+j)] = Token(key+f'.{"xyzw"[j]}', Types.FLOAT)
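
With dtypes.float32 now passed explicitly, a loaded value is declared as float (or float4 when upcast) in the emitted kernel regardless of the buffer's storage dtype. Roughly what the appended line looks like, with hypothetical token names:

decl, key, tok = "float", "val1_0", "data1[gid]"  # hypothetical; decl is ldr.decltype(dtypes.float32)
kernel_line = f"{decl} {key} = {tok};\n"          # the string self.kernel.append receives
assert kernel_line == "float val1_0 = data1[gid];\n"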

View File

@@ -22,7 +22,7 @@ class LazyOp(NamedTuple):
   # Any == Union[LazyOp, LazyBuffer, DeviceBuffer]
   src: Tuple[Any, ...] # type: ignore
   arg: Any = None
-  # TODO: add dest to support multiple outputs
+  # TODO: add dest to support multiple outputs. on second thought, multiple outputs will have multiple LazyOps.
 
 # Any == Union[LazyBuffer, DeviceBuffer]
 def get_buffers(op:LazyOp) -> List[Any]: return functools.reduce(operator.add, [get_buffers(x) if isinstance(x, LazyOp) else [x] for x in op.src], [])
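
get_buffers is a plain tree flatten: src can mix nested LazyOps with leaf buffers, and the reduce concatenates the leaves left to right. A self-contained sketch with strings standing in for buffers:

import functools, operator
from typing import Any, List, NamedTuple, Tuple

class LazyOp(NamedTuple):
  op: str
  src: Tuple[Any, ...]
  arg: Any = None

def get_buffers(op: LazyOp) -> List[Any]:
  # recurse into LazyOp sources, keep everything else as a leaf buffer
  return functools.reduce(operator.add, [get_buffers(x) if isinstance(x, LazyOp) else [x] for x in op.src], [])

tree = LazyOp("ADD", (LazyOp("MUL", ("bufA", "bufB")), "bufC"))
assert get_buffers(tree) == ["bufA", "bufB", "bufC"]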