Reenable tensor cores for self-hosted Mac CI (#1717)
* debug 5 matmul * allow tensor cores in CI * tensor cores on arm64 * put debug back (branch: pull/1718/head)
parent
ac183568be
commit
fdd7f282cb
|
@ -22,7 +22,9 @@ jobs:
|
|||
run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
|
||||
shell: bash
|
||||
- name: Run Tensor Core GEMM
|
||||
run: DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
|
||||
run: |
|
||||
ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu
|
||||
DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
|
||||
shell: bash
|
||||
- name: Run Stable Diffusion
|
||||
run: |
|
||||
|
|
|
@ -6,6 +6,7 @@ notebooks
|
|||
.*.swo
|
||||
*.pyc
|
||||
*.so
|
||||
*.txt
|
||||
build
|
||||
/dist
|
||||
*.egg-info
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Tuple, List, cast
|
||||
import itertools, math
|
||||
import itertools, math, os
|
||||
from tinygrad.helpers import DEBUG, prod, getenv, ImageDType, dtypes
|
||||
from tinygrad.ops import ReduceOps, BinaryOps, UnaryOps, LazyOp
|
||||
from tinygrad.codegen.kernel import Kernel, LocalBuffer
|
||||
|
@ -228,7 +228,7 @@ class OptimizedKernel(Kernel):
|
|||
|
||||
# should use METAL tensor cores?
|
||||
# first, confirm it's a straightforward mulacc on a device with real locals
|
||||
tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and getenv("CI", "") != "true"))
|
||||
tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and os.uname().machine == "arm64"))
|
||||
if tensor_cores_allowed and self.reduceop and self.reduceop.op == ReduceOps.SUM and \
|
||||
isinstance(self.reduceop.src[0], LazyOp) and self.reduceop.src[0].op == BinaryOps.MUL and \
|
||||
isinstance(self.reduceop.src[0].src[0], LazyBuffer) and isinstance(self.reduceop.src[0].src[1], LazyBuffer) and self.opts.has_local:
|
||||
|
|
Loading…
Reference in New Issue