diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 1b185b698..7a28ce210 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -22,7 +22,9 @@ jobs:
       run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
       shell: bash
     - name: Run Tensor Core GEMM
-      run: DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
+      run: |
+        ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu
+        DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
       shell: bash
     - name: Run Stable Diffusion
       run: |
diff --git a/.gitignore b/.gitignore
index d7c42b02c..406a7696c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ notebooks
 .*.swo
 *.pyc
 *.so
+*.txt
 build
 /dist
 *.egg-info
diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 99f16d5da..27b1eef54 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -1,5 +1,5 @@
 from typing import Tuple, List, cast
-import itertools, math
+import itertools, math, os
 from tinygrad.helpers import DEBUG, prod, getenv, ImageDType, dtypes
 from tinygrad.ops import ReduceOps, BinaryOps, UnaryOps, LazyOp
 from tinygrad.codegen.kernel import Kernel, LocalBuffer
@@ -228,7 +228,7 @@ class OptimizedKernel(Kernel):
 
     # should use METAL tensor cores?
     # first, confirm it's a straightforward mulacc on a device with real locals
-    tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and getenv("CI", "") != "true"))
+    tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and os.uname().machine == "arm64"))
     if tensor_cores_allowed and self.reduceop and self.reduceop.op == ReduceOps.SUM and \
       isinstance(self.reduceop.src[0], LazyOp) and self.reduceop.src[0].op == BinaryOps.MUL and \
       isinstance(self.reduceop.src[0].src[0], LazyBuffer) and isinstance(self.reduceop.src[0].src[1], LazyBuffer) and self.opts.has_local:
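
Note on the optimizer.py change: the Metal tensor core path is now gated on running natively on Apple Silicon rather than on the CI environment variable. A minimal standalone sketch of that platform check, assuming only the Python standard library (the helper name is illustrative and not part of the diff):

    import os, platform

    def is_apple_silicon_native() -> bool:
        # os.uname().machine reports "arm64" for native Apple Silicon processes on macOS;
        # Intel Macs and Rosetta-translated processes report "x86_64".
        return platform.system() == "Darwin" and os.uname().machine == "arm64"

    if __name__ == "__main__":
        print("Apple Silicon native:", is_apple_silicon_native())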