diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 1b185b698..7a28ce210 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -22,7 +22,9 @@ jobs:
       run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
       shell: bash
     - name: Run Tensor Core GEMM
-      run: DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
+      run: |
+        ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu
+        DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
       shell: bash
     - name: Run Stable Diffusion
       run: |
diff --git a/.gitignore b/.gitignore
index d7c42b02c..406a7696c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ notebooks
 .*.swo
 *.pyc
 *.so
+*.txt
 build
 /dist
 *.egg-info
diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 99f16d5da..27b1eef54 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -1,5 +1,5 @@
 from typing import Tuple, List, cast
-import itertools, math
+import itertools, math, os
 from tinygrad.helpers import DEBUG, prod, getenv, ImageDType, dtypes
 from tinygrad.ops import ReduceOps, BinaryOps, UnaryOps, LazyOp
 from tinygrad.codegen.kernel import Kernel, LocalBuffer
@@ -228,7 +228,7 @@ class OptimizedKernel(Kernel):
 
     # should use METAL tensor cores?
     # first, confirm it's a straightforward mulacc on a device with real locals
-    tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and getenv("CI", "") != "true"))
+    tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and os.uname().machine == "arm64"))
     if tensor_cores_allowed and self.reduceop and self.reduceop.op == ReduceOps.SUM and \
       isinstance(self.reduceop.src[0], LazyOp) and self.reduceop.src[0].op == BinaryOps.MUL and \
       isinstance(self.reduceop.src[0].src[0], LazyBuffer) and isinstance(self.reduceop.src[0].src[1], LazyBuffer) and self.opts.has_local:
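
Note on the optimizer.py change: the Metal tensor core path is now gated on running natively on Apple Silicon rather than on the CI environment variable. A minimal standalone sketch of that platform check, assuming only the Python standard library (the helper name is illustrative and not part of the diff):

    import os, platform

    def is_apple_silicon_native() -> bool:
        # os.uname().machine reports "arm64" for native Apple Silicon processes on macOS;
        # Intel Macs and Rosetta-translated processes report "x86_64".
        return platform.system() == "Darwin" and os.uname().machine == "arm64"

    if __name__ == "__main__":
        print("Apple Silicon native:", is_apple_silicon_native())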