From fdd7f282cbd2925f0a00e80e72857024439145bf Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Wed, 30 Aug 2023 07:53:04 -0700 Subject: [PATCH] Reenable tensor cores for self-hosted Mac CI (#1717) * debug 5 matmul * allow tensor cores in CI * tensor cores on arm64 * put debug back --- .github/workflows/benchmark.yml | 4 +++- .gitignore | 1 + tinygrad/codegen/optimizer.py | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 1b185b698..7a28ce210 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -22,7 +22,9 @@ jobs: run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt shell: bash - name: Run Tensor Core GEMM - run: DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt + run: | + ln -s ~/tinygrad/disassemblers/applegpu disassemblers/applegpu + DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt shell: bash - name: Run Stable Diffusion run: | diff --git a/.gitignore b/.gitignore index d7c42b02c..406a7696c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ notebooks .*.swo *.pyc *.so +*.txt build /dist *.egg-info diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py index 99f16d5da..27b1eef54 100644 --- a/tinygrad/codegen/optimizer.py +++ b/tinygrad/codegen/optimizer.py @@ -1,5 +1,5 @@ from typing import Tuple, List, cast -import itertools, math +import itertools, math, os from tinygrad.helpers import DEBUG, prod, getenv, ImageDType, dtypes from tinygrad.ops import ReduceOps, BinaryOps, UnaryOps, LazyOp from tinygrad.codegen.kernel import Kernel, LocalBuffer @@ -228,7 +228,7 @@ class OptimizedKernel(Kernel): # should use METAL tensor cores? # first, confirm it's a straightforward mulacc on a device with real locals - tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and getenv("CI", "") != "true")) + tensor_cores_allowed = getenv("TC", 1) != 0 and (getenv("TC", 1) == 2 or (self.bufs[0].device == "METAL" and os.uname().machine == "arm64")) if tensor_cores_allowed and self.reduceop and self.reduceop.op == ReduceOps.SUM and \ isinstance(self.reduceop.src[0], LazyOp) and self.reduceop.src[0].op == BinaryOps.MUL and \ isinstance(self.reduceop.src[0].src[0], LazyBuffer) and isinstance(self.reduceop.src[0].src[1], LazyBuffer) and self.opts.has_local: