move device to device.py (#2466)
* move device to device.py
* pylint test --disable R,C,W,E --enable E0611
* fix tests

parent 262cd26d28
commit 9e07824542
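An illustrative sketch (not part of the diff) of what this refactor means for downstream imports, assuming the re-export in tinygrad/__init__.py shown later in this commit:

    # before: Device (and Compiled/Interpreted) lived in tinygrad.ops
    #   from tinygrad.ops import Device, Compiled, Interpreted
    # after: Device moves to tinygrad/device.py and is re-exported from the package root
    from tinygrad import Device                      # user-facing spelling
    from tinygrad.device import Compiled, Interpreted, JITRunner, CompiledASTRunner  # internal spelling

    print(Device.DEFAULT)  # e.g. "METAL", "CUDA", "GPU", or "CPU", depending on the machine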
@@ -245,7 +245,7 @@ jobs:
       # dtype test has issues on test_half_to_int8
     - name: Check Device.DEFAULT (METAL) and print some source
       run: |
-        METAL=1 python -c "from tinygrad.ops import Device; assert Device.DEFAULT == 'METAL', Device.DEFAULT"
+        METAL=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'METAL', Device.DEFAULT"
         METAL=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run metal ops test
       run: DEBUG=2 METAL=1 python -m pytest -n=auto test/test_ops.py

@@ -263,7 +263,7 @@ jobs:
       run: METAL=1 TC=2 python -m pytest -n=auto test/test_ops.py
     - name: Check Device.DEFAULT (WEBGPU) and print some source
       run: |
-        WEBGPU=1 python -c "from tinygrad.ops import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
+        WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
         WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     #- name: Run webgpu pytest
     #  run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto

@@ -349,7 +349,7 @@ jobs:
       run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/
     - name: Check Device.DEFAULT and print some source
       run: |
-        python -c "from tinygrad.ops import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU'], Device.DEFAULT"
+        python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU'], Device.DEFAULT"
         DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (not cuda)
       if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton'
@@ -22,7 +22,7 @@ from abc import ABC
 # let's trace an addition down through the layers of abstraction.

 # we will be using the clang backend
-from tinygrad.ops import Device
+from tinygrad import Device
 Device.DEFAULT = "CLANG"

 # first, 2+3 as a Tensor, the highest level

@@ -1,7 +1,7 @@
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import dtypes
-from tinygrad.ops import Device
+from tinygrad import Device

 # TODO: will be better when tinygrad does math in the target dtype, can remove the floor and use a mul
 def bit_extract(x, s, e) -> Tensor:

@@ -2,7 +2,7 @@
 import argparse
 from tqdm import trange
 import numpy as np
-from tinygrad.ops import Device
+from tinygrad import Device
 from typing import Optional
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Embedding, Linear, LayerNorm
@@ -16,7 +16,7 @@ from extra.datasets import fetch_cifar, cifar_mean, cifar_std
 from tinygrad import nn
 from tinygrad.nn.state import get_state_dict
 from tinygrad.nn import optim
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import GlobalCounters
 from tinygrad.shape.symbolic import Node

@@ -8,7 +8,7 @@ import sys, argparse, json
 import numpy as np
 np.set_printoptions(linewidth=200)
 from tinygrad.helpers import Timing, Profiling, getenv, DEBUG, dtypes
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.nn.state import safe_load, torch_load, load_state_dict, get_parameters
 from tinygrad.helpers import GlobalCounters

@@ -8,7 +8,7 @@ from collections import namedtuple

 from tqdm import tqdm
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import dtypes, GlobalCounters, Timing, Context, getenv, fetch
 from tinygrad.nn import Conv2d, Linear, GroupNorm, LayerNorm, Embedding
 from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
@@ -3,7 +3,7 @@ from extra.export_model import compile_net, jit_model
 from examples.stable_diffusion import StableDiffusion
 from tinygrad.nn.state import get_state_dict, safe_save, safe_load_metadata, torch_load, load_state_dict
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import fetch
 from typing import NamedTuple, Any, List
 from pathlib import Path

@@ -40,7 +40,7 @@ def _process_wrap(rank:int, device:str, oob:_OOB, fn:Callable, args=()):
   OOB = oob

   # do specific runtime initialization for distributed
-  from tinygrad.ops import Device
+  from tinygrad import Device
   device, device_num = Device.canonicalize(device), 0 if ":" not in device else int(device.split(":")[-1])
   if "GPU" in device:
     from tinygrad.runtime.ops_gpu import CL

@@ -33,7 +33,7 @@ except RuntimeError:

 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad.ops import Device
+from tinygrad import Device
 b = Tensor(nb)
 c = Tensor(nc)
 # TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@@ -6,7 +6,7 @@ import time, torch, torch.mps
 from tinygrad.helpers import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import colored, getenv, CI

 import os

@@ -10,7 +10,7 @@ from tinygrad.helpers import prod, getenv, DEBUG, dtypes, get_child
 from tinygrad.helpers import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.lazy import LazyBuffer
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.shape.view import strides_for_shape
 OSX = platform.system() == "Darwin"
 WINDOWS = platform.system() == "Windows"
@@ -14,10 +14,10 @@ from typing import Tuple, List
 from extra.utils import fetch
 from extra.onnx import get_run_onnx
 from tinygrad.graph import print_tree, log_schedule_item
-from tinygrad.tensor import Tensor
+from tinygrad import Tensor, Device
 from tinygrad.helpers import dtypes, partition, GlobalCounters, Context, DEBUG, getenv, ImageDType, GRAPH
 from tinygrad.realize import run_schedule
-from tinygrad.ops import LoadOps, Device, ScheduleItem
+from tinygrad.ops import LoadOps, ScheduleItem
 from tinygrad.features.image import fix_schedule_for_images
 Device.DEFAULT = "GPU"

@@ -4,7 +4,7 @@ import torch, json, argparse

 from examples.llama import LLaMa
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device

 class LLaMaAdaptor(BaseLM):
   def __init__(

@@ -10,7 +10,7 @@ from extra.utils import download_file
 from extra.onnx import get_run_onnx
 from tinygrad.helpers import OSX, DEBUG
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device

 MODELS = {
   "resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",

@@ -6,7 +6,7 @@ from tinygrad.nn.state import get_state_dict
 from tinygrad.helpers import GlobalCounters
 from tinygrad.runtime.lib import RawBuffer, LRUAllocator
 from tinygrad.helpers import dtypes, prod
-from tinygrad.ops import Device
+from tinygrad import Device
 from test.helpers import derandomize_model

 from examples.llama import Transformer
@@ -4,7 +4,7 @@ import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit
 from tinygrad.helpers import dtypes, CI
-from tinygrad.ops import Device
+from tinygrad import Device
 from test.helpers import derandomize_model

 from examples.llama import Transformer

@@ -5,7 +5,7 @@ import onnx.backend.test
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv, CI
-from tinygrad.ops import Device
+from tinygrad import Device

 # pip3 install tabulate
 pytest_plugins = 'onnx.backend.test.report',

@@ -3,9 +3,9 @@ import unittest, time
 import numpy as np
 from examples.llama import Transformer, MODEL_PARAMS
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.nn.state import get_state_dict
-from tinygrad.ops import Compiled
+from tinygrad.device import Compiled
 from tinygrad.helpers import dtypes, prod, Profiling
 from tinygrad.runtime.lib import RawBuffer

@@ -6,12 +6,11 @@ from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.features.search import get_linearizer_actions, bufs_from_lin, tuplize_uops
 from tinygrad.graph import print_tree
 from tinygrad.helpers import getenv
-from tinygrad.ops import Device, Compiled, Interpreted
+from tinygrad.device import Device, Compiled, Interpreted
 from tinygrad.lazy import vars_from_ast

 device = Device[Device.DEFAULT]


 def run_linearizer(lin: Linearizer, rawbufs=None, var_vals=None):
   if rawbufs is None: rawbufs = bufs_from_lin(lin)
   if var_vals is None: var_vals = {v: v.min for v in vars_from_ast(lin.ast)}
@@ -1,6 +1,6 @@
 import unittest
 import numpy as np
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv, CI

@@ -2,7 +2,7 @@
 import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 import torch

 def get_question_samp(bsz, seq_len, vocab_size, seed):

@@ -4,7 +4,7 @@ from tinygrad.tensor import Tensor
 from tinygrad.nn import optim
 from tinygrad.nn.state import get_parameters
 from tinygrad.jit import TinyJit
-from tinygrad.ops import Device, GlobalCounters
+from tinygrad import Device, GlobalCounters
 from tinygrad.helpers import CI, dtypes
 from test.helpers import derandomize_model

@@ -3,7 +3,7 @@ import pathlib
 import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device

 class TestVGG7(unittest.TestCase):
   def test_vgg7(self):

@@ -2,7 +2,7 @@ import unittest
 import pathlib
 from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform
 from tinygrad.helpers import CI, fetch
-from tinygrad.ops import Device
+from tinygrad import Device

 # Audio generated with the command on MacOS:
 # say "Could you please let me out of the box?" --file-format=WAVE --data-format=LEUI8@16000 -o test

@@ -7,7 +7,7 @@ from weakref import ref
 from tinygrad.helpers import GlobalCounters
 from tinygrad.runtime.lib import RawBuffer, LRUAllocator
 from tinygrad.helpers import dtypes, prod
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor

 def check_gc():
@@ -2,7 +2,7 @@
 import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import dtypes

 N = 200 # has to be bigger than the cache to fail

@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import Timing, CI
 import multiprocessing.shared_memory as shared_memory

@@ -9,7 +9,7 @@ from tinygrad.helpers import prod, dtypes
 # *** first, we implement the atan2 op at the lowest level ***
 # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
 from tinygrad.lazy import LazyBuffer, create_lazybuffer
-from tinygrad.ops import CompiledASTRunner, Device
+from tinygrad.device import CompiledASTRunner, Device
 from tinygrad.shape.shapetracker import ShapeTracker
 import pytest

@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 from tinygrad.helpers import CI, DTYPES_DICT, getenv, DType, DEBUG, ImageDType, PtrDType, OSX
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor, dtypes
 from typing import Any, List

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.jit import TinyJit

@@ -3,7 +3,7 @@ import unittest
 import secrets
 import string
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import diskcache

 def generate_random_string(length=16):
@@ -2,7 +2,7 @@
 import numpy as np
 import unittest
 from tinygrad.lazy import LazyBuffer
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor
 from tinygrad.shape.symbolic import Variable
 from tinygrad.jit import CacheCollector

@@ -3,13 +3,14 @@ import unittest, os

 from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
 from tinygrad.codegen.linearizer import Linearizer, UOp, UOps
-from tinygrad.ops import BufferOps, Compiled, ConstBuffer, Device, LazyOp, LoadOps, TernaryOps
+from tinygrad.device import Compiled, Device
+from tinygrad.ops import BufferOps, ConstBuffer, LazyOp, LoadOps, TernaryOps
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.tensor import Tensor
 from tinygrad.jit import CacheCollector
 from tinygrad.realize import run_schedule
-from tinygrad.helpers import dtypes, prod, getenv, CI
+from tinygrad.helpers import dtypes, prod

 class TestLinearizer(unittest.TestCase):
   def test_arg_dedup(self):
@@ -1,7 +1,7 @@
 import unittest
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.features.search import Opt, OptOps
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import OSX, CI
 from test.external.fuzz_linearizer import run_linearizer

@@ -5,7 +5,7 @@ import numpy as np
 import unittest
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, dtypes
-from tinygrad.ops import Device
+from tinygrad import Device

 if CI:
   import warnings

@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.shape.symbolic import Variable

 @unittest.skipUnless(Device.DEFAULT in ["GPU", "METAL", "CLANG", "CUDA", "LLVM"], f"{Device.DEFAULT} is not supported")
@@ -5,7 +5,8 @@
 import unittest
 from typing import List, Optional
 from tinygrad.tensor import Tensor
-from tinygrad.ops import LoadOps, Device, Compiled
+from tinygrad.ops import LoadOps
+from tinygrad.device import Device, Compiled
 from tinygrad.helpers import DEBUG, dtypes
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.graph import log_schedule_item, print_tree

@@ -2,7 +2,8 @@ import unittest

 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.features.search import time_linearizer
-from tinygrad.ops import Compiled, Device, LoadOps
+from tinygrad.device import Compiled, Device
+from tinygrad.ops import LoadOps
 from tinygrad.tensor import Tensor

 class TestTimeLinearizer(unittest.TestCase):
@@ -1,7 +1,7 @@
 import unittest
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import dtypes
-from tinygrad.ops import Device
+from tinygrad import Device
 import pytest
 # similar to test/external/external_test_gpu_ast.py, but universal

@@ -9,7 +9,7 @@ torch.set_num_threads(1)
 import time
 import numpy as np
 np.set_printoptions(linewidth=160)
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.helpers import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d

@@ -1,4 +1,4 @@
-from tinygrad.ops import Device
+from tinygrad import Device
 from tinygrad.tensor import Tensor
 import numpy as np
 import pickle
@@ -3,7 +3,8 @@ import unittest, math
 import numpy as np
 from tinygrad.helpers import dtypes, getenv, DType, PtrDType
 from tinygrad.tensor import Device
-from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, CompiledASTRunner, Compiled
+from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
+from tinygrad.device import CompiledASTRunner, Compiled
 from tinygrad.codegen.linearizer import UOps, UOp

 def _uops_to_prg(uops):
@@ -4,5 +4,5 @@ from tinygrad.shape.symbolic import Variable # noqa: F401
 from tinygrad.helpers import dtypes # noqa: F401

 # NOTE: these should not be relied on to be stable
-from tinygrad.ops import Device # noqa: F401
+from tinygrad.device import Device # noqa: F401
 from tinygrad.helpers import GlobalCounters # noqa: F401
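Since the package root now re-exports Device from tinygrad.device, both spellings refer to the same singleton; a quick illustrative check (not part of the diff):

    from tinygrad import Device as D1
    from tinygrad.device import Device as D2
    assert D1 is D2                      # same _Device singleton, just re-exported
    assert isinstance(D1.DEFAULT, str)   # e.g. "METAL" or "CPU"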
@@ -2,7 +2,8 @@ from __future__ import annotations
 import os, math, itertools
 from typing import NamedTuple, Optional, List, Tuple, cast, Dict, Union
 from tinygrad.lazy import vars_from_ast
-from tinygrad.ops import LazyOp, FlopCounter, get_lazyop_info, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps, Device, Compiled
+from tinygrad.ops import LazyOp, FlopCounter, get_lazyop_info, UnaryOps, BinaryOps, ReduceOps, MemBuffer, ConstBuffer, BufferOps
+from tinygrad.device import Device, Compiled
 from tinygrad.helpers import dedup, dtypes, colored, ImageDType, DType, ansilen, getenv, prod, DEBUG, round_up
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.shape.symbolic import sint
@@ -0,0 +1,200 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Union, Type, Any, List, Optional, Dict, Callable
+import importlib, inspect, functools, pathlib, time, re
+from tinygrad.helpers import ansilen, DEBUG, getenv, GlobalCounters, colored, BEAM, NOOPT, all_int, to_function_name
+from tinygrad.runtime.lib import RawBuffer
+from tinygrad.shape.symbolic import Variable, sym_infer, sint
+from tinygrad.ops import LazyOp, TernaryOps, get_lazyop_info, ReduceOps, BufferOps, BinaryOps, Op
+
+if TYPE_CHECKING:
+  from tinygrad.codegen.linearizer import Linearizer
+  from tinygrad.codegen.kernel import LinearizerOptions
+
+# **************** Device ****************
+
+class _Device:
+  def __init__(self) -> None: self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
+  def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT
+  @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
+  def __getitem__(self, x:str) -> Union[Interpreted, Compiled]:
+    x = x.split(":")[0].upper()
+    return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "buffer") and x in self._buffers][0]
+  @functools.cached_property
+  def DEFAULT(self) -> str:
+    device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, None)  # type: ignore
+    if device_from_env: return device_from_env
+    for device in ["METAL", "CUDA", "GPU"]:
+      try:
+        if self[device]: return device
+      except Exception: pass
+    return "CPU"
+Device = _Device()
+
+# **************** shared device helpers ****************
+
+class JITRunner:
+  def __init__(self):
+    self.op_estimate, self.mem_estimate = 0, 0
+  def exec(self, rawbufs:List[RawBuffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
+    var_vals = var_vals if var_vals is not None else {}
+    from tinygrad.jit import CacheCollector
+    et = self(rawbufs, var_vals)
+    CacheCollector.add(self, rawbufs, var_vals)
+    return et
+  def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
+    raise NotImplementedError("override this")
+
+def update_stats(name:str, op_estimate:sint, mem_estimate:sint, var_vals: Optional[Dict[Variable, int]], et: Optional[float], buf_count, jit=False, num_kernels=1, lra: Optional[Dict]=None):
+  if var_vals is None: var_vals = {}
+  op_estimate, mem_estimate = sym_infer(op_estimate, var_vals), sym_infer(mem_estimate, var_vals)
+  if DEBUG >= 2:
+    print(f"{colored(f'*** {GlobalCounters.kernel_count:4d}', ('magenta' if num_kernels == 1 else 'CYAN') if jit else None)} {name+' '*(37-ansilen(name))} arg {buf_count:3d} sz {str(lra.get('global_size', '') if lra else ''):18s} {str(lra.get('local_size', '') if lra else ''):12s} OPs {int(op_estimate/1e6):6d}M/{GlobalCounters.global_ops/1e9:7.2f}G mem {GlobalCounters.mem_used/1e9:5.2f} GB " +
+          (str() if et is None else f"tm {et*1e6:9.2f}us/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_estimate/((et or 1e-20)*1e9):8.2f} GFLOPS, {mem_estimate/((et or 1e-20)*1e9):7.2f} GB/s)"))
+  GlobalCounters.kernel_count += num_kernels
+  GlobalCounters.global_ops += op_estimate
+  GlobalCounters.global_mem += mem_estimate
+  if et is not None: GlobalCounters.time_sum_s += et
+
+# **************** for Interpreted Buffers ****************
+
+class InterpretedASTRunner(JITRunner):
+  def __init__(self, ast:LazyOp, fxn:Callable):
+    super().__init__()
+    self.fxn = fxn
+    info = get_lazyop_info(ast)
+    self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
+
+  def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> float:
+    st = time.perf_counter()
+    ret: RawBuffer = self.fxn(rawbufs[1:], var_vals)
+    et = time.perf_counter() - st
+    update_stats(f"<interpreted {ret.size}>", self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit)
+    assert rawbufs[0].dtype == ret.dtype, f"dtype mismatch in Interpreted, {rawbufs[0].dtype=} != {ret.dtype=}"
+    rawbufs[0].dtype, rawbufs[0].size, rawbufs[0]._buf, rawbufs[0].offset = ret.dtype, ret.size, ret._buf, ret.offset
+    return et
+
+class Interpreted:
+  def __init__(self, buffer: Type[RawBuffer], fxn_for_op:Dict[Op, Callable]):
+    self.buffer, self.fxn_for_op = buffer, fxn_for_op
+    self.synchronize, self.codegen, self.graph = lambda: None, None, None
+
+  @functools.lru_cache(None)  # pylint: disable=method-cache-max-size-none
+  def get_runner(self, ast:LazyOp) -> InterpretedASTRunner: return _get_interpreted_fxn(self.fxn_for_op, ast)
+
+def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> InterpretedASTRunner:
+  if DEBUG >= 3:
+    from tinygrad.graph import print_tree
+    print_tree(ast)
+  tglob: Dict[str, Any] = {"Variable": Variable}
+  lines: List[str] = []
+
+  @functools.lru_cache(None)
+  def gstr(x:Any, nm=None) -> str:
+    if ('Variable' in (str_arg := repr(x)) or 'NumNode' in str_arg):
+      str_arg = re.sub(r'Variable\(.*?\)', lambda m: f'var_vals[{str(m.group(0))}]', str_arg)
+      # TODO: (Variable - Variable) might create NumNode. can we remove it?
+      return re.sub(r'NumNode\((.*?)\)', r'\1', str_arg)
+    ret = str(nm).replace(".", "_") if nm else f"m{len(tglob):04d}"
+    tglob[ret] = x
+    return ret
+
+  @functools.lru_cache(None)
+  def _interpret_ast(ast:LazyOp) -> str:
+    if TernaryOps.MULACC in fxn_for_op and ast.op == ReduceOps.SUM and isinstance(ast.src[0], LazyOp) and ast.src[0].op == BinaryOps.MUL:
+      ast = LazyOp(TernaryOps.MULACC, ast.src[0].src, ast.arg)
+
+    if ast.op in BufferOps:
+      tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({gstr(ast.arg.val)}, {gstr(ast.arg.dtype)})" if ast.op == BufferOps.CONST else f"{gstr(fxn_for_op[ast.op], ast.op)}(inputs[{ast.arg.idx-1}])"
+      for mop,arg in ast.arg.st.to_movement_ops(): tmp = f"{gstr(fxn_for_op[mop], mop)}({tmp}, {gstr(arg)})"
+    else:
+      tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({', '.join([_interpret_ast(src) for src in ast.src] + ([gstr(ast.arg)] if ast.arg else []))})"
+
+    ret = f"a{len(lines)}"
+    lines.append(f" {ret} = {tmp}")
+    return ret
+
+  ret = _interpret_ast(ast)
+  src = '\n'.join(['def run(inputs, var_vals):'] + lines + [f" return {gstr(fxn_for_op[BufferOps.FROM_UNDERLYING], BufferOps.FROM_UNDERLYING)}({ret})" if BufferOps.FROM_UNDERLYING in fxn_for_op else f" return {ret}"])
+  if DEBUG >= 4: print(functools.reduce(lambda x,y: (x.replace(y[0], str(y[1])) if y[0][0:2] == "m0" else x), tglob.items(), src))
+  exec(compile(src, "<ast>", "exec"), tglob) # pylint: disable=exec-used
+  return InterpretedASTRunner(ast, tglob['run'])
+
+# **************** for Compiled Buffers ****************
+
+class CompiledASTRunner(JITRunner):
+  def __init__(self, ast:Optional[LazyOp], name:str, prg:str, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, runtime_args:Optional[dict]=None):
+    super().__init__()
+    if DEBUG >= 4: print(prg)
+    if global_size is not None: global_size = global_size + [1]*(3-len(global_size))
+    if local_size is not None: local_size = local_size + [1]*(3-len(local_size))
+    self.name, self.display_name, self.prg, self.global_size, self.local_size, self.runtime_args = \
+      to_function_name(name), name, prg, global_size, local_size, runtime_args if runtime_args is not None else {}
+    self.vars: List[Variable] = []
+    if ast:
+      info = get_lazyop_info(ast)
+      self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
+      from tinygrad.lazy import vars_from_ast
+      self.vars = vars_from_ast(ast)
+      assert all(v._val is None for v in self.vars), f"ASTRunner contains bound Variable {self.vars}"
+
+  def build(self, compiler, runtime):
+    self.lib = compiler.__wrapped__(self.prg) if getenv("DISABLE_COMPILER_CACHE") else compiler(self.prg)
+    self.clprg = runtime(self.name, self.lib)
+    return self
+
+  def launch_dims(self, var_vals):
+    global_size = [sym_infer(sz, var_vals) for sz in self.global_size] if self.global_size is not None else self.global_size
+    local_size = [sym_infer(sz, var_vals) for sz in self.local_size] if self.local_size is not None else self.local_size
+    return global_size, local_size
+
+  def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
+    global_size, local_size = self.launch_dims(var_vals)
+    if global_size is not None and local_size is None and all_int(self.global_size):  # type: ignore[arg-type]
+      # TODO: this is copied from get_program
+      from tinygrad.features.search import optimize_local_size
+      local_size = self.local_size = optimize_local_size(self.clprg, global_size, rawbufs)
+      global_size = self.global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
+    lra = self.runtime_args.copy()
+    if global_size: lra['global_size'] = global_size
+    if local_size and 'local_size' not in lra: lra['local_size'] = local_size
+    et = self.clprg(*rawbufs, *[var_vals[k] for k in self.vars], **lra, wait=wait or DEBUG>=2)
+    update_stats(self.display_name, self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit, lra=lra)
+    return et
+
+class Compiled:
+  def __init__(self, buffer: Type[RawBuffer], linearizer_opts:LinearizerOptions, renderer, compiler, runtime, synchronize=lambda: None, graph=None):
+    self.buffer, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.synchronize, self.graph = buffer, linearizer_opts, renderer, compiler, runtime, synchronize, graph
+
+  def to_program(self, k:Linearizer) -> CompiledASTRunner:
+    k.linearize()
+    src, runtime_args = self.renderer(to_function_name(k.name), k.uops)
+    return CompiledASTRunner(k.ast, k.name, src, k.global_size, k.local_size, runtime_args).build(self.compiler, self.runtime)
+
+  @functools.lru_cache(None)  # pylint: disable=method-cache-max-size-none
+  def get_runner(self, ast:LazyOp) -> CompiledASTRunner: return self.to_program(_get_optimized_linearizer(self.linearizer_opts, ast))
+
+def _get_optimized_linearizer(linearizer_opts:LinearizerOptions, ast:LazyOp) -> Linearizer:
+  if DEBUG >= 3:
+    from tinygrad.graph import print_tree
+    print_tree(ast)
+  from tinygrad.codegen.linearizer import Linearizer
+  k = Linearizer(ast, linearizer_opts)
+  if not NOOPT:
+    if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
+    if BEAM >= 1:
+      lins = [(("tc" if used_tensor_cores else "hc"), k)]
+      kb = Linearizer(ast, linearizer_opts)
+      kb.required_optimizations()
+      from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
+      # TODO: this shouldn't use Device.DEFAULT, it should get the device from the LinearizerOptions
+      test_rawbuffers = bufs_from_lin(kb)  # allocate scratch buffers for optimization
+      lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
+      if used_tensor_cores:
+        lins.append(("hc", Linearizer(ast, linearizer_opts)))
+        lins[-1][1].hand_coded_optimizations()
+      timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
+      if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
+      k = timed[0][1]
+  else:
+    k.required_optimizations()
+  return k
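A small usage sketch of the Device singleton defined above (illustrative, not part of the diff; which backends resolve depends on the tinygrad.runtime.ops_* modules importable on your machine):

    from tinygrad.device import Device

    # canonicalize() upper-cases the backend name and strips a ":0" suffix
    assert Device.canonicalize("gpu:0") == "GPU"
    assert Device.canonicalize("cuda:1") == "CUDA:1"
    assert Device.canonicalize(None) == Device.DEFAULT

    # indexing the singleton returns the backend object, an Interpreted or Compiled instance
    backend = Device[Device.DEFAULT]
    print(Device.DEFAULT, type(backend).__name__)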
@@ -1,7 +1,8 @@
 from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable
 import itertools, random, math, time
 from tinygrad.lazy import vars_from_ast
-from tinygrad.ops import Device, Compiled, MemBuffer
+from tinygrad.device import Device, Compiled
+from tinygrad.ops import MemBuffer
 from tinygrad.helpers import prod, ImageDType, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, all_int, colored, Timing
 from tinygrad.codegen.linearizer import Linearizer, UOp
 from tinygrad.runtime.lib import RawBuffer

@@ -2,7 +2,8 @@ from __future__ import annotations
 from typing import Callable, List, Tuple, Dict, cast, Union, Optional, TypeVar, Generic
 import functools, itertools, operator
 from tinygrad.helpers import DEBUG, DType, merge_dicts, getenv, all_int
-from tinygrad.ops import RawBuffer, Device, JITRunner, CompiledASTRunner
+from tinygrad.device import Device, JITRunner, CompiledASTRunner
+from tinygrad.runtime.lib import RawBuffer
 from tinygrad.tensor import Tensor
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, NumNode, Node
@@ -4,7 +4,7 @@ from typing import Dict, Union, List, Optional, Any, Tuple
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import dtypes, prod, argsort, DEBUG, Timing, GlobalCounters, CI, unwrap
 from tinygrad.shape.view import strides_for_shape
-from tinygrad.ops import Device
+from tinygrad import Device

 safe_dtypes = {"F16": dtypes.float16, "F32": dtypes.float32, "U8": dtypes.uint8, "I8": dtypes.int8, "I32": dtypes.int32, "I64": dtypes.int64}
 inverse_safe_dtypes = {v:k for k,v in safe_dtypes.items()}
tinygrad/ops.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
-import importlib, inspect, functools, pathlib, time, re
-from typing import TYPE_CHECKING, Union, Type, Tuple, Any, List, Dict, Callable, Mapping
+import functools
 from enum import Enum, auto
+from typing import TYPE_CHECKING, Union, Type, Tuple, Any, List, Optional, Dict, Callable, Mapping
-from tinygrad.helpers import ansilen, prod, DEBUG, getenv, GlobalCounters, DType, colored, BEAM, NOOPT, dedup, all_int, to_function_name
-from tinygrad.runtime.lib import RawBuffer
-from tinygrad.shape.symbolic import Variable, sym_infer, sint
+from tinygrad.helpers import prod, DType, dedup
+from tinygrad.shape.symbolic import Variable
 from dataclasses import dataclass

 # these are the llops your accelerator must implement, along with toCpu

@@ -26,8 +25,6 @@ OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOp
 if TYPE_CHECKING:
   from tinygrad.shape.shapetracker import ShapeTracker
   from tinygrad.lazy import LazyBuffer
-  from tinygrad.codegen.linearizer import Linearizer
-  from tinygrad.codegen.kernel import LinearizerOptions

 @dataclass(frozen=True)
 class MemBuffer:

@@ -83,26 +80,6 @@ class LazyOp:
   def shrink(self, _): raise NotImplementedError
   def stride(self, _): raise NotImplementedError

-# **************** Device ****************
-
-class _Device:
-  def __init__(self) -> None: self._buffers: List[str] = [x.stem[len("ops_"):].upper() for x in (pathlib.Path(__file__).parent/"runtime").iterdir() if x.stem.startswith("ops_")]
-  def canonicalize(self, device:Optional[str]) -> str: return (device.split(":", 1)[0].upper() + ((":"+device.split(":", 1)[1]) if ':' in device else '')).replace(":0", "") if device is not None else self.DEFAULT
-  @functools.lru_cache(maxsize=None)  # this class is a singleton, pylint: disable=method-cache-max-size-none
-  def __getitem__(self, x:str) -> Union[Interpreted, Compiled]:
-    x = x.split(":")[0].upper()
-    return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'tinygrad.runtime.ops_{x.lower()}')) if (cname.lower() == x.lower() + "buffer") and x in self._buffers][0]
-  @functools.cached_property
-  def DEFAULT(self) -> str:
-    device_from_env: Optional[str] = functools.reduce(lambda val, ele: ele if getenv(ele) == 1 else val, self._buffers, None)  # type: ignore
-    if device_from_env: return device_from_env
-    for device in ["METAL", "CUDA", "GPU"]:
-      try:
-        if self[device]: return device
-      except Exception: pass
-    return "CPU"
-Device = _Device()
-
 # **************** independent FlopCounter ****************

 @dataclass

@@ -130,174 +107,3 @@ def get_lazyop_info(ast:LazyOp) -> FlopCounter:
   @functools.lru_cache(None) # NOTE: this cache needs to be recreated for new ASTs
   def run_ast(ast): return InterpretedFlopCounter[ast.op](*([run_ast(x) for x in ast.src]+([ast.arg] if ast.arg is not None else [])))
   return run_ast(ast)
-
-# **************** GlobalCounters stats ****************
-
-def update_stats(name:str, op_estimate:sint, mem_estimate:sint, var_vals: Optional[Dict[Variable, int]], et: Optional[float], buf_count, jit=False, num_kernels=1, lra=None):
-  if var_vals is None: var_vals = {}
-  op_estimate, mem_estimate = sym_infer(op_estimate, var_vals), sym_infer(mem_estimate, var_vals)
-  if DEBUG >= 2:
-    print(f"{colored(f'*** {GlobalCounters.kernel_count:4d}', ('magenta' if num_kernels == 1 else 'CYAN') if jit else None)} {name+' '*(37-ansilen(name))} arg {buf_count:3d} sz {str(lra.get('global_size', '') if lra else ''):18s} {str(lra.get('local_size', '') if lra else ''):12s} OPs {int(op_estimate/1e6):6d}M/{GlobalCounters.global_ops/1e9:7.2f}G mem {GlobalCounters.mem_used/1e9:5.2f} GB " +
-          (str() if et is None else f"tm {et*1e6:9.2f}us/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({op_estimate/((et or 1e-20)*1e9):8.2f} GFLOPS, {mem_estimate/((et or 1e-20)*1e9):7.2f} GB/s)"))
-  GlobalCounters.kernel_count += num_kernels
-  GlobalCounters.global_ops += op_estimate
-  GlobalCounters.global_mem += mem_estimate
-  if et is not None: GlobalCounters.time_sum_s += et
-
-# **************** shared Runner that can go in the JIT ****************
-
-class JITRunner:
-  def __init__(self):
-    self.op_estimate, self.mem_estimate = 0, 0
-  def exec(self, rawbufs:List[RawBuffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]:
-    var_vals = var_vals if var_vals is not None else {}
-    from tinygrad.jit import CacheCollector
-    et = self(rawbufs, var_vals)
-    CacheCollector.add(self, rawbufs, var_vals)
-    return et
-  def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
-    raise NotImplementedError("override this")
-
-# **************** for Interpreted Buffers ****************
-
-class InterpretedASTRunner(JITRunner):
-  def __init__(self, ast:LazyOp, fxn:Callable):
-    super().__init__()
-    self.fxn = fxn
-    info = get_lazyop_info(ast)
-    self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
-
-  def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> float:
-    st = time.perf_counter()
-    ret: RawBuffer = self.fxn(rawbufs[1:], var_vals)
-    et = time.perf_counter() - st
-    update_stats(f"<interpreted {ret.size}>", self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit)
-    assert rawbufs[0].dtype == ret.dtype, f"dtype mismatch in Interpreted, {rawbufs[0].dtype=} != {ret.dtype=}"
-    rawbufs[0].dtype, rawbufs[0].size, rawbufs[0]._buf, rawbufs[0].offset = ret.dtype, ret.size, ret._buf, ret.offset
-    return et
-
-class Interpreted:
-  def __init__(self, buffer: Type[RawBuffer], fxn_for_op:Dict[Op, Callable]):
-    self.buffer, self.fxn_for_op = buffer, fxn_for_op
-    self.synchronize, self.codegen, self.graph = lambda: None, None, None
-
-  @functools.lru_cache(None)  # pylint: disable=method-cache-max-size-none
-  def get_runner(self, ast:LazyOp) -> InterpretedASTRunner: return _get_interpreted_fxn(self.fxn_for_op, ast)
-
-def _get_interpreted_fxn(fxn_for_op:Dict[Op, Callable], ast:LazyOp) -> InterpretedASTRunner:
-  if DEBUG >= 3:
-    from tinygrad.graph import print_tree
-    print_tree(ast)
-  tglob: Dict[str, Any] = {"Variable": Variable}
-  lines: List[str] = []
-
-  @functools.lru_cache(None)
-  def gstr(x:Any, nm=None) -> str:
-    if ('Variable' in (str_arg := repr(x)) or 'NumNode' in str_arg):
-      str_arg = re.sub(r'Variable\(.*?\)', lambda m: f'var_vals[{str(m.group(0))}]', str_arg)
-      # TODO: (Variable - Variable) might create NumNode. can we remove it?
-      return re.sub(r'NumNode\((.*?)\)', r'\1', str_arg)
-    ret = str(nm).replace(".", "_") if nm else f"m{len(tglob):04d}"
-    tglob[ret] = x
-    return ret
-
-  @functools.lru_cache(None)
-  def _interpret_ast(ast:LazyOp) -> str:
-    if TernaryOps.MULACC in fxn_for_op and ast.op == ReduceOps.SUM and isinstance(ast.src[0], LazyOp) and ast.src[0].op == BinaryOps.MUL:
-      ast = LazyOp(TernaryOps.MULACC, ast.src[0].src, ast.arg)
-
-    if ast.op in BufferOps:
-      tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({gstr(ast.arg.val)}, {gstr(ast.arg.dtype)})" if ast.op == BufferOps.CONST else f"{gstr(fxn_for_op[ast.op], ast.op)}(inputs[{ast.arg.idx-1}])"
-      for mop,arg in ast.arg.st.to_movement_ops(): tmp = f"{gstr(fxn_for_op[mop], mop)}({tmp}, {gstr(arg)})"
-    else:
-      tmp = f"{gstr(fxn_for_op[ast.op], ast.op)}({', '.join([_interpret_ast(src) for src in ast.src] + ([gstr(ast.arg)] if ast.arg else []))})"
-
-    ret = f"a{len(lines)}"
-    lines.append(f" {ret} = {tmp}")
-    return ret
-
-  ret = _interpret_ast(ast)
-  src = '\n'.join(['def run(inputs, var_vals):'] + lines + [f" return {gstr(fxn_for_op[BufferOps.FROM_UNDERLYING], BufferOps.FROM_UNDERLYING)}({ret})" if BufferOps.FROM_UNDERLYING in fxn_for_op else f" return {ret}"])
-  if DEBUG >= 4: print(functools.reduce(lambda x,y: (x.replace(y[0], str(y[1])) if y[0][0:2] == "m0" else x), tglob.items(), src))
-  exec(compile(src, "<ast>", "exec"), tglob) # pylint: disable=exec-used
-  return InterpretedASTRunner(ast, tglob['run'])
-
-# **************** for Compiled Buffers ****************
-
-class CompiledASTRunner(JITRunner):
-  def __init__(self, ast:Optional[LazyOp], name:str, prg:str, global_size:Optional[List[int]]=None, local_size:Optional[List[int]]=None, runtime_args:Optional[dict]=None):
-    super().__init__()
-    if DEBUG >= 4: print(prg)
-    if global_size is not None: global_size = global_size + [1]*(3-len(global_size))
-    if local_size is not None: local_size = local_size + [1]*(3-len(local_size))
-    self.name, self.display_name, self.prg, self.global_size, self.local_size, self.runtime_args = \
-      to_function_name(name), name, prg, global_size, local_size, runtime_args if runtime_args is not None else {}
-    self.vars: List[Variable] = []
-    if ast:
-      info = get_lazyop_info(ast)
-      self.op_estimate, self.mem_estimate = info.flops, info.mem_estimate
-      from tinygrad.lazy import vars_from_ast
-      self.vars = vars_from_ast(ast)
-      assert all(v._val is None for v in self.vars), f"ASTRunner contains bound Variable {self.vars}"
-
-  def build(self, compiler, runtime):
-    self.lib = compiler.__wrapped__(self.prg) if getenv("DISABLE_COMPILER_CACHE") else compiler(self.prg)
-    self.clprg = runtime(self.name, self.lib)
-    return self
-
-  def launch_dims(self, var_vals):
-    global_size = [sym_infer(sz, var_vals) for sz in self.global_size] if self.global_size is not None else self.global_size
-    local_size = [sym_infer(sz, var_vals) for sz in self.local_size] if self.local_size is not None else self.local_size
-    return global_size, local_size
-
-  def __call__(self, rawbufs:List[RawBuffer], var_vals:Dict[Variable, int], wait=False, jit=False) -> Optional[float]:
-    global_size, local_size = self.launch_dims(var_vals)
-    if global_size is not None and local_size is None and all_int(self.global_size):  # type: ignore[arg-type]
-      # TODO: this is copied from get_program
-      from tinygrad.features.search import optimize_local_size
-      local_size = self.local_size = optimize_local_size(self.clprg, global_size, rawbufs)
-      global_size = self.global_size = [g//l if g%l == 0 else g/l for g,l in zip(global_size, local_size)]
-    lra = self.runtime_args.copy()
-    if global_size: lra['global_size'] = global_size
-    if local_size and 'local_size' not in lra: lra['local_size'] = local_size
-    et = self.clprg(*rawbufs, *[var_vals[k] for k in self.vars], **lra, wait=wait or DEBUG>=2)
-    update_stats(self.display_name, self.op_estimate, self.mem_estimate, var_vals, et, len(rawbufs), jit, lra=lra)
-    return et
-
-class Compiled:
-  def __init__(self, buffer: Type[RawBuffer], linearizer_opts:LinearizerOptions, renderer, compiler, runtime, synchronize=lambda: None, graph=None):
-    self.buffer, self.linearizer_opts, self.renderer, self.compiler, self.runtime, self.synchronize, self.graph = buffer, linearizer_opts, renderer, compiler, runtime, synchronize, graph
-
-  def to_program(self, k:Linearizer) -> CompiledASTRunner:
-    k.linearize()
-    src, runtime_args = self.renderer(to_function_name(k.name), k.uops)
-    return CompiledASTRunner(k.ast, k.name, src, k.global_size, k.local_size, runtime_args).build(self.compiler, self.runtime)
-
-  @functools.lru_cache(None)  # pylint: disable=method-cache-max-size-none
-  def get_runner(self, ast:LazyOp) -> CompiledASTRunner: return self.to_program(_get_optimized_linearizer(self.linearizer_opts, ast))
-
-def _get_optimized_linearizer(linearizer_opts:LinearizerOptions, ast:LazyOp) -> Linearizer:
-  if DEBUG >= 3:
-    from tinygrad.graph import print_tree
-    print_tree(ast)
-  from tinygrad.codegen.linearizer import Linearizer
-  k = Linearizer(ast, linearizer_opts)
-  if not NOOPT:
-    if not (used_tensor_cores:=k.apply_tensor_cores(getenv("TC", 1))): k.hand_coded_optimizations()
-    if BEAM >= 1:
-      lins = [(("tc" if used_tensor_cores else "hc"), k)]
-      kb = Linearizer(ast, linearizer_opts)
-      kb.required_optimizations()
-      from tinygrad.features.search import beam_search, time_linearizer, bufs_from_lin
-      # TODO: this shouldn't use Device.DEFAULT, it should get the device from the LinearizerOptions
-      test_rawbuffers = bufs_from_lin(kb)  # allocate scratch buffers for optimization
-      lins.append((f"beam{BEAM.value}", beam_search(kb, test_rawbuffers, BEAM.value, bool(getenv("BEAM_ESTIMATE", 1)))))
-      if used_tensor_cores:
-        lins.append(("hc", Linearizer(ast, linearizer_opts)))
-        lins[-1][1].hand_coded_optimizations()
-      timed = sorted([(nm, tk, time_linearizer(tk, test_rawbuffers, allow_test_size=False, clear_l2=True)) for nm, tk in lins], key=lambda x: x[2])
-      if DEBUG >= 1: print(" < ".join(f"{nm:6s} : {lin.colored_shape(30, dense=True)} : {tm*1e6:8.2f} us" for nm, lin, tm in timed))
-      k = timed[0][1]
-  else:
-    k.required_optimizations()
-  return k
@@ -1,6 +1,7 @@
 from typing import List, cast, Dict, Callable
 import numpy as np
-from tinygrad.ops import ScheduleItem, LazyOp, LoadOps, Device, BufferOps
+from tinygrad.ops import ScheduleItem, LazyOp, LoadOps, BufferOps
+from tinygrad.device import Device
 from tinygrad.graph import log_schedule_item, print_tree
 from tinygrad.lazy import LazyBuffer
 from tinygrad.helpers import DEBUG, prod, all_int, IMAGE

@@ -1,6 +1,6 @@
 import time, ctypes, subprocess, platform, functools, pathlib, tempfile
 from typing import Any
-from tinygrad.ops import Compiled
+from tinygrad.device import Compiled
 from tinygrad.helpers import diskcache
 from tinygrad.runtime.lib import RawMallocBuffer
 from tinygrad.codegen.kernel import LinearizerOptions

@@ -1,7 +1,8 @@
 import numpy as np
 from typing import Callable, Dict, Tuple, Optional
 from tinygrad.helpers import dtypes, DType
-from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, MovementOps, ReduceOps, TernaryOps, Op, Interpreted
+from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, MovementOps, ReduceOps, TernaryOps, Op
+from tinygrad.device import Interpreted
 from tinygrad.runtime.lib import RawBuffer

 class RawNumpyBuffer(RawBuffer):

@@ -4,7 +4,7 @@ from typing import Optional, Tuple
 import numpy as np
 from pycuda.compiler import compile as cuda_compile
 from tinygrad.helpers import DEBUG, getenv, colored, diskcache
-from tinygrad.ops import Compiled
+from tinygrad.device import Compiled
 from tinygrad.runtime.lib import RawBufferCopyInOut, RawMallocBuffer, LRUAllocator
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cuda import CUDARenderer

@@ -5,7 +5,8 @@ from typing import Optional
 from typing import Callable, Dict, Tuple
 from tinygrad.helpers import prod, all_int, DType, OSX
 from tinygrad.runtime.lib import RawBufferMapped
-from tinygrad.ops import Interpreted, Op, MovementOps, UnaryOps, BufferOps
+from tinygrad.device import Interpreted
+from tinygrad.ops import Op, MovementOps, UnaryOps, BufferOps
 from tinygrad.shape.view import strides_for_shape
 MAP_LOCKED, MAP_POPULATE = 0x2000, 0x008000

@@ -6,7 +6,7 @@ import numpy as np
 import pyopencl as cl
 from typing import Optional, List, Tuple
 from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport, diskcache
-from tinygrad.ops import Compiled
+from tinygrad.device import Compiled
 from tinygrad.renderer.opencl import OpenCLRenderer
 from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer
 from tinygrad.codegen.kernel import LinearizerOptions

@@ -3,7 +3,7 @@ import ctypes
 import extra.hip_wrapper as hip
 from typing import Tuple, List, Any, Dict, cast, Optional, Callable
 from tinygrad.helpers import DEBUG, getenv, diskcache
-from tinygrad.ops import Compiled, CompiledASTRunner, update_stats
+from tinygrad.device import Compiled, CompiledASTRunner, update_stats
 from tinygrad.renderer.hip import HIPRenderer
 from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer
 from tinygrad.codegen.kernel import LinearizerOptions

@@ -1,6 +1,6 @@
 import time, ctypes
 from typing import ClassVar
-from tinygrad.ops import Compiled
+from tinygrad.device import Compiled
 from tinygrad.helpers import getenv, DEBUG, diskcache
 from ctypes import CFUNCTYPE
 from tinygrad.codegen.kernel import LinearizerOptions

@@ -3,7 +3,7 @@ import Metal, libdispatch
 from typing import List, Any, Tuple, Dict, cast, Optional
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.helpers import prod, getenv, DEBUG, DType, dtypes, diskcache, dedup
-from tinygrad.ops import Compiled, CompiledASTRunner, update_stats
+from tinygrad.device import Compiled, CompiledASTRunner, update_stats
 from tinygrad.renderer.metal import MetalRenderer
 from tinygrad.runtime.lib import RawBufferMapped, RawBuffer, LRUAllocator
 from tinygrad.shape.symbolic import Variable

@@ -1,7 +1,8 @@
 import torch
 import numpy as np
 from typing import Dict, Callable, Optional
-from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, MovementOps, TernaryOps, ReduceOps, Op, Interpreted
+from tinygrad.ops import BufferOps, UnaryOps, BinaryOps, MovementOps, TernaryOps, ReduceOps, Op
+from tinygrad.device import Interpreted
 from tinygrad.helpers import getenv, dtypes, prod, DType
 from tinygrad.runtime.ops_cpu import einsum_mulacc, shape_to_axis
 from tinygrad.runtime.lib import RawBuffer

@@ -3,7 +3,7 @@ import functools
 from wgpu.utils.device import get_default_device
 from tinygrad.runtime.lib import RawBufferCopyIn, LRUAllocator
 from tinygrad.helpers import dtypes, DType
-from tinygrad.ops import Compiled
+from tinygrad.device import Compiled
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.renderer.cstyle import uops_to_cstyle
 from tinygrad.renderer.wgsl import WGSLLanguage

@@ -9,7 +9,8 @@ import numpy as np

 from tinygrad.helpers import ImageDType, argfix, make_pair, getenv, IMAGE, DEBUG, flatten, DType, dtypes, prod, all_int, round_up
 from tinygrad.lazy import LazyBuffer
-from tinygrad.ops import Device, LoadOps
+from tinygrad.ops import LoadOps
+from tinygrad.device import Device
 from tinygrad.shape.symbolic import sint
 from tinygrad.realize import run_schedule
