
Merge branch 'master' into arm

pull/1421/head
Steven Anderson 2023-08-02 19:49:06 -04:00
commit 901051fcb1
36 changed files with 497 additions and 311 deletions


@@ -36,7 +36,7 @@ jobs:
     - name: Run mypy
       run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
     - name: Install SLOCCount
-      run: sudo apt install sloccount
+      run: sudo apt-get install sloccount
     - name: Check <5000 lines
       run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
@@ -111,14 +111,16 @@ jobs:
     steps:
     - name: Checkout Code
       uses: actions/checkout@v3
+    - name: Find faster apt mirror
+      uses: vegardit/fast-apt-mirror.sh@v1
    - name: Update packages
      run: |
        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-        sudo apt update
+        sudo apt-get update
    - name: Install OpenCL
      #run: sudo apt-get install -y pocl-opencl-icd
-      run: sudo apt install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
+      run: sudo apt-get install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
    - name: Set up Python 3.8
      uses: actions/setup-python@v4
      with:
@@ -184,7 +186,7 @@ jobs:
    - name: Run webgpu pytest
      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
    - name: Build WEBGPU Efficientnet
-      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
+      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.compile_efficientnet

  testdocker:
    name: Docker Test
@@ -222,19 +224,20 @@ jobs:
        key: ${{ matrix.backend }}
    - name: Set env
      run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
+    - name: Find faster apt mirror
+      uses: vegardit/fast-apt-mirror.sh@v1
    - name: Install packages (gpu)
      if: matrix.backend == 'gpu'
      run: |
        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-        sudo apt update && \
-        sudo apt install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
+        sudo apt-get update -y && \
+        sudo apt-get install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
    - name: Install packages (cuda)
      if: matrix.backend == 'cuda'
      run: |
-        export DEBIAN_FRONTEND=noninteractive
-        sudo apt update -y && \
-        sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc
+        sudo apt-get update -y && \
+        sudo apt-get install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc
    - name: Cache gpuocelot
      if: matrix.backend == 'cuda'
      id: cache-build
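The `apt` to `apt-get` switch is deliberate: `apt`'s command-line interface is not guaranteed stable and it warns when used in scripts, so `apt-get` is the safer choice in CI. The new `vegardit/fast-apt-mirror.sh` step selects a nearby package mirror before the first `apt-get update`, trimming install time on shared runners.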

.gitignore

@@ -12,7 +12,6 @@ dist
 /env
 a.out
 boxes.jpg
-weights/*
 pandecode.dump
 vertex.bin
 recognize*
@@ -31,9 +30,8 @@ extra/datasets/kits/
 extra/datasets/COCO/
 extra/datasets/audio*
 venv
-examples/webgpu/net.js
-examples/webgpu/net.safetensors
+examples/net.*[js,json,safetensors]
 node_modules
 package.json
 package-lock.json
 temp


@@ -5,11 +5,26 @@ Most of these are self-explanatory, and are usually used to set an option at run
 Example: `GPU=1 DEBUG=4 python3 -m pytest`

-The columns are: Variable, Possible Value(s) and Description.
-- A `#` means that the variable can take any integer value.
+However you can also decorate a function to set a value only inside that function.
+
+```python
+# in tensor.py (probably only useful if you are a tinygrad developer)
+@Context(DEBUG=4)
+def numpy(self) -> ...
+```
+
+Or use contextmanager to temporarily set a value inside some scope:
+
+```python
+with Context(DEBUG=0):
+  a = Tensor.ones(10, 10)
+  a *= 2
+```

 ## Global Variables

+The columns of this list are: Variable, Possible Value(s) and Description.
+- A `#` means that the variable can take any integer value.
+
 These control the behavior of core tinygrad even when used as a library.


@@ -9,7 +9,7 @@ We need some imports to get started:
 ```python
 import numpy as np
-import time
+from tinygrad.helpers import Timing
 ```

 ## Tensors
@@ -221,23 +221,22 @@ We will be using the same batch size of 64 and will be evaluating for 1000 of th
 # set training flag to false
 Tensor.training = False

-st = time.perf_counter()
-avg_acc = 0
-for step in range(1000):
-  # random sample a batch
-  samp = np.random.randint(0, X_test.shape[0], size=(64))
-  batch = Tensor(X_test[samp], requires_grad=False)
-  # get the corresponding labels
-  labels = Y_test[samp]
-  # forward pass
-  out = net(batch)
-  # calculate accuracy
-  pred = np.argmax(out.numpy(), axis=-1)
-  avg_acc += (pred == labels).mean()
-print(f"Test Accuracy: {avg_acc / 1000}")
-print(f"Time: {time.perf_counter() - st}")
+with Timing("Time: "):
+  avg_acc = 0
+  for step in range(1000):
+    # random sample a batch
+    samp = np.random.randint(0, X_test.shape[0], size=(64))
+    batch = Tensor(X_test[samp], requires_grad=False)
+    # get the corresponding labels
+    labels = Y_test[samp]
+    # forward pass
+    out = net(batch)
+    # calculate accuracy
+    pred = np.argmax(out.numpy(), axis=-1)
+    avg_acc += (pred == labels).mean()
+  print(f"Test Accuracy: {avg_acc / 1000}")
 ```

 ## And that's it
@@ -266,23 +265,22 @@ from tinygrad.jit import TinyJit
 def jit(x):
   return net(x).realize()

-st = time.perf_counter()
-avg_acc = 0
-for step in range(1000):
-  # random sample a batch
-  samp = np.random.randint(0, X_test.shape[0], size=(64))
-  batch = Tensor(X_test[samp], requires_grad=False)
-  # get the corresponding labels
-  labels = Y_test[samp]
-  # forward pass with jit
-  out = jit(batch)
-  # calculate accuracy
-  pred = np.argmax(out.numpy(), axis=-1)
-  avg_acc += (pred == labels).mean()
-print(f"Test Accuracy: {avg_acc / 1000}")
-print(f"Time: {time.perf_counter() - st}")
+with Timing("Time: "):
+  avg_acc = 0
+  for step in range(1000):
+    # random sample a batch
+    samp = np.random.randint(0, X_test.shape[0], size=(64))
+    batch = Tensor(X_test[samp], requires_grad=False)
+    # get the corresponding labels
+    labels = Y_test[samp]
+    # forward pass with jit
+    out = jit(batch)
+    # calculate accuracy
+    pred = np.argmax(out.numpy(), axis=-1)
+    avg_acc += (pred == labels).mean()
+  print(f"Test Accuracy: {avg_acc / 1000}")
 ```

 You will find that the evaluation time is much faster than before and that your accelerator utilization is much higher.
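`Timing` (from `tinygrad.helpers`, see the helpers.py change later in this commit) records `time.perf_counter_ns()` on entry and prints the prefixed elapsed time on exit, which is what lets these examples drop the manual `time.perf_counter()` bookkeeping.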


@@ -1,109 +1,67 @@
 from models.efficientnet import EfficientNet
 from tinygrad.tensor import Tensor
-from tinygrad.jit import TinyJit
+from tinygrad.state import safe_save
 from extra.utils import fetch
-import ast
-
-def compile_net(run, special_names):
-  functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0
-  for fxn,args in run.jit_cache:
-    functions[fxn.name] = fxn.prg  # NOTE: this assumes all with the same name are the same
-    cargs = []
-    for i,arg in enumerate(args):
-      key = id(arg)
-      if key not in bufs:
-        if key in special_names:
-          bufs[key] = (special_names[key], arg._memsz, key)
-        else:
-          bufs[key] = (f"buf_{bufnum}", arg._memsz, key)
-          bufnum += 1
-        if i > 0: bufs_to_save[bufs[key][0]] = arg  # if first usage of a buffer is not an output, and it's not a special name
-      cargs.append(bufs[key][0])
-    statements.append((fxn.name, cargs, fxn.global_size))
-  return functions, statements, bufs, bufs_to_save
-
-def jit_model(model, the_input):
-  @TinyJit
-  def run(x): return model.forward(x).realize()
-  # twice to run the JIT
-  the_output = run(the_input)
-  the_output = run(the_input)
-  # hack to put the inputs back
-  assert len(run.input_replace) == 1, f"didn't get one input to replace {run.input_replace}"
-  for (j,i),idx in run.input_replace.items():
-    run.jit_cache[j][1][i] = the_input.lazydata.realized
-  # TODO: fetch this from the jit in self.input_replace and self.ret (hint: use get_parameters on self.ret)
-  special_names = {id(the_input.lazydata.realized): "input", id(the_output.lazydata.realized): "outputs"}
-  return run, special_names
+from extra.export_model import export_model
+from tinygrad.helpers import getenv
+import ast, os

 if __name__ == "__main__":
   model = EfficientNet(0)
   model.load_from_pretrained()
-  run, special_names = jit_model(model, Tensor.randn(1,3,224,224))
-  functions, statements, bufs, bufs_to_save = compile_net(run, special_names)
-
-  # c header
-  cprog = ["#include <stdio.h>", "#include <math.h>", "#define max(x,y) ((x>y)?x:y)"]
-
-  # save the weights
-  for name,cl in bufs_to_save.items():
-    weight = ''.join(["\\x%02X"%x for x in bytes(cl._buf)])
-    cprog.append(f"unsigned char {name}_data[] = \"{weight}\";")
-
-  # image library!
-  cprog += ["#define STB_IMAGE_IMPLEMENTATION", fetch("https://raw.githubusercontent.com/nothings/stb/master/stb_image.h").decode('utf-8')]
-
-  # imagenet labels, move to datasets?
-  lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
-  lbls = ast.literal_eval(lbls.decode('utf-8'))
-  lbls = ['"'+lbls[i]+'"' for i in range(1000)]
-  cprog.append(f"char *lbls[] = {{{','.join(lbls)}}};")
-
-  # buffers (empty + weights)
-  cprog += [f"float {name}[{len}];" if name not in bufs_to_save else f"float *{name} = (float *){name}_data;" for name,len,_key in bufs.values()]
-
-  # the functions
-  cprog += list(functions.values())
-
-  # the net
-  cprog += ["void net() {"] + [f"{name}({', '.join(args)});" for (name, args, _global_size) in statements] + ["}"]
-
-  cprog += ["""
+  mode = "clang" if getenv("CLANG", "") != "" else "webgpu" if getenv("WEBGPU", "") != "" else ""
+  prg, inp_size, out_size, state = export_model(model, Tensor.randn(1,3,224,224), mode)
+  if getenv("CLANG", "") == "":
+    safe_save(state, os.path.join(os.path.dirname(__file__), "net.safetensors"))
+    ext = "js" if getenv("WEBGPU", "") != "" else "json"
+    with open(os.path.join(os.path.dirname(__file__), f"net.{ext}"), "w") as text_file:
+      text_file.write(prg)
+  else:
+    cprog = [prg]
+    # image library!
+    cprog += ["#define STB_IMAGE_IMPLEMENTATION", fetch("https://raw.githubusercontent.com/nothings/stb/master/stb_image.h").decode('utf-8').replace("half", "_half")]
+    # imagenet labels, move to datasets?
+    lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
+    lbls = ast.literal_eval(lbls.decode('utf-8'))
+    lbls = ['"'+lbls[i]+'"' for i in range(1000)]
+    cprog.append(f"char *lbls[] = {{{','.join(lbls)}}};")
+    cprog.append(f"float input[{inp_size}];")
+    cprog.append(f"float outputs[{out_size}];")
+    cprog.append("""
 int main(int argc, char* argv[]) {
   int DEBUG = getenv("DEBUG") != NULL ? atoi(getenv("DEBUG")) : 0;
   int X=0, Y=0, chan=0;
   stbi_uc *image = (argc > 1) ? stbi_load(argv[1], &X, &Y, &chan, 3) : stbi_load_from_file(stdin, &X, &Y, &chan, 3);
   assert(image != NULL);
   if (DEBUG) printf("loaded image %dx%d channels %d\\n", X, Y, chan);
   assert(chan == 3);
   // resize to input[1,3,224,224] and rescale
   for (int y = 0; y < 224; y++) {
     for (int x = 0; x < 224; x++) {
       // get sample position
       int tx = (x/224.)*X;
       int ty = (y/224.)*Y;
       for (int c = 0; c < 3; c++) {
         input[c*224*224 + y*224 + x] = (image[ty*X*chan + tx*chan + c] / 255.0 - 0.45) / 0.225;
       }
     }
   }
-  net();
+  net(input, outputs);
   float best = -INFINITY;
   int best_idx = -1;
   for (int i = 0; i < 1000; i++) {
     if (outputs[i] > best) {
       best = outputs[i];
       best_idx = i;
     }
   }
   if (DEBUG) printf("category : %d (%s) with %f\\n", best_idx, lbls[best_idx], best);
   else printf("%s\\n", lbls[best_idx]);
-}"""]
+}""")

-  # CLANG=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/showcase/stable_diffusion_by_tinygrad.jpg
-  # category : 281 (tabby, tabby cat) with 9.452788
-  print('\n'.join(cprog))
+    # CLANG=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/showcase/stable_diffusion_by_tinygrad.jpg
+    # category : 281 (tabby, tabby cat) with 9.452788
+    print('\n'.join(cprog))
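After this rewrite the script is a thin driver over the new `extra/export_model.py` (added below): with `CLANG=1` it still prints a standalone C program for the pipeline shown in the comment above, with `WEBGPU=1` it writes `net.js` plus the weights as `net.safetensors`, and with neither flag it writes a backend-agnostic `net.json` describing the kernels, statements, and buffers.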


@@ -1,87 +0,0 @@
-from os import path
-from examples.compile_efficientnet import compile_net, jit_model
-from models.efficientnet import EfficientNet
-from tinygrad.state import get_state_dict, safe_save
-from tinygrad.tensor import Tensor
-
-if __name__ == "__main__":
-  model = EfficientNet(0)
-  model.load_from_pretrained()
-  run, special_names = jit_model(model, Tensor.randn(1,3,224,224))
-  functions, statements, bufs, _bufs_to_save = compile_net(run, special_names)
-  state = get_state_dict(model)
-  weights = {id(x.lazydata.realized): name for name, x in state.items()}
-  safe_save(state, path.join(path.dirname(__file__), "net.safetensors"))
-  kernel_code = '\n\n'.join([f"const {key} = `{code.replace(key, 'main')}`;" for key, code in functions.items()])
-  kernel_names = ', '.join([name for (name, _args, _global_size) in statements])
-  kernel_calls = '\n    '.join([f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});" for i, (_name, args, global_size) in enumerate(statements) ])
-  bufs = '\n    '.join([f"const {buf[0]} = " + (f"createEmptyBuf(device, {buf[1]});" if buf[2] not in weights else f"createWeightBuf(device, {buf[1]}, getTensorBuffer(safetensor, metadata['{weights[buf[2]]}']))") + ";" for buf in bufs.values()])
-  prg = f"""const getTensorMetadata = (safetensorBuffer) => {{
-  const metadataLength = Number(new DataView(safetensorBuffer.buffer).getBigUint64(0, true));
-  const metadata = JSON.parse(new TextDecoder("utf8").decode(safetensorBuffer.subarray(8, 8 + metadataLength)));
-  return Object.fromEntries(Object.entries(metadata).filter(([k, v]) => k !== "__metadata__").map(([k, v]) => [k, {{...v, data_offsets: v.data_offsets.map(x => 8 + metadataLength + x)}}]));
-}};
-
-const getTensorBuffer = (safetensorBuffer, tensorMetadata) => {{
-  return safetensorBuffer.subarray(...tensorMetadata.data_offsets);
-}}
-
-const createEmptyBuf = (device, size) => {{
-  return device.createBuffer({{size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST }});
-}};
-
-const createWeightBuf = (device, size, data) => {{
-  const buf = device.createBuffer({{ mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE }});
-  new Uint8Array(buf.getMappedRange()).set(data);
-  buf.unmap();
-  return buf;
-}};
-
-const addComputePass = (device, commandEncoder, pipeline, bufs, workgroup) => {{
-  const bindGroup = device.createBindGroup({{layout: pipeline.getBindGroupLayout(0), entries: bufs.map((buffer, index) => ({{ binding: index, resource: {{ buffer }} }}))}});
-  const passEncoder = commandEncoder.beginComputePass();
-  passEncoder.setPipeline(pipeline);
-  passEncoder.setBindGroup(0, bindGroup);
-  passEncoder.dispatchWorkgroups(...workgroup);
-  passEncoder.end();
-}};
-
-{kernel_code}
-
-const setupNet = async (device, safetensor) => {{
-  const metadata = getTensorMetadata(safetensor);
-
-  {bufs}
-
-  const gpuWriteBuffer = device.createBuffer({{size:input.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE }});
-  const gpuReadBuffer = device.createBuffer({{ size: outputs.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }});
-
-  const kernels = [{kernel_names}];
-  const piplines = await Promise.all(kernels.map(name => device.createComputePipelineAsync({{layout: "auto", compute: {{ module: device.createShaderModule({{ code: name }}), entryPoint: "main" }}}})));
-
-  return async (data) => {{
-    await gpuWriteBuffer.mapAsync(GPUMapMode.WRITE);
-    new Float32Array(gpuWriteBuffer.getMappedRange()).set(data);
-    gpuWriteBuffer.unmap();
-
-    const commandEncoder = device.createCommandEncoder();
-    commandEncoder.copyBufferToBuffer(gpuWriteBuffer, 0, input, 0, gpuWriteBuffer.size);
-    {kernel_calls}
-    commandEncoder.copyBufferToBuffer(outputs, 0, gpuReadBuffer, 0, outputs.size);
-    const gpuCommands = commandEncoder.finish();
-    device.queue.submit([gpuCommands]);
-
-    await gpuReadBuffer.mapAsync(GPUMapMode.READ);
-    const resultBuffer = new Float32Array(gpuReadBuffer.size);
-    resultBuffer.set(new Float32Array(gpuReadBuffer.getMappedRange()));
-    gpuReadBuffer.unmap();
-    return resultBuffer;
-  }}
-}}
-"""
-  with open(path.join(path.dirname(__file__), "net.js"), "w") as text_file:
-    text_file.write(prg)


@@ -11,6 +11,8 @@ from tinygrad.state import torch_load, load_state_dict
 from tinygrad.helpers import getenv
 import tinygrad.nn as nn
 from tinygrad.tensor import Tensor
+import itertools
+import librosa

 # TODO: you have written this fifteen times
 class MultiHeadAttention:
@@ -104,30 +106,22 @@ class Whisper:
   def __call__(self, mel:Tensor, tokens:Tensor):
     return self.decoder(tokens, self.encoder(mel))

-# TODO: this is tragic. remove this
-import functools
-import itertools
-import torch
-import torchaudio
-import librosa
+RATE = 16000
+CHUNK = 1600
+RECORD_SECONDS = 10

-@functools.lru_cache(None)
-def get_filters(sample_rate, n_fft, n_mels): return torch.tensor(librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels))
-@functools.lru_cache(None)
-def get_window(n_fft): return torch.hann_window(n_fft)
-
-def prep_audio(waveform, sample_rate) -> Tensor:
+def prep_audio(waveform=None, sr=RATE) -> Tensor:
   N_FFT = 400
   HOP_LENGTH = 160
   N_MELS = 80
-  stft = torch.stft(waveform, N_FFT, HOP_LENGTH, window=get_window(N_FFT), return_complex=True)
-  magnitudes = stft[..., :-1].abs() ** 2
-  mel_spec = get_filters(sample_rate, N_FFT, N_MELS) @ magnitudes
-  log_spec = torch.clamp(mel_spec, min=1e-10).log10()
-  log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+  if waveform is None: waveform = np.zeros(N_FFT, dtype=np.float32)
+  stft = librosa.stft(waveform, n_fft=N_FFT, hop_length=HOP_LENGTH, window='hann', dtype=np.float32)
+  magnitudes = stft[..., :-1] ** 2
+  mel_spec = librosa.filters.mel(sr=sr, n_fft=N_FFT, n_mels=N_MELS) @ magnitudes
+  log_spec = np.log10(np.clip(mel_spec, 1e-10, mel_spec.max() + 1e8))
   log_spec = (log_spec + 4.0) / 4.0
   #print(waveform.shape, log_spec.shape)
-  return log_spec.numpy()
+  return log_spec

 LANGUAGES = {
   "en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian", "ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
@@ -175,12 +169,8 @@ def img(x):
   plt.imshow(x.numpy())
   plt.show()

-RATE = 16000
-CHUNK = 1600
-RECORD_SECONDS = 10
-
 def listener(q):
-  prep_audio(torch.zeros(300), RATE)
+  prep_audio()
   import pyaudio
   p = pyaudio.PyAudio()
   stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK)
@@ -205,7 +195,7 @@ if __name__ == "__main__":
   if len(sys.argv) > 1:
     # offline
-    waveform, sample_rate = torchaudio.load(sys.argv[1], normalize=True)
+    waveform, sample_rate = librosa.load(sys.argv[1], normalize=True)
     log_spec = prep_audio(waveform, sample_rate)
     lst = [enc._special_tokens["<|startoftranscript|>"]]
     dat = model.encoder(Tensor(log_spec)).realize()
@@ -234,7 +224,7 @@ if __name__ == "__main__":
       did_read = True
     if did_read:
       last_total = total.shape[1]
-      log_spec = prep_audio(torch.Tensor(total), RATE)
+      log_spec = prep_audio(waveform=Tensor(total).numpy(), sr=RATE)
       encoded_audio = model.encoder(Tensor(log_spec)).realize()
       out = model.decoder(Tensor([lst]), encoded_audio).realize()
       idx = out[0,-1].numpy().argmax()


@@ -0,0 +1,165 @@
+from typing import Tuple, Dict, List
+from tinygrad.helpers import DType
+from tinygrad.tensor import Device, Tensor
+from tinygrad.jit import TinyJit
+from tinygrad.state import get_state_dict
+import json
+
+def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]:
+  functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0
+  for fxn,args in run.jit_cache:
+    functions[fxn.name] = fxn.prg  # NOTE: this assumes all with the same name are the same
+    cargs = []
+    for i,arg in enumerate(args):
+      key = id(arg)
+      if key not in bufs:
+        if key in special_names:
+          bufs[key] = (special_names[key], arg._memsz, arg.dtype, key)
+        else:
+          bufs[key] = (f"buf_{bufnum}", arg._memsz, arg.dtype, key)
+          bufnum += 1
+        if i > 0: bufs_to_save[bufs[key][0]] = arg  # if first usage of a buffer is not an output, and it's not a special name
+      cargs.append(bufs[key][0])
+    statements.append((fxn.name, cargs, fxn.global_size, fxn.local_size))
+  return functions, statements, {name:(size, dtype, key) for (name,size,dtype,key) in bufs.values()}, bufs_to_save
+
+def jit_model(model, the_input:Tensor) -> Tuple[TinyJit,Dict[int,str]]:
+  assert hasattr(model, "forward") or callable(model), "model needs a forward function"
+  @TinyJit
+  def run(x): return (model.forward(x) if hasattr(model, "forward") else model(x)).realize()
+  # twice to run the JIT
+  for _ in range(2): the_output = run(the_input)
+  # hack to put the inputs back
+  assert len(run.input_replace) == 1, f"didn't get one input to replace {run.input_replace}"
+  for (j,i),idx in run.input_replace.items():
+    run.jit_cache[j][1][i] = the_input.lazydata.realized
+  # TODO: fetch this from the jit in self.input_replace and self.ret (hint: use get_parameters on self.ret)
+  special_names = {id(the_input.lazydata.realized): "input", id(the_output.lazydata.realized): "outputs"}
+  return run, special_names
+
+def export_model_clang(functions:Dict[str,str], statements:Dict[str,Tuple[str,int,int]], bufs:Dict[str,Tuple[str,int,int]], bufs_to_save:Dict[str,Tensor]) -> str:
+  from tinygrad.runtime.ops_clang import CLANG_PROGRAM_HEADER
+  cprog = [CLANG_PROGRAM_HEADER]
+  for name,cl in bufs_to_save.items():
+    weight = ''.join(["\\x%02X"%x for x in bytes(cl._buf)])
+    cprog.append(f"unsigned char {name}_data[] = \"{weight}\";")
+  cprog += [f"float {name}[{len}];" if name not in bufs_to_save else f"float *{name} = (float *){name}_data;" for name,(len,dtype,_key) in bufs.items() if name not in ['input', 'outputs']]
+  cprog += list(functions.values())
+  cprog += ["void net(float* input, float* outputs) {"] + [f"{name}({', '.join(args)});" for (name, args, _global_size, _local_size) in statements] + ["}"]
+  return '\n'.join(cprog)
+
+def export_model_webgpu(functions, statements, bufs, bufs_to_save, weight_names) -> Tuple[str,int,int]:
+  kernel_code = '\n\n'.join([f"const {key} = `{code.replace(key, 'main')}`;" for key, code in functions.items()])
+  kernel_names = ', '.join([name for (name, _args, _global_size, _local_size) in statements])
+  kernel_calls = '\n    '.join([f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});" for i, (_name, args, global_size, _local_size) in enumerate(statements) ])
+  _bufs = '\n    '.join([f"const {name} = " + (f"createEmptyBuf(device, {size});" if _key not in weight_names else f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weight_names[_key]}']))") + ";" for name,(size,dtype,_key) in bufs.items()])
+  return f"""
+const getTensorMetadata = (safetensorBuffer) => {{
+  const metadataLength = Number(new DataView(safetensorBuffer.buffer).getBigUint64(0, true));
+  const metadata = JSON.parse(new TextDecoder("utf8").decode(safetensorBuffer.subarray(8, 8 + metadataLength)));
+  return Object.fromEntries(Object.entries(metadata).filter(([k, v]) => k !== "__metadata__").map(([k, v]) => [k, {{...v, data_offsets: v.data_offsets.map(x => 8 + metadataLength + x)}}]));
+}};
+
+const getTensorBuffer = (safetensorBuffer, tensorMetadata) => {{
+  return safetensorBuffer.subarray(...tensorMetadata.data_offsets);
+}}
+
+const createEmptyBuf = (device, size) => {{
+  return device.createBuffer({{size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST }});
+}};
+
+const createWeightBuf = (device, size, data) => {{
+  const buf = device.createBuffer({{ mappedAtCreation: true, size, usage: GPUBufferUsage.STORAGE }});
+  new Uint8Array(buf.getMappedRange()).set(data);
+  buf.unmap();
+  return buf;
+}};
+
+const addComputePass = (device, commandEncoder, pipeline, bufs, workgroup) => {{
+  const bindGroup = device.createBindGroup({{layout: pipeline.getBindGroupLayout(0), entries: bufs.map((buffer, index) => ({{ binding: index, resource: {{ buffer }} }}))}});
+  const passEncoder = commandEncoder.beginComputePass();
+  passEncoder.setPipeline(pipeline);
+  passEncoder.setBindGroup(0, bindGroup);
+  passEncoder.dispatchWorkgroups(...workgroup);
+  passEncoder.end();
+}};
+
+{kernel_code}
+
+const setupNet = async (device, safetensor) => {{
+  const metadata = getTensorMetadata(safetensor);
+
+  {_bufs}
+
+  const gpuWriteBuffer = device.createBuffer({{size:input.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE }});
+  const gpuReadBuffer = device.createBuffer({{ size: outputs.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }});
+
+  const kernels = [{kernel_names}];
+  const piplines = await Promise.all(kernels.map(name => device.createComputePipelineAsync({{layout: "auto", compute: {{ module: device.createShaderModule({{ code: name }}), entryPoint: "main" }}}})));
+
+  return async (data) => {{
+    await gpuWriteBuffer.mapAsync(GPUMapMode.WRITE);
+    new Float32Array(gpuWriteBuffer.getMappedRange()).set(data);
+    gpuWriteBuffer.unmap();
+
+    const commandEncoder = device.createCommandEncoder();
+    commandEncoder.copyBufferToBuffer(gpuWriteBuffer, 0, input, 0, gpuWriteBuffer.size);
+    {kernel_calls}
+    commandEncoder.copyBufferToBuffer(outputs, 0, gpuReadBuffer, 0, outputs.size);
+    const gpuCommands = commandEncoder.finish();
+    device.queue.submit([gpuCommands]);
+
+    await gpuReadBuffer.mapAsync(GPUMapMode.READ);
+    const resultBuffer = new Float32Array(gpuReadBuffer.size);
+    resultBuffer.set(new Float32Array(gpuReadBuffer.getMappedRange()));
+    gpuReadBuffer.unmap();
+    return resultBuffer;
+  }}
+}}
+""" + f"\n\nconst loadNet = async (device) => {{ return await fetch('net.safetensors').then(x => x.arrayBuffer()).then(x => setupNet(device, new Uint8Array(x))); }}"
+
+def export_model(model, input:Tensor, target:str):
+  assert Device.DEFAULT in ["WEBGPU", "CLANG", "CUDA", "GPU", "METAL"], "only WEBGPU, CLANG, CUDA, GPU, METAL are supported"
+  run,special_names = jit_model(model, input)
+  functions, statements, bufs, bufs_to_save = compile_net(run, special_names)
+  state = get_state_dict(model)
+  weight_names = {id(x.lazydata.realized): name for name, x in state.items()}
+  prg = ""
+  if target == "clang":
+    prg = export_model_clang(functions, statements, bufs, bufs_to_save)
+  elif target == "webgpu":
+    prg = export_model_webgpu(functions, statements, bufs, bufs_to_save, weight_names)
+  else:
+    prg = json.dumps({
+      "backend": Device.DEFAULT,
+      "input": {
+        "size": bufs['input'][0],
+        "dtype": bufs['input'][1].name
+      },
+      "output": {
+        "size": bufs["outputs"][0],
+        "dtype": bufs["outputs"][1].name
+      },
+      "functions": functions,
+      "statements": [{
+        "kernel": kernel,
+        "args": args,
+        "global_size": global_size,
+        "local_size": local_size
+      } for (kernel, args, global_size, local_size) in statements],
+      "buffers": {
+        name: {
+          "size": size,
+          "dtype": dtype.name,
+          "id": weight_names[_key] if _key in weight_names else ""
+        } for name, (size,dtype,_key) in bufs.items() if name not in ["input", "outputs"]
+      }
+    })
+  return prg, bufs['input'][0], bufs['outputs'][0], state
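For context, a minimal sketch of driving the new export path (all names here come from this diff; note that `export_model` asserts `Device.DEFAULT` is one of WEBGPU/CLANG/CUDA/GPU/METAL):

```python
from models.efficientnet import EfficientNet
from extra.export_model import export_model
from tinygrad.state import safe_save
from tinygrad.tensor import Tensor

model = EfficientNet(0)
model.load_from_pretrained()
# "webgpu" returns the generated JS runtime (setupNet/loadNet) as a string,
# plus the flattened input/output sizes and the state dict holding the weights
prg, inp_size, out_size, state = export_model(model, Tensor.randn(1,3,224,224), "webgpu")
safe_save(state, "net.safetensors")  # fetched at load time by the generated loadNet()
with open("net.js", "w") as f:
  f.write(prg)
```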


@@ -58,6 +58,7 @@ extern "C" __global__ void test(float* c, __half* a, __half* b) {{
   }}
   for (int y = 0; y < {KY}; y++) {{
     for (int x = 0; x < {KX}; x++) {{
+      __syncthreads();
       c_frag[y][x] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag[x], b_frag[y], c_frag[y][x]);
     }}
   }}


@@ -1,2 +1,6 @@
 [pytest]
-markers = ['exclude_cuda', 'exclude_gpu', 'exclude_clang', 'webgpu']
+markers =
+  exclude_cuda
+  exclude_gpu
+  exclude_clang
+  webgpu
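pytest's `markers` ini option is parsed as one marker per line, so the old list-literal form effectively registered the whole list as a single marker name. With the corrected layout, runs can filter on the markers, e.g. `python -m pytest -m 'webgpu'` (as the workflow above does) or `python -m pytest -m 'not exclude_cuda'`.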


@@ -5,9 +5,6 @@ from tinygrad.tensor import Tensor
 from tinygrad.lazy import LAZY
 from tinygrad.ops import GlobalCounters
 from tinygrad.graph import nm
-import pytest
-
-pytestmark = pytest.mark.webgpu

 N = 200  # has to be bigger than the cache to fail


@@ -3,7 +3,7 @@ import numpy as np
 from tinygrad.tensor import Tensor
 import pytest

-pytestmark = [pytest.mark.exclude_cuda, pytest.mark.webgpu]
+pytestmark = [pytest.mark.exclude_cuda]

 class TestConv(unittest.TestCase):
   def test_simple(self):


@@ -90,5 +90,14 @@ with Context(VARIABLE=1):
         self.assertEqual(VARIABLE.value, 1)
       self.assertEqual(VARIABLE.value, 0)

+  def test_decorator(self):
+    @Context(VARIABLE=1, DEBUG=4)
+    def test():
+      self.assertEqual(VARIABLE.value, 1)
+    self.assertEqual(VARIABLE.value, 0)
+    test()
+    self.assertEqual(VARIABLE.value, 0)
+
 if __name__ == '__main__':
   unittest.main()


@@ -3,6 +3,7 @@ import numpy as np
 import unittest
 from tinygrad.lazy import LazyBuffer
 from tinygrad.tensor import Tensor
+from tinygrad.shape.symbolic import Variable

 class TestLazyBuffer(unittest.TestCase):
   def test_fromcpu_buffer_sharing(self):
@@ -43,5 +44,29 @@ class TestLazyBuffer(unittest.TestCase):
     z = Tensor([1, np.e]).numpy()
     np.testing.assert_allclose(y, z)

+class TestVariableBuffer(unittest.TestCase):
+  def test_get_variable_buffers_no_variable(self):
+    t = Tensor.rand(2, 3)
+    assert t.lazydata.get_variable_buffers() == {}
+
+  def test_get_variable_buffers_one_variable(self):
+    v = Variable("v", 1, 10)
+    t = Tensor.rand(2, 3).reshape(v, 3)
+    buffers = t.lazydata.get_variable_buffers()
+    assert len(buffers) == 1 and buffers[v].realize().realized.toCPU() == 2
+    v = Variable("v", 1, 10)
+    t = Tensor.rand(2, 3).reshape(2, v)
+    buffers = t.lazydata.get_variable_buffers()
+    assert len(buffers) == 1 and buffers[v].realize().realized.toCPU() == 3
+
+  def test_get_variable_buffers_cat(self):
+    v1 = Variable("v1", 1, 10)
+    v2 = Variable("v2", 1, 10)
+    t1 = Tensor.rand(2, 3).reshape(v1, 3)
+    t2 = Tensor.rand(6, 3).reshape(v2, 3)
+    t = t1.cat(t2)
+    buffers = t.lazydata.get_variable_buffers()
+    assert len(buffers) == 2 and buffers[v1].realize().realized.toCPU() == 2 and buffers[v2].realize().realized.toCPU() == 6
+
 if __name__ == "__main__":
   unittest.main()


@@ -9,7 +9,7 @@ from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTransp
 import torch
 import pytest

-pytestmark = [pytest.mark.exclude_cuda, pytest.mark.webgpu]
+pytestmark = [pytest.mark.exclude_cuda]

 class TestNN(unittest.TestCase):


@@ -6,9 +6,6 @@ import unittest
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv, IMAGE, DEBUG, CI
 from tinygrad.lazy import Device
-import pytest
-
-pytestmark = pytest.mark.webgpu

 if CI:
   import warnings
@@ -612,6 +609,12 @@ class TestOps(unittest.TestCase):
   def test_pad2d(self):
     helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad2d(padding=(1,2,3,4)))
+    helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (-1,2,-3,4)), lambda x: x.pad2d(padding=(-1,2,-3,4)))
+    helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4), value=5), lambda x: x.pad2d(padding=(1,2,3,4),value=5))
+    helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (-1,2,-3,4), value=5), lambda x: x.pad2d(padding=(-1,2,-3,4),value=5))
+
+  def test_pad(self):
+    helper_test_op([(3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad(((3,4),(1,2))))
+    helper_test_op([(3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4), value=5), lambda x: x.pad(((3,4), (1,2)), value=5))

   def test_transpose(self):
     helper_test_op([(3,3,3)], lambda x: x.transpose(1,2), lambda x: x.transpose(1,2))
@@ -804,7 +807,7 @@ class TestOps(unittest.TestCase):
       lambda x,w: torch.nn.functional.conv_transpose3d(x,w).relu(),
       lambda x,w: Tensor.conv_transpose2d(x,w).relu(), atol=1e-4, grad_rtol=1e-5)

-  @unittest.skipIf((IMAGE>0 or (Device.DEFAULT == "WEBGPU" and getenv("CI","") != "")), "no conv1d on images")
+  @unittest.skipIf((IMAGE>0), "no conv1d on images")
   def test_conv1d(self):
     for bs in [1,8]:
       for cin in [1,3]:


@@ -18,7 +18,7 @@ from tinygrad.helpers import colored, getenv, DEBUG, CI
 from tinygrad.jit import TinyJit
 import pytest

-pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang, pytest.mark.webgpu]
+pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
@@ -130,7 +130,7 @@ class TestBigSpeed(unittest.TestCase):
   def test_large_conv_3x3(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=3, img_size_y=130, img_size_x=130)
   def test_large_conv_5x5(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=5, img_size_y=130, img_size_x=130)

-@unittest.skipIf((getenv("BIG") == 1 or Device.DEFAULT == "WEBGPU"), "only big tests")
+@unittest.skipIf((getenv("BIG") == 1), "only big tests")
 class TestSpeed(unittest.TestCase):
   def test_sub(self):
     def f(a, b): return a-b


@@ -30,7 +30,57 @@ class TestSymbolic(unittest.TestCase):
     st = t1.lazydata.st
     assert st.shape == (i+j+k, 4)
     assert st.real_strides() == (4, 1)
+    i = Variable("i", 1, 5)
+    j = Variable("j", 1, 5)
+    k = Variable("k", 1, 5)
     t1 = Tensor.rand(3, 4).reshape(3, i).cat(Tensor.rand(3, 4).reshape(3, j), dim=1).cat(Tensor.rand(3, 4).reshape(3, k), dim=1)
     st = t1.lazydata.st
     assert st.shape == (3, i+j+k)
     assert st.real_strides() == (i+j+k, 1)

+class TestSymbolicReshape(unittest.TestCase):
+  def test_reshape_into_symbols_simple(self):
+    for i in range(1, 5):
+      vi = Variable("i", 1, 10)
+      assert Tensor.rand(i, 4).reshape(vi, 4).shape == (vi, 4)
+      assert vi.val == i
+      vi = Variable("i", 1, 10)
+      assert Tensor.rand(i, 6).reshape(vi, 2, 3).shape == (vi, 2, 3)
+      assert vi.val == i
+
+  def test_reshape_symbols_reshape_ints(self):
+    for i in range(1, 5):
+      vi = Variable("i", 1, 10)
+      assert Tensor.rand(i, 4).reshape(vi, 4).reshape(i, 4).shape == (i, 4)
+      assert Tensor.rand(i, 4).reshape(vi, 4).reshape(i*4,).shape == (i*4,)
+      assert Tensor.rand(i, 6).reshape(vi, 6).reshape(i*2, 3).shape == (i*2, 3)
+      with self.assertRaises(AssertionError):
+        Tensor.rand(i, 6).reshape(vi, 6).reshape(1, 77).shape
+
+  def test_reshape_reuse_var_same_value_ok(self):
+    for i in range(1, 5):
+      vi = Variable("i", 1, 10)
+      a = Tensor.rand(i, 4).reshape(vi, 4)
+      b = Tensor.rand(i, 3).reshape(vi, 3)
+      assert vi.val == i
+
+  def test_reshape_reuse_var_different_value_fail(self):
+    for i in range(1, 5):
+      vi = Variable("i", 1, 10)
+      a = Tensor.rand(i, 4).reshape(vi, 2)
+      with self.assertRaises(AssertionError):
+        b = Tensor.rand(i, 3).reshape(vi, 3)
+
+  def test_reshape_into_symbols_bad_shape(self):
+    vi = Variable("i", 1, 10)
+    vj = Variable("j", 1, 10)
+    with self.assertRaises(AssertionError):
+      t = Tensor.rand(3, 4).reshape(vi, vj)
+    with self.assertRaises(AssertionError):
+      t = Tensor.rand(4, 4).reshape(vi, vi)
+    with self.assertRaises(AssertionError):
+      t = Tensor.rand(4, 6).reshape(vi, 6).reshape(vi, 4)
+    with self.assertRaises(AssertionError):
+      t = Tensor.rand(100, 4).reshape(Variable("too_small", 1, 10), 4)
+    with self.assertRaises(AssertionError):
+      t = Tensor.rand(3, 4).reshape(Variable("too_big", 100, 200), 4)


@@ -2,13 +2,9 @@ import dataclasses
 import numpy as np
 import torch
 import unittest
-import itertools
-from tinygrad.tensor import Tensor, Device
+from tinygrad.tensor import Tensor
 from tinygrad.helpers import dtypes
 from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
-import pytest
-
-pytestmark = pytest.mark.webgpu

 x_init = np.random.randn(1,3).astype(np.float32)
 U_init = np.random.randn(3,3).astype(np.float32)


@@ -188,7 +188,6 @@ class TestIndexExpressions2d(unittest.TestCase):
       st.expand((base_shape[0], base_shape[1], base_shape[1]))
       self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx//(base_shape[1]*base_shape[1])%base_shape[0]*base_shape[1] + idx%base_shape[1] + offset)
       self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
-
   def test_permute_reshape_1(self): # This tests multiple views
     for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
       st.permute((1, 0))


@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import unittest
-from tinygrad.shape.symbolic import MulNode, SumNode, Variable, NumNode, Node
+from tinygrad.shape.symbolic import MulNode, SumNode, Variable, NumNode, sym_vars

 class TestSymbolic(unittest.TestCase):
   def helper_test_variable(self, v, n, m, s):
@@ -240,6 +240,36 @@ class TestSymbolicNumeric(unittest.TestCase):
   def test_times_2_plus_3_div_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)//4)
   def test_times_2_plus_3_div_4_mod_4(self): self.helper_test_numeric(lambda x: ((x*2 + 3)//4)%4)

+class TestSymbolicVars(unittest.TestCase):
+  def test_simple(self):
+    z = NumNode(0)
+    a = Variable("a", 0, 10)
+    b = Variable("b", 0, 10)
+    c = Variable("c", 0, 10)
+    assert z.vars() == z.vars() == []
+    assert a.vars() == a.vars() == [a]
+    m = MulNode(a, 3)
+    assert m.vars() == [a]
+    s = SumNode([a, b, c])
+    assert s.vars() == [a, b, c]
+
+  def test_compound(self):
+    a = Variable("a", 0, 10)
+    b = Variable("b", 0, 10)
+    c = Variable("c", 0, 10)
+    # TODO: update this after we support symbolic * symbolic
+    assert (a + b * c).vars() == [a, b]
+    assert (a % 3 + b // 5).vars() == [a, b]
+    assert (a + b + c - a).vars() == [b, c]
+
+  def test_sym_vars(self):
+    a = Variable("a", 0, 10)
+    b = Variable("b", 0, 10)
+    assert sym_vars(1) == []
+    assert sym_vars(a) == [a]
+    assert sym_vars(a+b) == [a, b]
+    assert sym_vars(a*3) == [a]
+
 if __name__ == '__main__':
   unittest.main()
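From these tests, the new `sym_vars` helper normalizes an int-or-Node shape entry into its list of `Variable`s. A sketch consistent with the tested behavior (the real implementation lives in `tinygrad/shape/symbolic.py` and may differ in detail):

```python
from typing import List, Union
from tinygrad.shape.symbolic import Node, Variable

def sym_vars(x: Union[Node, int]) -> List[Variable]:
  # plain ints carry no symbolic variables; Nodes report theirs via .vars()
  return [] if isinstance(x, int) else x.vars()
```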


@@ -21,6 +21,7 @@ class CStyleLanguage(NamedTuple):
   gid: List[str] = []
   lid: List[str] = []
   global_max: List[int] = []
+  local_max: List[int] = []
   extra_args: List[str] = []
   float4: Optional[str] = None
   half_prekernel: Optional[str] = None
@@ -195,7 +196,7 @@ class CStyleCodegen(Linearizer):
   def codegen(self):
     self.process()
-    if self.lang.global_max: self.limit_global_dims(len(self.lang.gid), self.lang.global_max)  # NOTE: this is optional now
+    if self.lang.global_max: self.limit_global_dims(len(self.lang.gid), self.lang.global_max, self.lang.local_max)  # NOTE: this is optional now
     self.linearize()

     prg, global_size, local_size = uops_to_cstyle(self.uops, self.lang)


@@ -595,8 +595,20 @@ class Linearizer:
     for i,x in enumerate(rets): self.sts[i].reshape(tuple([y[0] for y in x]))

   # ******************** GPU simplifiers ********************
+  def _limit_size(self, x: Tuple[int], max_size: List) -> Tuple[int, ...]:
+    new_shape,dims = list(x), len(x)
+    for i in range(dims):
+      next_idx = (i + 1) % dims
+      while new_shape[i] > max_size[i]:
+        new_shape[i] = new_shape[i] // 2
+        if (new_shape[next_idx] <= max_size[next_idx]):
+          new_shape[next_idx] = new_shape[next_idx] * 2
+        else:
+          next_idx = (next_idx + 1) % dims
+          new_shape[next_idx] = new_shape[next_idx] * 2
+    return tuple(new_shape)

-  def limit_global_dims(self, limit, global_max):
+  def limit_global_dims(self, limit: int, global_max: List[int], local_max: List[int]):
     # sometimes, there's more dimensions than len(self.lang.gid).
     # compact all the dimensions into the first
     # NOTE: this might make multiview shapetrackers
@@ -607,8 +619,11 @@
     # Check the global allocation limit, current the global_size will be flipped during codegen
     # and then padded right with 1s if its length < 3 which makes this part a bit awkward to write
     global_dims = self.first_reduce-self.local_dims
     if global_dims > 0:
-      assert max(global_max) >= max(self.full_shape[0:global_dims]), f"device max allocation {max(self.full_shape[0:global_dims])} exceeds global dim maximum {max(global_max)}"
+      if global_max:
+        tmp = global_max[:global_dims] + (local_max[:self.local_dims] if local_max else [])
+        if max(global_max) < max(self.full_shape[:global_dims]): self.reshape_and_permute(lambda x: self._limit_size(x, tmp + [math.inf] * (len(self.full_shape)-len(tmp))), None)
+        assert max(global_max) >= max(self.full_shape[:global_dims]), f"device max allocation {max(self.full_shape[:global_dims])} exceeds global dim maximum {max(global_max)}"
       for i in range(global_dims-1):
         if self.full_shape[i] > global_max[i]:
           order = list(range(len(self.full_shape)))
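For example, with `max_size=[256, 256]` the new `_limit_size` shrink loop turns a shape `(1024, 4)` into `(256, 16)`: the oversized dimension is halved until it fits, and each freed factor of 2 is pushed onto the next dimension that still has room, so the total number of elements is preserved.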


@@ -30,9 +30,9 @@ def kernel_optimize_search(k:Linearizer, create_k:Callable[[], Linearizer], runt
       k.process()
       apply_opt(k, x)
       prg = k.codegen().build(runtime)
-      first_tm = prg.exec(k.bufs, force_wait=True)
+      first_tm = prg.exec(k.bufs, force_wait=True, optimizing=True)
       if baseline*5 < first_tm*1000: return first_tm*1000  # very slow
-      tm = min([first_tm]+[prg.exec(k.bufs, force_wait=True) for _ in range(2)])*1000
+      tm = min([first_tm]+[prg.exec(k.bufs, force_wait=True, optimizing=True) for _ in range(2)])*1000
       return tm
     except Exception:
       if DEBUG >= 3:
@@ -76,7 +76,7 @@ def kernel_optimize(k:Linearizer, create_k:Callable[[], Linearizer], runtime):
     k = create_k()
     hand_coded_optimizations(k)
     prg = k.codegen().build(runtime)
-    return min([prg.exec(k.bufs, force_wait=True) for _ in range(5)])*1000
+    return min([prg.exec(k.bufs, force_wait=True, optimizing=True) for _ in range(5)])*1000
   choice = kernel_optimize_search(k, create_k, runtime, get_baseline())
   if global_db is not None:
     global_db[skey] = choice


@@ -11,6 +11,8 @@ class WGSLLanguage(CStyleLanguage):
   gid = [f"i32(gindex.{'xyz'[x]})" for x in range(3)]
   lid = [f"i32(lindex.{'xyz'[x]})" for x in range(3)]
   size_prefix = "let"
+  global_max = [65535, 65535, 65535]
+  local_max = [256, 256, 64]
   barrier="workgroupBarrier();"
   generic_var_prefix = "var "
   external_local_bufs = True


@@ -1,5 +1,5 @@
 from __future__ import annotations
-import os, functools, platform, time, re
+import os, functools, platform, time, re, contextlib
 from weakref import KeyedRef, ref
 from _weakref import _remove_dead_weakref # type: ignore
 import numpy as np
@@ -26,7 +26,7 @@ def fromimport(mod, frm): return getattr(__import__(mod, fromlist=[frm]), frm)
 @functools.lru_cache(maxsize=None)
 def getenv(key, default=0): return type(default)(os.getenv(key, default))

-class Context:
+class Context(contextlib.ContextDecorator):
   stack: ClassVar[List[dict[str, int]]] = [{}]
   def __init__(self, **kwargs): self.kwargs = kwargs
   def __enter__(self):
@@ -52,7 +52,7 @@ class ContextVar:
 DEBUG, IMAGE = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0)
 GRAPH, PRUNEGRAPH, GRAPHPATH = getenv("GRAPH", 0), getenv("PRUNEGRAPH", 0), getenv("GRAPHPATH", "/tmp/net")

-class Timing(object):
+class Timing(contextlib.ContextDecorator):
   def __init__(self, prefix="", on_exit=None, enabled=True): self.prefix, self.on_exit, self.enabled = prefix, on_exit, enabled
   def __enter__(self): self.st = time.perf_counter_ns()
   def __exit__(self, exc_type, exc_val, exc_tb):
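Subclassing `contextlib.ContextDecorator` is what makes the `@Context(...)` decorator form (exercised by `test_decorator` above) work while keeping the `with` form intact. A self-contained sketch of the mechanism (toy class and illustrative print format, not tinygrad's exact output):

```python
import contextlib, time

class Timed(contextlib.ContextDecorator):  # toy stand-in for tinygrad's Timing
  def __init__(self, prefix=""): self.prefix = prefix
  def __enter__(self): self.st = time.perf_counter_ns()
  def __exit__(self, *exc): print(f"{self.prefix}{(time.perf_counter_ns()-self.st)*1e-6:.2f} ms")

@Timed("decorated: ")        # the same object works as a decorator...
def work(): return sum(range(10**6))

with Timed("as a block: "):  # ...and as a context manager
  work()
```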


@@ -9,6 +9,7 @@ from tinygrad.helpers import GRAPH, DEBUG, prod, getenv, DType, dtypes, flatten,
 from tinygrad.runtime.ops_cpu import RawNumpyBuffer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 from tinygrad.shape.shapetracker import MovementOps, ShapeTracker, View, get_contraction
+from tinygrad.shape.symbolic import Variable, sym_vars
 from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, TernaryOps, ReduceOps, LoadOps, OpType, LazyOp
 from tinygrad.runtime.lib import RawBufferMapped, RawConst, RawBuffer
@@ -270,6 +271,7 @@ class LazyBuffer:
   def buffers(self) -> Tuple[LazyBuffer, ...]: return (self,)
   def map_buffers(self, real_srcs: Dict[Any, Any]): return real_srcs.get(self, self)
   def get_lazyops(self) -> List[Any]: return []
+  def get_variable_buffers(self) -> Dict[Variable, LazyBuffer]: return {v:LazyBuffer.loadop(LoadOps.FROM, (1,), dtypes.int32, self.device, src=LazyBuffer.fromCPU(np.array([v.val], dtype=np.int32))) for s in self.shape for v in sym_vars(s)}

   def replace_with_movement_ops(self: LazyBuffer, ops:List[Tuple[MovementOps, Any]]) -> LazyBuffer:
     y = self
     for op, arg in ops: y = MOVEMENT_OPS_DISPATCHER[op](y, arg)


@@ -133,9 +133,9 @@ class ASTRunner:
       self.clprg = runtime(self.name, self.prg, **self.runtime_args)
     return self

-  def exec(self, bufs, force_wait=False) -> Optional[float]:
+  def exec(self, bufs, force_wait=False, optimizing=False) -> Optional[float]:
     rawbufs = dedup([x.realized for x in bufs if buf_is_kernel_arg(x)])
-    if GlobalCounters.cache is not None: GlobalCounters.cache.append((self, rawbufs))
+    if GlobalCounters.cache is not None and not optimizing: GlobalCounters.cache.append((self, rawbufs))
     return self(rawbufs, force_wait=force_wait)

   def __call__(self, rawbufs:List[RawBuffer], jit=False, force_wait=False) -> Optional[float]:


@@ -9,9 +9,11 @@ args = {
   'Linux': {'cflags':'-lm -fPIC --rtlib=compiler-rt ', 'ext':'so', 'exp':''},
   'Darwin': {'cflags':'-lm -fPIC --rtlib=compiler-rt ', 'ext':'dylib', 'exp':''}
 }[platform.system()]
+CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#define bool uchar\n'

 class ClangProgram:
   def __init__(self, name:str, prg:str, binary:bool=False):
+    prg = CLANG_PROGRAM_HEADER + prg
+    # TODO: is there a way to not write this to disk?
     fn = f"{tempfile.gettempdir()}/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{args['ext']}"
     if not binary:
-      prg = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define int64 long\n#define half __fp16\n#define uchar unsigned char\n#define bool uchar\n' + prg


@@ -17,7 +17,6 @@ if DEBUG >= 5:
  early_exec = fromimport("extra.helpers", "enable_early_exec")()
class _CL:
-  def __init__(self): self.events_in_flight = []
  def post_init(self, device=None):
    platforms: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()]) if len(y)]
    self.cl_platform = cl.get_platforms()[getenv('CL_PLATFORM', 0)]
@@ -25,8 +24,6 @@ class _CL:
    if DEBUG >= 1: print(f"using devices: {[ctx.devices[0].hashable_model_and_version_identifier for ctx in self.cl_ctxs]}")
    self.cl_queue: List[cl.CommandQueue] = [cl.CommandQueue(ctx, device=ctx.devices[0], properties=cl.command_queue_properties.PROFILING_ENABLE) for ctx in self.cl_ctxs]
  def synchronize(self):
-    for evt in self.events_in_flight: evt.wait()
-    self.events_in_flight.clear()
    for q in self.cl_queue: q.finish()
CL = _CL()
CL.post_init() if not getenv("DELAYED_RUNTIME_INIT", False) else None
@@ -43,13 +40,14 @@ class CLBuffer(RawBufferCopyInOut):
    setattr(buf, 'device', int(device))  # device is tracked on the underlying buffer
    super().__init__(size, dtype, buf)
-  def _copyin(self, x: np.ndarray):
+  def _copyin(self, x:np.ndarray):
    assert not self.dtype.name.startswith("image"), f"can't copyin images {self.dtype}"
-    CL.events_in_flight.append(cl.enqueue_copy(CL.cl_queue[self._buf.device], self._buf, np.require(x, requirements='C'), is_blocking=False))
+    self.event = cl.enqueue_copy(CL.cl_queue[self._buf.device], self._buf, np.require(x, requirements='C'), is_blocking=False)
  def _copyout(self, x:np.ndarray):
-    CL.synchronize()
    assert not self.dtype.name.startswith("image"), f"can't copyout images {self.dtype}"
-    cl.enqueue_copy(CL.cl_queue[self._buf.device], x, self._buf, is_blocking=True)
+    buf = cl.Buffer(CL.cl_ctxs[self._buf.device], cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR, 0, hostbuf=x.data)
+    mapped, event = cl.enqueue_map_buffer(CL.cl_queue[self._buf.device], buf, cl.map_flags.WRITE, 0, self.size, dtype=self.dtype.np, is_blocking=False)
+    with mapped.base: cl.enqueue_copy(CL.cl_queue[self._buf.device], mapped, self._buf, is_blocking=True, wait_for=[event])
class CLProgram: class CLProgram:
def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None): def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None):
@@ -78,7 +76,7 @@ class CLProgram:
  def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:
    cl_bufs = [x._buf if isinstance(x, CLBuffer) else x for x in bufs]
-    e = self.clprgs[cl_bufs[0].device](CL.cl_queue[cl_bufs[0].device], [g*l for g,l in zip(global_size, local_size)] if local_size is not None else global_size, local_size, *cl_bufs)
+    e = self.clprgs[cl_bufs[0].device](CL.cl_queue[cl_bufs[0].device], [g*l for g,l in zip(global_size, local_size)] if local_size is not None else global_size, local_size, *cl_bufs, wait_for=[x.event for x in bufs if isinstance(x, CLBuffer) and hasattr(x, "event")])
    if wait:
      e.wait()
      try:
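
Reviewer note: the change replaces the global events_in_flight list with one event per buffer plus explicit wait_for dependencies. A self-contained pyopencl sketch of that pattern, assuming any OpenCL device is available:

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
x = np.arange(16, dtype=np.float32)
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, x.nbytes)
copy_evt = cl.enqueue_copy(queue, buf, x, is_blocking=False)  # keep the event instead of blocking
prg = cl.Program(ctx, "__kernel void twice(__global float *a) { a[get_global_id(0)] *= 2.0f; }").build()
run_evt = prg.twice(queue, x.shape, None, buf, wait_for=[copy_evt])  # explicit dependency on the copy
cl.enqueue_copy(queue, x, buf, is_blocking=True, wait_for=[run_evt])
assert x[3] == 6.0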

View File

@@ -1,6 +1,7 @@
# pip3 install pyobjc-framework-Metal pyobjc-framework-Cocoa pyobjc-framework-libdispatch
import os, subprocess, pathlib
import Metal, Cocoa, libdispatch  # type: ignore
+from typing import List, Any
from tinygrad.codegen.cstyle import CStyleCodegen, CStyleLanguage
from tinygrad.helpers import prod, getenv, DEBUG, DType
from tinygrad.ops import Compiled
@@ -10,17 +11,13 @@ METAL_XCODE = getenv("METAL_XCODE")
class _METAL:
  def __init__(self):
+    self.mtl_buffers_in_flight: List[Any] = []
    self.device = Metal.MTLCreateSystemDefaultDevice()
-    self.dispatch_group = libdispatch.dispatch_group_create()
    self.mtl_queue = self.device.newCommandQueue()
-  def command_buffer(self):  # TODO: is there a better way to do this?
-    command_buffer = self.mtl_queue.commandBuffer()
-    libdispatch.dispatch_group_enter(self.dispatch_group)
-    def leave(_): libdispatch.dispatch_group_leave(self.dispatch_group)
-    command_buffer.addCompletedHandler_(leave)
-    return command_buffer
  def synchronize(self):
-    libdispatch.dispatch_group_wait(self.dispatch_group, libdispatch.DISPATCH_TIME_FOREVER)
+    for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
+    self.mtl_buffers_in_flight.clear()
METAL = _METAL()
class RawMetalBuffer(RawBufferMapped):
@@ -63,7 +60,7 @@ class MetalProgram:
  def __call__(self, global_size, local_size, *bufs, wait=False):
    assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(), f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}"
-    command_buffer = METAL.command_buffer()
+    command_buffer = METAL.mtl_queue.commandBuffer()
    encoder = command_buffer.computeCommandEncoder()
    encoder.setComputePipelineState_(self.pipeline_state)
    for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a._buf, 0, i)
@@ -73,6 +70,7 @@ class MetalProgram:
    if wait:
      command_buffer.waitUntilCompleted()
      return command_buffer.GPUEndTime() - command_buffer.GPUStartTime()
+    METAL.mtl_buffers_in_flight.append(command_buffer)
class MetalCodegen(CStyleCodegen):
  lang = CStyleLanguage(
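
Reviewer note: the dispatch_group machinery collapses into a plain list of in-flight command buffers. The same pattern, backend-agnostic and hedged (wait() stands in for Metal's waitUntilCompleted()):

class InFlight:
  def __init__(self): self.pending = []
  def submit(self, work): self.pending.append(work)  # track the handle instead of blocking
  def synchronize(self):
    for w in self.pending: w.wait()  # Metal equivalent: cbuf.waitUntilCompleted()
    self.pending.clear()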

View File

@@ -235,12 +235,21 @@ class ShapeTracker:
    self.views[-1] = View(new_shape, self.views[-1].strides, self.views[-1].offset, mask)
    return self
-  def reshape(self, new_shape: Tuple[int, ...]):
+  def reshape(self, new_shape: Tuple[Union[Node,int], ...]):
+    # reshape into symbolic shape, update the variable value
+    if all(isinstance(s, int) for s in self.shape) and len(new_vars:=list(s for s in new_shape if isinstance(s, Variable))) > 0:
+      assert len(new_vars) == 1, "only one variable is supported in a shape"
+      new_var, new_val = new_vars[0], prod(self.shape) // prod(s for s in new_shape if isinstance(s, int))
+      if new_var.val is None:
+        assert new_var.min <= new_val <= new_var.max, f"variable value {new_val} out of range [{new_var.min}, {new_var.max}]"
+        new_var.val = new_val
+      else: assert new_var.val == new_val, f"value conflicts, was {new_var.val}, set to {new_val}"
    if self.views[-1].shape == new_shape: return self
    assert all(is_sym_int(x) and x > 0 for x in new_shape), f"shape must be symbolic ints and can't contain 0 or negative numbers {new_shape}"
    # only check size for int shapes. we don't check symbolic here as long as the reshape itself can be done
    if all(isinstance(s, int) for s in self.shape) and all(isinstance(s, int) for s in new_shape):
-      assert prod(self.shape) == prod(new_shape), f"can't reshape {self.shape} -> {new_shape}"
+      assert prod(self.shape) == prod(new_shape), f"can't reshape {self.shape} -> {new_shape}"  # type: ignore  # mypy cannot resolve, all ints here
    new_view, extra = _reshape(self.views[-1], new_shape)
    if extra: self.views.append(new_view)
    else: self.views[-1] = new_view
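
Reviewer note: the value binding in the new symbolic-reshape branch is just a product ratio. Worked example: reshaping a concrete (6, 4) into (i, 8) with Variable i in [1, 10] must set i.val = prod(old) // prod(known ints) = 24 // 8 = 3, and a later reshape implying a different value trips the conflict assert. A plain-Python restatement of the inference:

from math import prod

def infer_var_val(old_shape, new_shape, var_min, var_max):
  known = prod(s for s in new_shape if isinstance(s, int))
  val = prod(old_shape) // known  # everything not an int is the single Variable
  assert var_min <= val <= var_max, f"variable value {val} out of range"
  return val

assert infer_var_val((6, 4), ("i", 8), 1, 10) == 3  # "i" marks the symbolic dim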

View File

@@ -9,6 +9,7 @@ from typing import List, Dict, Callable, Tuple, Type, Union, Optional, Any
# symbolic matches the Python behavior, but the code output is agnostic, and will never have negative numbers in div or mod
def is_sym_int(x: Any) -> bool: return isinstance(x, int) or isinstance(x, Node)
+def sym_vars(x: Union[Node, int]) -> List[Variable]: return [] if isinstance(x, int) else x.vars()
class Node:
  b: Union[Node, int]
@@ -134,6 +135,7 @@ class Variable(Node):
  def __init__(self, expr:Optional[str], nmin:int, nmax:int):
    self.expr, self.min, self.max = expr, nmin, nmax
+    self.val: Optional[int] = None
  def vars(self): return [self]
class NumNode(Node):
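
Reviewer note on the two small symbolic.py additions: sym_vars normalizes a shape entry so callers can ask "which Variables does this dimension mention?" without caring whether it is a plain int, and val gives a Variable a late-bound concrete value. Hedged usage, assuming Node arithmetic builds vars()-traversable expressions as elsewhere in symbolic.py:

# i = Variable("i", 1, 10)
# sym_vars(i*2 + 4)  ->  [i]   (the expression mentions one Variable)
# sym_vars(7)        ->  []    (plain ints mention none)
# i.val is None until something (e.g. a symbolic reshape) binds it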

View File

@@ -243,16 +243,18 @@ class Tensor:
  def expand(self, shape, *args) -> Tensor: return mlops.Expand.apply(self, shape=tuple([x if x != -1 else s for s,x in zip(self.shape, argfix(shape, *args))]))
  def permute(self, order, *args) -> Tensor: return mlops.Permute.apply(self, order=argfix(order, *args))
  def flip(self, axis, *args) -> Tensor: return mlops.Flip.apply(self, axis=[x if x >= 0 else x+len(self.shape) for x in argfix(axis, *args)])
-  def pad(self, arg:Tuple[Tuple[int, int], ...]) -> Tensor: return mlops.Pad.apply(self, arg=arg) if any(x != (0,0) for x in arg) else self
  def shrink(self, arg:Tuple[Tuple[int, int], ...]) -> Tensor: return mlops.Shrink.apply(self, arg=arg) if any(x != (0,s) for x,s in zip(arg, self.shape)) else self
+  def pad(self, arg: Tuple[Tuple[int, int], ...], value:float=0) -> Tensor:
+    ret = mlops.Pad.apply(self, arg=arg) if any(x != (0, 0) for x in arg) else self
+    return ret if 0 == value else ret + (value - mlops.Pad.apply(Tensor.full(self.shape, value), arg=arg))
  # ***** movement hlops *****
  # NOTE: using slice is discouraged and things should migrate to pad and shrink
-  def slice(self, arg:Sequence[Optional[Tuple[int, int]]]) -> Tensor:
+  def slice(self, arg:Sequence[Optional[Tuple[int, int]]], value:float=0) -> Tensor:
    arg_ = tuple([a if a is not None else (0,s) for s,a in zip(self.shape, arg)])
    padding = tuple([(max(0, -p[0]), max(0, p[1]-self.shape[i])) for i,p in enumerate(arg_)])
-    return self.pad(padding).shrink(tuple([(p[0] + padding[i][0], p[1] + padding[i][0]) for i,p in enumerate(arg_)]))
+    return self.pad(padding, value=value).shrink(tuple([(p[0] + padding[i][0], p[1] + padding[i][0]) for i,p in enumerate(arg_)]))
  # - Negative indices are taken relative to the end of the sequence, so X[-2] returns the 2nd-to-last element
  # - A slice i:j returns the elements with indices in [i, j)
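
Reviewer note: pad-with-value is built from zero-padding alone via ret + (value - Pad(full(value))). Inside the original region Pad(full(value)) equals value, so the correction cancels and the data survives; outside it equals 0, so the result is exactly value. A numpy check of the identity (np.pad stands in for the zero-pad primitive):

import numpy as np

x, v = np.ones((2, 2), dtype=np.float32), -1.0
pad0 = lambda a: np.pad(a, 1)  # zero padding is the only primitive used
out = pad0(x) + (v - pad0(np.full(x.shape, v, dtype=np.float32)))
assert (out == np.pad(x, 1, constant_values=v)).all()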
@@ -375,9 +377,9 @@ class Tensor:
    return self.reshape(self.shape[:dim] + (1,) + self.shape[dim:])
  # (padding_left, padding_right, padding_top, padding_bottom)
-  def pad2d(self, padding:Union[List[int], Tuple[int, ...]]):
+  def pad2d(self, padding:Union[List[int], Tuple[int, ...]], value:float=0):
    slc = [(-p0, s+p1) for p0,p1,s in zip(padding[::2], padding[1::2], self.shape[::-1])][::-1]
-    return self.slice([(0,s) for s in self.shape[:-(len(padding)//2)]] + slc)
+    return self.slice([(0,s) for s in self.shape[:-(len(padding)//2)]] + slc, value=value)
  @property
  def T(self) -> Tensor: return self.transpose()
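
Reviewer note: a typical consumer of the new value= plumbing is pooling, where borders should be -inf rather than 0 so a max ignores them. Hedged usage (requires a tinygrad install; max_pool2d as defined in tensor.py):

# t = Tensor.randn(1, 1, 4, 4)
# t.pad2d((1, 1, 1, 1), value=float("-inf")).max_pool2d(kernel_size=(2, 2))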

weights/.gitignore vendored 100644
View File

@@ -0,0 +1 @@
+*