# tinygrab/tinygrad/runtime/ops_webgpu.py

from wgpu.utils.device import get_default_device
from tinygrad.device import Compiled, Allocator
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.renderer.cstyle import WGSLRenderer
import wgpu
wgpu_device = get_default_device()


class WebGPUProgram:
    """
    A compiled WebGPU compute program: stores the kernel name, the rendered WGSL source, and the compiled shader module.

    Attributes:
        name (str): Name of the kernel; used as the shader entry point when dispatching.
        lib (bytes): The rendered WGSL source of the kernel.
        prg: The compiled shader module returned by wgpu_device.create_shader_module().
    """

    def __init__(self, name: str, lib: bytes):
        """
        Compile a WGSL kernel into a shader module.

        Args:
            name (str): Name of the kernel; must match the entry point in the WGSL source.
            lib (bytes): The rendered WGSL source of the kernel.

        Note:
            create_shader_module() is the compile step for this backend; the compiler
            passed to WebGpuDevice below is just the identity function.
        """
        self.name, self.lib, self.prg = (
            name,
            lib,
            wgpu_device.create_shader_module(code=lib),
        )  # NOTE: this is the compiler

    def __call__(self, *bufs, global_size, local_size, vals=(), wait=False):
        """
        Dispatch the compiled kernel.

        Args:
            *bufs: GPU buffers to bind, in binding-slot order.
            global_size (tuple): Number of workgroups to dispatch along x, y, z.
            local_size (tuple): Workgroup size; unused at dispatch time because WGSL fixes
                it in the shader via @workgroup_size.
            vals (tuple, optional): Extra scalar arguments; accepted for interface
                compatibility but unused here. Defaults to ().
            wait (bool, optional): Whether to wait for completion before returning;
                currently ignored. Defaults to False.
        """
        assert len(bufs) <= 8, "WEBGPU only supports 8 buffers"
        binding_layouts = [
            {
                "binding": i,
                "visibility": wgpu.ShaderStage.COMPUTE,
                "buffer": {"type": wgpu.BufferBindingType.storage},
            }
            for i in range(len(bufs))
        ]
        bindings = [
            {"binding": i, "resource": {"buffer": x, "offset": 0, "size": x.size}}
            for i, x in enumerate(bufs)
        ]
        bind_group_layout = wgpu_device.create_bind_group_layout(
            entries=binding_layouts
        )
        pipeline_layout = wgpu_device.create_pipeline_layout(
            bind_group_layouts=[bind_group_layout]
        )
        bind_group = wgpu_device.create_bind_group(
            layout=bind_group_layout, entries=bindings
        )
        compute_pipeline = wgpu_device.create_compute_pipeline(
            layout=pipeline_layout,
            compute={"module": self.prg, "entry_point": self.name},
        )
        command_encoder = wgpu_device.create_command_encoder()
        compute_pass = command_encoder.begin_compute_pass()
        compute_pass.set_pipeline(compute_pipeline)
        compute_pass.set_bind_group(0, bind_group, [], 0, 999999)  # last 2 not used
        compute_pass.dispatch_workgroups(*global_size)  # x y z
        compute_pass.end()
        wgpu_device.queue.submit([command_encoder.finish()])
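

# Shape of a kernel this runner expects (illustrative, hand-written; real kernels come
# from WGSLRenderer and look different): every buffer is bound as a read_write storage
# buffer in @group(0), and the entry point name must equal WebGPUProgram.name.
#
#   @group(0) @binding(0) var<storage, read_write> data0: array<f32>;
#   @compute @workgroup_size(1)
#   fn add_one(@builtin(global_invocation_id) gid: vec3<u32>) {
#       data0[gid.x] = data0[gid.x] + 1.0;
#   }
#
# Compiling and dispatching it (with a hypothetical buffer `buf` from WebGpuAllocator):
#
#   prg = WebGPUProgram("add_one", wgsl_source)
#   prg(buf, global_size=(4, 1, 1), local_size=(1, 1, 1))

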
class WebGpuAllocator(Allocator):
    """
    Allocator that creates WebGPU storage buffers and copies data between host and device.
    """

    def _alloc(self, size: int):
        """
        Allocate a GPU buffer usable as a storage buffer and as a copy source/destination.

        Args:
            size (int): Size of the buffer in bytes.

        Returns:
            The GPUBuffer created by wgpu_device.create_buffer().
        """
        return wgpu_device.create_buffer(
            size=size,
            usage=wgpu.BufferUsage.STORAGE
            | wgpu.BufferUsage.COPY_DST
            | wgpu.BufferUsage.COPY_SRC,
        )

    def copyin(self, dest, src: memoryview):
        """
        Upload host data into a GPU buffer.

        Args:
            dest: Destination GPU buffer.
            src (memoryview): Host data to upload.
        """
        wgpu_device.queue.write_buffer(dest, 0, src)

    def copyout(self, dest: memoryview, src):
        """
        Download a GPU buffer into host memory.

        Args:
            dest (memoryview): Host destination.
            src: Source GPU buffer.

        Note:
            queue.read_buffer() returns its own copy, so this performs one extra copy
            that should eventually be removed.
        """
        dest[:] = wgpu_device.queue.read_buffer(src, 0)  # TODO: remove this copy
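

# Round trip through the allocator (illustrative sketch; `data` is a hypothetical
# bytes-like object whose length is a multiple of 4, as write_buffer requires):
#
#   allocator = WebGpuAllocator()
#   buf = allocator._alloc(len(data))
#   allocator.copyin(buf, memoryview(data))
#   out = memoryview(bytearray(len(data)))
#   allocator.copyout(out, buf)

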
class WebGpuDevice(Compiled):
    """
    The WEBGPU backend: wires together the allocator, linearizer options, WGSL renderer,
    identity compiler, and WebGPUProgram runtime.
    """

    def __init__(self, device: str):
        """
        Initialize the WEBGPU device.

        Args:
            device (str): Device identifier string.

        Note:
            The compiler argument is the identity function because WebGPUProgram compiles
            the WGSL source itself via create_shader_module().
        """
        super().__init__(
            WebGpuAllocator(),
            LinearizerOptions(
                device="WEBGPU",
                supports_float4=False,
                local_max=[256, 256, 64],
                global_max=[65535, 65535, 65535],
            ),
            WGSLRenderer,
            lambda x: x,
            WebGPUProgram,
        )
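

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original module). It assumes a
    # working wgpu backend and uses a hand-written WGSL kernel instead of WGSLRenderer
    # output: add 1.0 to each of four floats on the GPU.
    import struct

    wgsl_source = """
    @group(0) @binding(0) var<storage, read_write> data0: array<f32>;
    @compute @workgroup_size(1)
    fn add_one(@builtin(global_invocation_id) gid: vec3<u32>) {
        data0[gid.x] = data0[gid.x] + 1.0;
    }
    """
    allocator = WebGpuAllocator()
    buf = allocator._alloc(4 * 4)
    allocator.copyin(buf, memoryview(struct.pack("4f", 1.0, 2.0, 3.0, 4.0)))
    prg = WebGPUProgram("add_one", wgsl_source)
    prg(buf, global_size=(4, 1, 1), local_size=(1, 1, 1))
    out = memoryview(bytearray(4 * 4))
    allocator.copyout(out, buf)
    print(struct.unpack("4f", out))  # expected: (2.0, 3.0, 4.0, 5.0)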