# Source: tinygrab (fork of tinygrad) — examples/handcode_resnet50_opt.py
# 86 lines, 3.0 KiB, Python (captured from a GitHub file view)
# Benchmark kernel-optimization strategies for a ResNet50 forward pass.
#
# For every compute kernel the model schedules, this script tries hand-coded
# optimizations, tensor cores (when the op supports them), and an optional
# beam search (BEAM env var), times each candidate, and reports the fastest
# per kernel plus an aggregate runtime / GFLOPS total.
from typing import List
from extra.models.resnet import ResNet50
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps
from tinygrad.device import Device, Compiled
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.features.search import time_linearizer, beam_search, bufs_from_lin
from tinygrad.helpers import ansilen, DEBUG, getenv
from tinygrad.lazy import vars_from_ast
from tinygrad.shape.symbolic import sym_infer

if __name__ == "__main__":
  model = ResNet50()
  seen_buffers = set()

  # the device we are optimizing for
  dev: Compiled = Device[Device.DEFAULT]
  print(f"optimizing for {Device.DEFAULT}")

  # a first forward pass initializes the weights; their schedule items are
  # recorded in seen_buffers so the next pass can skip them
  model(Tensor.empty(64, 3, 224, 224)).lazydata.schedule(seen_buffers)

  # a second pass schedules only what is new: the actual model kernels
  # (LoadOps are data movement, not compute, so they are filtered out)
  inp = Tensor.empty(64, 3, 224, 224)
  sched = [item for item in model(inp).lazydata.schedule(seen_buffers) if item.ast.op not in LoadOps]

  # optionally focus on a single kernel, selected by index via KERNEL
  kernel_idx = getenv("KERNEL", -1)
  if kernel_idx >= 0: sched = sched[kernel_idx:kernel_idx + 1]

  # work through the schedule, accumulating time and flops over all kernels
  total_tm, running_gflops = 0, 0
  for i, item in enumerate(sched):
    raw_buffers = bufs_from_lin(Linearizer(item.ast))

    # "linearize" the op into uops in several different ways
    candidates: List[Linearizer] = []

    # hand coded optimizations are always tried
    lin = Linearizer(item.ast, dev.linearizer_opts)
    lin.hand_coded_optimizations()
    candidates.append(lin)

    # tensor cores, if this op can use them
    lin = Linearizer(item.ast, dev.linearizer_opts)
    if lin.apply_tensor_cores(): candidates.append(lin)

    # beam search, if requested via BEAM
    if getenv("BEAM"):
      lin = beam_search(Linearizer(item.ast, dev.linearizer_opts), raw_buffers, getenv("BEAM"), bool(getenv("BEAM_ESTIMATE", 1)))
      candidates.append(lin)

    # time every candidate, collecting (runtime, gflops, linearized kernel)
    choices = []
    for lin in candidates:
      tm = time_linearizer(lin, raw_buffers, allow_test_size=False, cnt=10)
      # symbolic flop counts are evaluated at the minimum of each free variable
      gflops = sym_infer(lin.info.flops, {k: k.min for k in vars_from_ast(lin.ast)}) * 1e-9 / tm
      choices.append((tm, gflops, lin.linearize()))

      # print all candidates when debugging
      if DEBUG >= 1:
        print(
            f" kernel {i:2d} {lin.name+' '*(37-ansilen(lin.name))} {str(lin.global_size):18s} {str(lin.local_size):12s} takes {tm*1000:7.2f} ms, {gflops:6.0f} GFLOPS"
        )

    # the fastest candidate wins
    tm, gflops, lin = min(choices, key=lambda c: c[0])
    print(
        f"*** {total_tm*1000:7.2f} ms : kernel {i:2d} {lin.name+' '*(37-ansilen(lin.name))} {str(lin.global_size):18s} {str(lin.local_size):12s} takes {tm*1000:7.2f} ms, {gflops:6.0f} GFLOPS"
    )
    total_tm += tm
    running_gflops += gflops * tm
  print(
      f"******* total {total_tm*1000:.2f} ms, {running_gflops/total_tm:6.0f} GFLOPS"
  )