hip compile speed (#2606)
parent
19a0a839db
commit
09b6e254a3
|
@ -0,0 +1,31 @@
|
|||
import random, os
|
||||
from tinygrad.helpers import Timing
|
||||
from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
|
||||
from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
|
||||
|
||||
# OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
|
||||
# AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py
|
||||
|
||||
# issue is in https://github.com/ROCm-Developer-Tools/clr/
|
||||
|
||||
if __name__ == "__main__":
  # Instantiate both devices before any compile call is made.
  HIPDevice()
  CLDevice()

  # warmup: one throwaway compile per backend. The randomized function name
  # presumably keeps any source-keyed cache from short-circuiting the compile
  # (NOTE(review): confirm against the compile_* wrappers).
  kernel_name = f"none{random.randint(0, 1000000)}"
  warmup_src = f"void {kernel_name}() {{}}"
  compile_cl.__wrapped__(warmup_src)
  print("compile cl warmed up")
  compile_hip.__wrapped__(warmup_src)
  print("compile hip warmed up")

  print("**** benchmark ****")
  # A fresh random name so the timed source differs from the warmup source.
  kernel_name = f"none{random.randint(0, 1000000)}"
  bench_src = f"void {kernel_name}() {{}}"
  # this uses AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, then it links the lib on the next step
  with Timing("compile cl: "):
    compile_cl.__wrapped__(bench_src)
  # this uses AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, much slower
  with Timing("compile hip: "):
    compile_hip.__wrapped__(bench_src)
  # Hard exit: os._exit ends the process immediately, skipping normal interpreter shutdown.
  os._exit(0)
|
||||
|
||||
|
||||
|
|
@ -56,18 +56,7 @@ def to_function_name(s:str): return ''.join([c if c in (string.ascii_letters+str
|
|||
@functools.lru_cache(maxsize=None)
def getenv(key:str, default=0):
  """Read environment variable ``key`` and coerce it to the type of ``default``.

  When the variable is unset, ``default`` itself is passed through the same
  coercion. Results are memoized per ``(key, default)``, so a later change to
  the environment is not observed for arguments that were already looked up.
  """
  raw_value = os.getenv(key, default)
  target_type = type(default)
  return target_type(raw_value)
|
||||
def temp(x:str) -> str:
  """Return the POSIX-style path of ``x`` placed inside the system temporary directory."""
  tmp_root = tempfile.gettempdir()
  return pathlib.Path(tmp_root, x).as_posix()
|
||||
def from_mv(mv, to_type=ctypes.c_char):
  """Return a ``ctypes.POINTER(to_type)`` aimed at the start of the buffer ``mv``.

  ``mv`` must be a writable buffer (a requirement of ``from_buffer``). The
  pointer is built from a plain integer address, so it holds no reference to
  ``mv``; the caller has to keep ``mv`` alive while the pointer is used.
  """
  base_addr = ctypes.addressof(to_type.from_buffer(mv))
  return ctypes.cast(base_addr, ctypes.POINTER(to_type))
|
||||
def to_char_p_p(options: List[bytes], to_type=ctypes.c_char):
  """Pack ``options`` into a ctypes array of ``POINTER(to_type)``.

  Each bytes object is copied into its own NUL-terminated string buffer, and
  the returned array holds one pointer per buffer, in the same order.
  """
  ptr_type = ctypes.POINTER(to_type)
  entries = [ctypes.cast(ctypes.create_string_buffer(opt), ptr_type) for opt in options]
  return (ptr_type * len(options))(*entries)
|
||||
@functools.lru_cache(maxsize=None)
def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):
  """Build a byte-packed ``ctypes.Structure`` subclass with the given ``fields``.

  ``fields`` is a tuple of ``(name, ctype)`` pairs. The result is memoized, so
  an equal ``fields`` tuple always yields the very same class object.
  """
  class CStruct(ctypes.Structure):
    # _pack_ = 1 removes padding between members.
    _pack_ = 1
    _fields_ = fields
  return CStruct
|
||||
def init_c_var(ctypes_var, creat_cb):
  """Run ``creat_cb`` on ``ctypes_var`` and hand the same object back.

  Lets a ctypes out-parameter be created, filled in, and used within a single
  expression; whatever ``creat_cb`` returns is discarded.
  """
  creat_cb(ctypes_var)
  return ctypes_var
|
||||
def get_bytes(arg, get_sz, get_str, check) -> bytes:
  """Fetch a variable-length byte string through a size-then-data call pair.

  ``get_sz(arg, byref(size))`` is called first and is expected to fill in a
  ``c_size_t``; ``get_str(arg, buffer)`` is then called with a buffer of that
  many bytes. The return value of each call is passed to ``check``. Exactly
  ``size`` bytes of the buffer are returned.
  """
  size = ctypes.c_size_t()
  check(get_sz(arg, ctypes.byref(size)))
  out_buf = ctypes.create_string_buffer(size.value)
  check(get_str(arg, out_buf))
  return ctypes.string_at(out_buf, size=size.value)
|
||||
def flat_mv(mv:memoryview):
  """Return ``mv`` viewed as a one-dimensional sequence of unsigned bytes.

  A memoryview whose ``len`` is 0 is returned unchanged rather than cast.
  """
  return mv if len(mv) == 0 else mv.cast("B", shape=(mv.nbytes,))
|
||||
|
||||
class Context(contextlib.ContextDecorator):
|
||||
stack: ClassVar[List[dict[str, int]]] = [{}]
|
||||
def __init__(self, **kwargs): self.kwargs = kwargs
|
||||
|
@ -274,6 +263,21 @@ def cpu_time_execution(cb, enable):
|
|||
cb()
|
||||
if enable: return time.perf_counter()-st
|
||||
|
||||
# *** ctypes helpers
|
||||
|
||||
def from_mv(mv, to_type=ctypes.c_char):
  """Return a ``ctypes.POINTER(to_type)`` aimed at the start of the buffer ``mv``.

  ``mv`` must be a writable buffer (a requirement of ``from_buffer``). The
  pointer is built from a plain integer address, so it holds no reference to
  ``mv``; the caller has to keep ``mv`` alive while the pointer is used.
  """
  base_addr = ctypes.addressof(to_type.from_buffer(mv))
  return ctypes.cast(base_addr, ctypes.POINTER(to_type))
|
||||
def to_char_p_p(options: List[bytes], to_type=ctypes.c_char):
  """Pack ``options`` into a ctypes array of ``POINTER(to_type)``.

  Each bytes object is copied into its own NUL-terminated string buffer, and
  the returned array holds one pointer per buffer, in the same order.
  """
  ptr_type = ctypes.POINTER(to_type)
  entries = [ctypes.cast(ctypes.create_string_buffer(opt), ptr_type) for opt in options]
  return (ptr_type * len(options))(*entries)
|
||||
@functools.lru_cache(maxsize=None)
def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):
  """Build a byte-packed ``ctypes.Structure`` subclass with the given ``fields``.

  ``fields`` is a tuple of ``(name, ctype)`` pairs. The result is memoized, so
  an equal ``fields`` tuple always yields the very same class object.
  """
  class CStruct(ctypes.Structure):
    # _pack_ = 1 removes padding between members.
    _pack_ = 1
    _fields_ = fields
  return CStruct
|
||||
def init_c_var(ctypes_var, creat_cb):
  """Run ``creat_cb`` on ``ctypes_var`` and hand the same object back.

  Lets a ctypes out-parameter be created, filled in, and used within a single
  expression; whatever ``creat_cb`` returns is discarded.
  """
  creat_cb(ctypes_var)
  return ctypes_var
|
||||
def get_bytes(arg, get_sz, get_str, check) -> bytes:
  """Fetch a variable-length byte string through a size-then-data call pair.

  ``get_sz(arg, byref(size))`` is called first and is expected to fill in a
  ``c_size_t``; ``get_str(arg, buffer)`` is then called with a buffer of that
  many bytes. The return value of each call is passed to ``check``. Exactly
  ``size`` bytes of the buffer are returned.
  """
  size = ctypes.c_size_t()
  check(get_sz(arg, ctypes.byref(size)))
  out_buf = ctypes.create_string_buffer(size.value)
  check(get_str(arg, out_buf))
  return ctypes.string_at(out_buf, size=size.value)
|
||||
def flat_mv(mv:memoryview):
  """Return ``mv`` viewed as a one-dimensional sequence of unsigned bytes.

  A memoryview whose ``len`` is 0 is returned unchanged rather than cast.
  """
  return mv if len(mv) == 0 else mv.cast("B", shape=(mv.nbytes,))
|
||||
|
||||
# *** Helpers for CUDA-like APIs.
|
||||
|
||||
def pretty_ptx(s):
|
||||
|
|
|
@ -64,7 +64,7 @@ class HIPAllocator(LRUAllocator):
|
|||
|
||||
class HIPDevice(Compiled):
|
||||
default_arch_name = "gfx1100"
|
||||
def __init__(self, device:str):
|
||||
def __init__(self, device:str=""):
|
||||
self.device = int(device.split(":")[1]) if ":" in device else 0
|
||||
if self.device == 0 and not MOCKHIP: HIPDevice.default_arch_name = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode()
|
||||
|
||||
|
|
Loading…
Reference in New Issue