1
0
Fork 0

hip compile speed (#2606)

pull/2591/merge
George Hotz 2023-12-04 13:47:40 -08:00 committed by GitHub
parent 19a0a839db
commit 09b6e254a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 13 deletions

View File

@ -0,0 +1,31 @@
import random, os
from tinygrad.helpers import Timing
from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
# OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
# AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py
# issue is in https://github.com/ROCm-Developer-Tools/clr/
if __name__ == "__main__":
  # construct both devices before any compilation is attempted
  HIPDevice()
  CLDevice()

  # call the undecorated compile functions through __wrapped__
  # NOTE(review): presumably this bypasses a cache on the decorated versions -- confirm
  raw_compile_cl = compile_cl.__wrapped__
  raw_compile_hip = compile_hip.__wrapped__

  # warmup: one throwaway compile per backend of an empty, randomly named kernel
  name = "none"+str(random.randint(0, 1000000))
  src = f"void {name}() {{}}"
  raw_compile_cl(src)
  print("compile cl warmed up")
  raw_compile_hip(src)
  print("compile hip warmed up")

  print("**** benchmark ****")
  # a fresh random name, so the timed source differs from the warmup source
  name = "none"+str(random.randint(0, 1000000))
  src = f"void {name}() {{}}"
  # this uses AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, then it links the lib on the next step
  with Timing("compile cl: "):
    raw_compile_cl(src)
  # this uses AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, much slower
  with Timing("compile hip: "):
    raw_compile_hip(src)

  # hard exit: leaves the process immediately, skipping normal interpreter shutdown
  os._exit(0)

View File

@ -56,18 +56,7 @@ def to_function_name(s:str): return ''.join([c if c in (string.ascii_letters+str
@functools.lru_cache(maxsize=None)
def getenv(key:str, default=0):
  """Read environment variable `key` and convert it to the type of `default`.

  When the variable is unset, `default` itself is passed through the same conversion.
  Results are memoized per (key, default), so later changes to the environment are not seen.
  """
  raw = os.getenv(key, default)
  caster = type(default)
  return caster(raw)
def temp(x:str) -> str:
  """Return the POSIX-style path of `x` placed inside the system temporary directory."""
  tmp_root = pathlib.Path(tempfile.gettempdir())
  return (tmp_root / x).as_posix()
def from_mv(mv, to_type=ctypes.c_char):
  """Return a ctypes pointer (to `to_type`) aimed at the start of the buffer behind `mv`.

  `mv` has to be writable, because `from_buffer` is used to find its address.
  The pointer shares memory with `mv`; nothing is copied.
  """
  base_addr = ctypes.addressof(to_type.from_buffer(mv))
  return ctypes.cast(base_addr, ctypes.POINTER(to_type))
def to_char_p_p(options: List[bytes], to_type=ctypes.c_char):
  """Build a ctypes array of pointers, one per entry of `options`.

  Each entry is copied into its own NUL-terminated string buffer, and the array element is a
  pointer (to `to_type`) at that buffer.
  """
  ptr_t = ctypes.POINTER(to_type)
  entries = [ctypes.cast(ctypes.create_string_buffer(opt), ptr_t) for opt in options]
  array_t = ptr_t * len(options)
  return array_t(*entries)
@functools.lru_cache(maxsize=None)
def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):
  """Create a ctypes.Structure subclass with the given fields, packed to 1-byte alignment.

  The result is memoized, so an equal `fields` tuple always yields the same class object.
  """
  class CStruct(ctypes.Structure):
    # no padding between members
    _pack_ = 1
    _fields_ = fields
  return CStruct
def init_c_var(ctypes_var, creat_cb):
  """Run `creat_cb(ctypes_var)` for its side effect, then return `ctypes_var`.

  The callback's own return value is discarded.
  """
  creat_cb(ctypes_var)
  return ctypes_var
def get_bytes(arg, get_sz, get_str, check) -> bytes:
  """Fetch a variable-length byte blob through a two-step "size, then data" API.

  `get_sz(arg, size_ref)` is called first to fill in a c_size_t. Then `get_str(arg, buffer)` is
  called with a buffer of exactly that many bytes. Each call's return value is passed to `check`.
  Returns the buffer contents as bytes, including any embedded NULs.
  """
  # step 1: ask how many bytes are needed
  sz = ctypes.c_size_t()
  check(get_sz(arg, ctypes.byref(sz)))
  # step 2: have the callee fill a buffer of that size
  buf = ctypes.create_string_buffer(sz.value)
  check(get_str(arg, buf))
  return ctypes.string_at(buf, size=sz.value)
def flat_mv(mv:memoryview):
  """Return `mv` viewed as a one-dimensional run of unsigned bytes.

  A zero-length view is returned as-is; otherwise the result has format "B" and shape (nbytes,).
  """
  return mv if len(mv) == 0 else mv.cast("B", shape=(mv.nbytes,))
class Context(contextlib.ContextDecorator):
stack: ClassVar[List[dict[str, int]]] = [{}]
def __init__(self, **kwargs): self.kwargs = kwargs
@ -274,6 +263,21 @@ def cpu_time_execution(cb, enable):
cb()
if enable: return time.perf_counter()-st
# *** ctypes helpers
def from_mv(mv, to_type=ctypes.c_char):
  """Return a ctypes pointer (to `to_type`) aimed at the start of the buffer behind `mv`.

  `mv` has to be writable, because `from_buffer` is used to find its address.
  The pointer shares memory with `mv`; nothing is copied.
  """
  base_addr = ctypes.addressof(to_type.from_buffer(mv))
  return ctypes.cast(base_addr, ctypes.POINTER(to_type))
def to_char_p_p(options: List[bytes], to_type=ctypes.c_char):
  """Build a ctypes array of pointers, one per entry of `options`.

  Each entry is copied into its own NUL-terminated string buffer, and the array element is a
  pointer (to `to_type`) at that buffer.
  """
  ptr_t = ctypes.POINTER(to_type)
  entries = [ctypes.cast(ctypes.create_string_buffer(opt), ptr_t) for opt in options]
  array_t = ptr_t * len(options)
  return array_t(*entries)
@functools.lru_cache(maxsize=None)
def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):
  """Create a ctypes.Structure subclass with the given fields, packed to 1-byte alignment.

  The result is memoized, so an equal `fields` tuple always yields the same class object.
  """
  class CStruct(ctypes.Structure):
    # no padding between members
    _pack_ = 1
    _fields_ = fields
  return CStruct
def init_c_var(ctypes_var, creat_cb):
  """Run `creat_cb(ctypes_var)` for its side effect, then return `ctypes_var`.

  The callback's own return value is discarded.
  """
  creat_cb(ctypes_var)
  return ctypes_var
def get_bytes(arg, get_sz, get_str, check) -> bytes:
  """Fetch a variable-length byte blob through a two-step "size, then data" API.

  `get_sz(arg, size_ref)` is called first to fill in a c_size_t. Then `get_str(arg, buffer)` is
  called with a buffer of exactly that many bytes. Each call's return value is passed to `check`.
  Returns the buffer contents as bytes, including any embedded NULs.
  """
  # step 1: ask how many bytes are needed
  sz = ctypes.c_size_t()
  check(get_sz(arg, ctypes.byref(sz)))
  # step 2: have the callee fill a buffer of that size
  buf = ctypes.create_string_buffer(sz.value)
  check(get_str(arg, buf))
  return ctypes.string_at(buf, size=sz.value)
def flat_mv(mv:memoryview):
  """Return `mv` viewed as a one-dimensional run of unsigned bytes.

  A zero-length view is returned as-is; otherwise the result has format "B" and shape (nbytes,).
  """
  return mv if len(mv) == 0 else mv.cast("B", shape=(mv.nbytes,))
# *** Helpers for CUDA-like APIs.
def pretty_ptx(s):

View File

@ -64,7 +64,7 @@ class HIPAllocator(LRUAllocator):
class HIPDevice(Compiled):
default_arch_name = "gfx1100"
def __init__(self, device:str):
def __init__(self, device:str=""):
self.device = int(device.split(":")[1]) if ":" in device else 0
if self.device == 0 and not MOCKHIP: HIPDevice.default_arch_name = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode()