hip compile speed (#2606)

2023-12-04 13:47:40 -08:00 · 2023-12-04 13:47:40 -08:00 · 09b6e254a3
parent 19a0a839db
commit 09b6e254a3
3 changed files with 48 additions and 13 deletions
--- a/test/external/external_benchmark_hip_compile.py
+++ b/test/external/external_benchmark_hip_compile.py
@ -0,0 +1,31 @@
+import random, os
+from tinygrad.helpers import Timing
+from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
+from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
+
+# OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
+# AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py
+
+# NOTE: the compile-speed issue this benchmark demonstrates is (per the author) in ROCm's clr repo: https://github.com/ROCm-Developer-Tools/clr/
+
+if __name__ == "__main__":  # script entry point: time one CL compile against one HIP compile of an empty function
+  HIPDevice()  # no-arg construction relies on the device:str="" default; instance is discarded (presumably created for runtime init -- confirm)
+  CLDevice()  # instance is discarded as well (presumably created for runtime init -- confirm)
+
+  # warmup: one untimed compile per backend before the timed section
+  name = "none"+str(random.randint(0, 1000000))  # random suffix gives each run a different function name
+  compile_cl.__wrapped__(f"void {name}() {{}}")  # {{}} renders as a literal {}; __wrapped__ calls the undecorated function (presumably to bypass a compile cache -- confirm)
+  print("compile cl warmed up")
+  compile_hip.__wrapped__(f"void {name}() {{}}")  # same empty-function source, HIP path
+  print("compile hip warmed up")
+
+  print("**** benchmark ****")
+  name = "none"+str(random.randint(0, 1000000))  # fresh random name for the timed compiles (both backends share it)
+  # this uses AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, then it links the lib on the next step
+  with Timing("compile cl:  "): compile_cl.__wrapped__(f"void {name}() {{}}")  # Timing is the tinygrad.helpers context manager; the string is its label
+  # this uses AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, much slower
+  with Timing("compile hip: "): compile_hip.__wrapped__(f"void {name}() {{}}")
+  os._exit(0)  # exits immediately without cleanup handlers; NOTE(review): os._exit also skips stdio flushing, so output may be lost when stdout is piped
+
+
+
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@ -56,18 +56,7 @@ def to_function_name(s:str): return ''.join([c if c in (string.ascii_letters+str
@functools.lru_cache(maxsize=None)
 def getenv(key:str, default=0): return type(default)(os.getenv(key, default))
 def temp(x:str) -> str: return (pathlib.Path(tempfile.gettempdir()) / x).as_posix()
-def from_mv(mv, to_type=ctypes.c_char): return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type))
-def to_char_p_p(options: List[bytes], to_type=ctypes.c_char): return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options])
-@functools.lru_cache(maxsize=None)
-def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):
-  class CStruct(ctypes.Structure):
-    _pack_, _fields_ = 1, fields
-  return CStruct
-def init_c_var(ctypes_var, creat_cb): return (creat_cb(ctypes_var), ctypes_var)[1]
-def get_bytes(arg, get_sz, get_str, check) -> bytes: return (sz := init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x)))), ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value))[1]
-def flat_mv(mv:memoryview):
-  if len(mv) == 0: return mv
-  return mv.cast("B", shape=(mv.nbytes,))
+
 class Context(contextlib.ContextDecorator):
  stack: ClassVar[List[dict[str, int]]] = [{}]
  def __init__(self, **kwargs): self.kwargs = kwargs
@ -274,6 +263,21 @@ def cpu_time_execution(cb, enable):
  cb()
  if enable: return time.perf_counter()-st

+# *** ctypes helpers
+
+def from_mv(mv, to_type=ctypes.c_char): return ctypes.cast(ctypes.addressof(to_type.from_buffer(mv)), ctypes.POINTER(to_type))  # POINTER(to_type) at the address of mv's buffer, no copy; from_buffer needs a writable buffer. NOTE(review): result is built from a raw address, so presumably callers must keep mv alive -- confirm
+def to_char_p_p(options: List[bytes], to_type=ctypes.c_char): return (ctypes.POINTER(to_type) * len(options))(*[ctypes.cast(ctypes.create_string_buffer(o), ctypes.POINTER(to_type)) for o in options])  # ctypes array of len(options) POINTER(to_type); each entry points at a new create_string_buffer copy of one option
+@functools.lru_cache(maxsize=None)  # memoized on `fields`: an equal fields tuple returns the same class object
+def init_c_struct_t(fields: Tuple[Tuple[str, ctypes._SimpleCData], ...]):  # build a ctypes.Structure subclass from (name, ctype) pairs; fields must be hashable for the cache
+  class CStruct(ctypes.Structure):
+    _pack_, _fields_ = 1, fields  # _pack_ = 1 sets the structure's field packing to 1 byte
+  return CStruct  # returns the class itself, not an instance
+def init_c_var(ctypes_var, creat_cb): return (creat_cb(ctypes_var), ctypes_var)[1]  # call creat_cb(ctypes_var) first, discard its result, then return ctypes_var itself
+def get_bytes(arg, get_sz, get_str, check) -> bytes: return (sz := init_c_var(ctypes.c_size_t(), lambda x: check(get_sz(arg, ctypes.byref(x)))), ctypes.string_at(init_c_var(ctypes.create_string_buffer(sz.value), lambda x: check(get_str(arg, x))), size=sz.value))[1]  # two-step query: get_sz(arg, byref(sz)) writes the size, get_str(arg, buf) fills a buffer of sz.value bytes; both return values are passed to check(); returns sz.value bytes read from the buffer
+def flat_mv(mv:memoryview):  # return mv viewed as a flat 1-D memoryview of unsigned bytes
+  if len(mv) == 0: return mv  # a view with len 0 is returned unchanged, without casting
+  return mv.cast("B", shape=(mv.nbytes,))  # format "B" is unsigned byte; resulting shape is (mv.nbytes,)
+
 # *** Helpers for CUDA-like APIs.

 def pretty_ptx(s):
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@ -64,7 +64,7 @@ class HIPAllocator(LRUAllocator):

 class HIPDevice(Compiled):
  default_arch_name = "gfx1100"
-  def __init__(self, device:str):
+  def __init__(self, device:str=""):
    self.device = int(device.split(":")[1]) if ":" in device else 0
    if self.device == 0 and not MOCKHIP: HIPDevice.default_arch_name = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode()