
add check global dim limit in linearizer (#1299)

* need a better place for reshape and permute

* add permutation

* cuda fixed

* clean up

* enable nvidia GPU with global max

* fix order

* fix CI

* add check for global dim limit but need refactor

* refactor

* fix ignore
Yixiang Gao 2023-07-31 14:14:54 -04:00 committed by GitHub
parent ce0ab1c14e
commit 6e62dcfbf3
4 changed files with 17 additions and 4 deletions

.gitignore

@@ -36,4 +36,4 @@ examples/webgpu/net.safetensors
 node_modules
 package.json
 package-lock.json
-temp
+temp

@@ -20,6 +20,7 @@ class CStyleLanguage(NamedTuple):
   barrier: str = ""
   gid: List[str] = []
   lid: List[str] = []
+  global_max: List[int] = []
   extra_args: List[str] = []
   float4: Optional[str] = None
   half_prekernel: Optional[str] = None
@@ -194,7 +195,7 @@ class CStyleCodegen(Linearizer):
   def codegen(self):
     self.process()
-    #self.limit_global_dims(len(self.lang.gid)) # NOTE: this is optional now
+    if self.lang.global_max: self.limit_global_dims(len(self.lang.gid), self.lang.global_max) # NOTE: this is optional now
     self.linearize()
     prg, global_size, local_size = uops_to_cstyle(self.uops, self.lang)
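
The new codegen line makes the check opt-in: a backend declares per-dimension launch limits by setting global_max on its language, and a backend that leaves it empty keeps the old unrestricted behavior. A minimal sketch of that gating, with simplified stand-in classes and hypothetical values (not part of the commit):

```python
# Stand-ins for CStyleLanguage/codegen, just to show the opt-in gating.
from typing import List, NamedTuple

class Lang(NamedTuple):
    gid: List[str] = []
    global_max: List[int] = []  # empty list means "no device limit declared"

def codegen(lang: Lang) -> None:
    # mirrors the diff: limit_global_dims only runs when global_max is set
    if lang.global_max:
        print(f"limit to {len(lang.gid)} global dims, per-dim caps {lang.global_max}")
    else:
        print("no global_max declared, skipping the check")

codegen(Lang(gid=["gidx0", "gidx1", "gidx2"], global_max=[65535, 65535, 65535]))
codegen(Lang(gid=["gidx0"]))  # hypothetical backend with no launch-grid caps
```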

@@ -596,14 +596,25 @@ class Linearizer:
   # ******************** GPU simplifiers ********************
-  def limit_global_dims(self, limit):
+  def limit_global_dims(self, limit, global_max):
     # sometimes, there's more dimensions than len(self.lang.gid).
     # compact all the dimensions into the first
     # NOTE: this might make multiview shapetrackers
-    if limit and (self.first_reduce-self.local_dims) > limit:
+    if (self.first_reduce-self.local_dims) > limit:
       num_to_merge = ((self.first_reduce-self.local_dims) - limit)+1
       self.reshape_and_permute(lambda x: (prod(x[0:num_to_merge]),)+x[num_to_merge:], None)
       if DEBUG >= 3: print("reshaped to", self.full_shape, "due to too many global dimensions")
+    # Check the global allocation limit; currently the global_size is flipped during codegen
+    # and then padded right with 1s if its length < 3, which makes this part a bit awkward to write
+    global_dims = self.first_reduce-self.local_dims
+    if global_dims > 0:
+      assert max(global_max) >= max(self.full_shape[0:global_dims]), f"global dimension {max(self.full_shape[0:global_dims])} exceeds device maximum {max(global_max)}"
+      for i in range(global_dims-1):
+        if self.full_shape[i] > global_max[i]:
+          order = list(range(len(self.full_shape)))
+          order[i], order[global_dims-1] = order[global_dims-1], order[i]
+          self.reshape_and_permute(None, order)
+          if DEBUG >= 3: print("permuted global dims", order, "because a dimension exceeds the device limit")

   def alias_buffer(self, i, pattern):
     assert len(pattern) == len(self.sts[i].shape), f"must include a pattern for each shape {pattern} {self.sts[i].shape}"
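
To see what limit_global_dims now does, here is a standalone sketch that operates on plain lists instead of the Linearizer's shapetrackers (toy shapes and caps, not from the commit). Step 1 merges leading dimensions until at most `limit` global dims remain; step 2 swaps any dimension that overflows its per-axis cap toward the last global dim, which after the flip during codegen maps to the launch axis with the largest cap:

```python
from math import prod

def limit_global_dims_sketch(shape, limit, global_max):
    """Toy version of Linearizer.limit_global_dims for plain lists of ints."""
    shape = list(shape)
    # step 1: too many global dims -> compact the leading dims into one
    if len(shape) > limit:
        num_to_merge = (len(shape) - limit) + 1
        shape = [prod(shape[:num_to_merge])] + shape[num_to_merge:]
    # step 2: move oversized dims toward the roomiest axis (the last one)
    assert max(global_max) >= max(shape), "no permutation can fit this shape"
    for i in range(len(shape) - 1):
        if shape[i] > global_max[i]:
            shape[i], shape[-1] = shape[-1], shape[i]
    return shape

# four global dims with a huge leading one: merged down to three, then
# permuted so the 280000-sized dim lands on the axis whose cap is 2**31 - 1
print(limit_global_dims_sketch([70000, 4, 4, 4], 3, [65535, 65535, 2**31 - 1]))
# -> [4, 4, 280000]
```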

@@ -82,6 +82,7 @@ class CUDAProgram:
 class CUDACodegen(CStyleCodegen):
   lang = CStyleLanguage(
     kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4",
+    global_max = [65535, 65535, 2147483647],
     gid = [f'blockIdx.{chr(120+i)}' for i in range(3)],
     lid = [f'threadIdx.{chr(120+i)}' for i in range(3)],
     half_prekernel = """
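
The CUDA values line up with the hardware grid limits: gridDim.x can be up to 2**31 - 1 while gridDim.y and gridDim.z are capped at 65535 (compute capability 3.0 and later), and because global_size is reversed before launch, the x-axis cap sits last in the list. A one-line sanity check of that reading (an assumption based on the flip noted in the linearizer comment, not code from the commit):

```python
# assumption: reversing global_max should give CUDA's [x, y, z] grid limits
global_max = [65535, 65535, 2147483647]
assert global_max[::-1] == [2**31 - 1, 65535, 65535]
```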