Add pylint trailing whitespace rule (#1314)

2023-07-21 10:37:55 -07:00 · 2023-07-21 10:37:55 -07:00 · b112edd2c3
parent bfbb8d3d0f
commit b112edd2c3
12 changed files with 103 additions and 103 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,7 +23,7 @@ jobs:
    - name: Repo line count
      run: python3 sz.py
    - name: Lint with pylint
-      run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' **/*.py
+      run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' **/*.py
    - name: Lint with flake8
      run: flake8 tinygrad/ --indent-size=2 --select=F,E112,E113,E203,E304,E502,E702,E703,E71,E72,E731,W191,W6 --statistics -j4
    - name: Lint tinygrad with pylint
--- a/examples/compile_efficientnet.py
+++ b/examples/compile_efficientnet.py
@@ -69,7 +69,7 @@ if __name__ == "__main__":
  # the functions
  cprog += list(functions.values())

-  # the net 
+  # the net
  cprog += ["void net() {"] + [f"{name}({', '.join(args)});" for (name, args, _global_size) in statements] + ["}"]

  cprog += ["""
--- a/examples/deep_deterministic_policy_gradient.py
+++ b/examples/deep_deterministic_policy_gradient.py
@@ -114,8 +114,8 @@ class DeepDeterministicPolicyGradient:
      noise_stddev: The standard deviation of the exploration noise.

  Note:
-      In contrast to the original paper, actions are already included in the first layer 
-      of the Critic and we use a Gaussian distribution instead of an Ornstein Uhlenbeck 
+      In contrast to the original paper, actions are already included in the first layer
+      of the Critic and we use a Gaussian distribution instead of an Ornstein Uhlenbeck
      process for exploration noise.

  """
@@ -203,7 +203,7 @@ class DeepDeterministicPolicyGradient:
        next_state_batch,
        done_batch,
    ) = self.memory.sample()
- 
+
    target_actions = self.target_actor.forward(next_state_batch, self.max_action)
    y = reward_batch + self.gamma * self.target_critic.forward(
        next_state_batch, target_actions.detach()
--- a/examples/yolov8-onnx.py
+++ b/examples/yolov8-onnx.py
@@ -7,7 +7,7 @@ from tinygrad.tensor import Tensor

 os.chdir("/tmp")
 if not os.path.isfile("yolov8n-seg.onnx"):
-  model = YOLO("yolov8n-seg.pt") 
+  model = YOLO("yolov8n-seg.pt")
  model.export(format="onnx", imgsz=[480,640])
 onnx_model = onnx.load(open("yolov8n-seg.onnx", "rb"))
 # TODO: move get example inputs to onnx
--- a/examples/yolov8.py
+++ b/examples/yolov8.py
@@ -48,10 +48,10 @@ def box_area(box):
 def box_iou(box1, box2):
  lt = np.maximum(box1[:, None, :2], box2[:, :2])
  rb = np.minimum(box1[:, None, 2:], box2[:, 2:])
-  wh = np.clip(rb - lt, 0, None) 
-  inter = wh[:, :, 0] * wh[:, :, 1]  
-  area1 = box_area(box1)[:, None]  
-  area2 = box_area(box2)[None, :]  
+  wh = np.clip(rb - lt, 0, None)
+  inter = wh[:, :, 0] * wh[:, :, 1]
+  area1 = box_area(box1)[:, None]
+  area2 = box_area(box2)[None, :]
  iou = inter / (area1 + area2 - inter)
  return iou

@@ -66,7 +66,7 @@ def compute_nms(boxes, scores, iou_threshold):
    inds = np.where(iou.squeeze() <= iou_threshold)[0]
    order = order[inds + 1]
  return np.array(keep)
-    
+
 def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, agnostic=False, max_det=300, nc=0, max_wh=7680):
  prediction = prediction[0] if isinstance(prediction, (list, tuple)) else prediction
  bs, nc = prediction.shape[0], nc or (prediction.shape[1] - 4)
@@ -86,7 +86,7 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, agnostic=Fa
    c = x[:, 5:6] * (0 if agnostic else max_wh)
    boxes, scores = x[:, :4] + c, x[:, 4]
    i = compute_nms(boxes, scores, iou_thres)[:max_det]
-    output[xi] = x[i] 
+    output[xi] = x[i]
  return output

 def postprocess(preds, img, orig_imgs):
@@ -102,7 +102,7 @@ def postprocess(preds, img, orig_imgs):
      pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
      all_preds.append(pred)
  return all_preds
-  
+
 def draw_bounding_boxes_and_save(orig_img_paths, output_img_paths, all_predictions, class_labels, iou_threshold=0.5):
  color_dict = {label: tuple((((i+1) * 50) % 256, ((i+1) * 100) % 256, ((i+1) * 150) % 256)) for i, label in enumerate(class_labels)}
  font = cv2.FONT_HERSHEY_SIMPLEX
@@ -159,7 +159,7 @@ def draw_bounding_boxes_and_save(orig_img_paths, output_img_paths, all_predictio
    cv2.imwrite(output_img_path, orig_img)
    print(f'saved detections at {output_img_path}')

-# utility functions for forward pass. 
+# utility functions for forward pass.
 def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
  lt, rb = distance.chunk(2, dim)
  x1y1 = anchor_points - lt
@@ -167,7 +167,7 @@ def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
  if xywh:
    c_xy = (x1y1 + x2y2) / 2
    wh = x2y2 - x1y1
-    return c_xy.cat(wh, dim=1) 
+    return c_xy.cat(wh, dim=1)
  return x1y1.cat(x2y2, dim=1)

 def make_anchors(feats, strides, grid_cell_offset=0.5):
@@ -175,13 +175,13 @@ def make_anchors(feats, strides, grid_cell_offset=0.5):
  assert feats is not None
  for i, stride in enumerate(strides):
    _, _, h, w = feats[i].shape
-    sx = Tensor.arange(w) + grid_cell_offset  
-    sy = Tensor.arange(h) + grid_cell_offset 
-    
-    # this is np.meshgrid but in tinygrad 
+    sx = Tensor.arange(w) + grid_cell_offset
+    sy = Tensor.arange(h) + grid_cell_offset
+
+    # this is np.meshgrid but in tinygrad
    sx = sx.reshape(1, -1).repeat([h, 1]).reshape(-1)
    sy = sy.reshape(-1, 1).repeat([1, w]).reshape(-1)
-    
+
    anchor_points.append(Tensor.stack((sx, sy), -1).reshape(-1, 2))
    stride_tensor.append(Tensor.full((h * w), stride))
  anchor_points = anchor_points[0].cat(anchor_points[1], anchor_points[2])
@@ -244,32 +244,32 @@ class Upsample:
    (b, c), _lens = x.shape[:2], len(x.shape[2:])
    tmp = x.reshape([b, c, -1] + [1] * _lens) * Tensor.ones(*[1, 1, 1] + [self.scale_factor] * _lens)
    return tmp.reshape(list(x.shape) + [self.scale_factor] * _lens).permute([0, 1] + list(chain.from_iterable([[y+2, y+2+_lens] for y in range(_lens)]))).reshape([b, c] + [x * self.scale_factor for x in x.shape[2:]])
-  
+
 class Conv_Block():
  def __init__(self, c1, c2, kernel_size=1, stride=1, groups=1, dilation=1, padding=None):
    self.conv = Conv2d(c1,c2, kernel_size, stride, padding=autopad(kernel_size, padding, dilation), bias=False, groups=groups, dilation=dilation)
    self.bn = BatchNorm2d(c2, eps=0.001)
-  
+
  def __call__(self, x):
    return self.bn(self.conv(x)).silu()
-  
+
 class Bottleneck:
  def __init__(self, c1, c2 , shortcut: bool, g=1, kernels: list = (3,3), channel_factor=0.5):
    c_ = int(c2 * channel_factor)
    self.cv1 = Conv_Block(c1, c_, kernel_size=kernels[0], stride=1, padding=None)
    self.cv2 = Conv_Block(c_, c2, kernel_size=kernels[1], stride=1, padding=None, groups=g)
    self.residual = c1 == c2 and shortcut
-    
+
  def __call__(self, x):
    return x + self.cv2(self.cv1(x)) if self.residual else self.cv2(self.cv1(x))

 class C2f:
-  def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): 
-    self.c = int(c2 * e) 
+  def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
+    self.c = int(c2 * e)
    self.cv1 = Conv_Block(c1, 2 * self.c, 1,)
    self.cv2 = Conv_Block((2 + n) * self.c, c2, 1)
    self.bottleneck = [Bottleneck(self.c, self.c, shortcut, g, kernels=[(3, 3), (3, 3)], channel_factor=1.0) for _ in range(n)]
-   
+
  def __call__(self, x):
    y= list(self.cv1(x).chunk(2, 1))
    y.extend(m(y[-1]) for m in self.bottleneck)
@@ -282,17 +282,17 @@ class SPPF:
    c_ = c1 // 2  # hidden channels
    self.cv1 = Conv_Block(c1, c_, 1, 1, padding=None)
    self.cv2 = Conv_Block(c_ * 4, c2, 1, 1, padding=None)
-    
-    # TODO: this pads with 0s, whereas torch function pads with -infinity. This results in a < 2% difference in prediction which does not make a difference visually. 
+
+    # TODO: this pads with 0s, whereas torch function pads with -infinity. This results in a < 2% difference in prediction which does not make a difference visually.
    self.maxpool = lambda x : x.pad2d((k // 2, k // 2, k // 2, k // 2)).max_pool2d(kernel_size=k, stride=1)
-        
+
  def __call__(self, x):
    x = self.cv1(x)
    x2 = self.maxpool(x)
    x3 = self.maxpool(x2)
    x4 = self.maxpool(x3)
    return self.cv2(x.cat(x2, x3, x4, dim=1))
-  
+
 class DFL:
  def __init__(self, c1=16):
    self.conv = Conv2d(c1, 1, 1, bias=False)
@@ -303,19 +303,19 @@ class DFL:
  def __call__(self, x):
    b, c, a = x.shape # batch, channels, anchors
    return self.conv(x.reshape(b, 4, self.c1, a).transpose(2, 1).softmax(1)).reshape(b, 4, a)
-  
-#backbone                               
+
+#backbone
 class Darknet:
-  def __init__(self, w, r, d): 
+  def __init__(self, w, r, d):
    self.b1 = [Conv_Block(c1=3, c2= int(64*w), kernel_size=3, stride=2, padding=1), Conv_Block(int(64*w), int(128*w), kernel_size=3, stride=2, padding=1)]
    self.b2 = [C2f(c1=int(128*w), c2=int(128*w), n=round(3*d), shortcut=True), Conv_Block(int(128*w), int(256*w), 3, 2, 1), C2f(int(256*w), int(256*w), round(6*d), True)]
    self.b3 = [Conv_Block(int(256*w), int(512*w), kernel_size=3, stride=2, padding=1), C2f(int(512*w), int(512*w), round(6*d), True)]
    self.b4 = [Conv_Block(int(512*w), int(512*w*r), kernel_size=3, stride=2, padding=1), C2f(int(512*w*r), int(512*w*r), round(3*d), True)]
    self.b5 = [SPPF(int(512*w*r), int(512*w*r), 5)]
-    
+
  def return_modules(self):
    return [*self.b1, *self.b2, *self.b3, *self.b4, *self.b5]
-  
+
  def __call__(self, x):
    x1 = x.sequential(self.b1)
    x2 = x1.sequential(self.b2)
@@ -334,10 +334,10 @@ class Yolov8NECK:
    self.n4 = C2f(c1=int(768*w), c2=int(512*w), n=round(3*d), shortcut=False)
    self.n5 = Conv_Block(c1=int(512* w), c2=int(512 * w), kernel_size=3, stride=2, padding=1)
    self.n6 = C2f(c1=int(512*w*(1+r)), c2=int(512*w*r), n=round(3*d), shortcut=False)
-  
+
  def return_modules(self):
    return [self.n1, self.n2, self.n3, self.n4, self.n5, self.n6]
-  
+
  def __call__(self, p3, p4, p5):
    x = self.n1(self.up(p5).cat(p4, dim=1))
    head_1 = self.n2(self.up(x).cat(p3, dim=1))
@@ -345,20 +345,20 @@ class Yolov8NECK:
    head_3 = self.n6(self.n5(head_2).cat(p5, dim=1))
    return [head_1, head_2, head_3]

-#task specific head. 
+#task specific head.
 class DetectionHead:
  def __init__(self, nc=80, filters=()):
-    self.ch = 16 
+    self.ch = 16
    self.nc = nc  # number of classes
-    self.nl = len(filters)  
+    self.nl = len(filters)
    self.no = nc + self.ch * 4  #
    self.stride = [8, 16, 32]
    c1 = max(filters[0], self.nc)
    c2 = max((filters[0] // 4, self.ch * 4))
-    self.dfl = DFL(self.ch) 
+    self.dfl = DFL(self.ch)
    self.cv3 = [[Conv_Block(x, c1, 3), Conv_Block(c1, c1, 3), Conv2d(c1, self.nc, 1)] for x in filters]
    self.cv2 = [[Conv_Block(x, c2, 3), Conv_Block(c2, c2, 3), Conv2d(c2, 4 * self.ch, 1)] for x in filters]
-  
+
  def __call__(self, x):
    for i in range(self.nl):
      x[i] = (x[i].sequential(self.cv2[i]).cat(x[i].sequential(self.cv3[i]), dim=1))
@@ -369,7 +369,7 @@ class DetectionHead:
    dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
    z = dbox.cat(cls.sigmoid(), dim=1)
    return z
-   
+
 class YOLOv8:
  def __init__(self, w, r,  d, num_classes): #width_multiple, ratio_multiple, depth_multiple
    self.net = Darknet(w, r, d)
@@ -386,9 +386,9 @@ class YOLOv8:
    yolov8neck_modules = [12, 15, 16, 18, 19, 21]
    yolov8_head_weights = [(22, self.head)]
    return [*zip(backbone_modules, self.net.return_modules()), *zip(yolov8neck_modules, self.fpn.return_modules()), *yolov8_head_weights]
-  
+
 if __name__ == '__main__':
-  
+
  # usage : python3 yolov8.py "image_URL OR image_path" "v8 variant" (optional, n is default)
  if len(sys.argv) < 2:
    print("Error: Image URL or path not provided.")
@@ -397,7 +397,7 @@ if __name__ == '__main__':
  img_path = sys.argv[1]
  yolo_variant = sys.argv[2] if len(sys.argv) >= 3 else (print("No variant given, so choosing 'n' as the default. Yolov8 has different variants, you can choose from ['n', 's', 'm', 'l', 'x']") or 'n')
  print(f'running inference for YOLO version {yolo_variant}')
-  
+
  output_folder_path = './outputs_yolov8'
  if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
@@ -409,31 +409,31 @@ if __name__ == '__main__':
    print('Error in image loading. Check your image file.')
    sys.exit(1)
  pre_processed_image = preprocess(image)
-  
+
  # Different YOLOv8 variants use different w , r, and d multiples. For a list , refer to this yaml file (the scales section) https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/v8/yolov8.yaml
-  depth, width, ratio = get_variant_multiples(yolo_variant) 
-  yolo_infer = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)  
-  
+  depth, width, ratio = get_variant_multiples(yolo_variant)
+  yolo_infer = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
+
  weights_location = Path(__file__).parent.parent / "weights" / f'yolov8{yolo_variant}.safetensors'
  download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{yolo_variant}.safetensors', weights_location)
-  
+
  state_dict = safe_load(weights_location)
  load_state_dict(yolo_infer, state_dict)
-    
+
  st = time.time()
  predictions = yolo_infer(pre_processed_image)
  print(f'did inference in {int(round(((time.time() - st) * 1000)))}ms')

  post_predictions = postprocess(preds=predictions, img=pre_processed_image, orig_imgs=image)
-  
+
  #v8 and v3 have same 80 class names for Object Detection
  class_labels = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names')
  class_labels = class_labels.decode('utf-8').split('\n')

  draw_bounding_boxes_and_save(orig_img_paths=image_location, output_img_paths=out_paths, all_predictions=post_predictions, class_labels=class_labels)

-# TODO for later: 
-#  1. Fix SPPF minor difference due to maxpool 
-#  2. AST exp overflow warning while on cpu 
-#  3. Make NMS faster 
+# TODO for later:
+#  1. Fix SPPF minor difference due to maxpool
+#  2. AST exp overflow warning while on cpu
+#  3. Make NMS faster
 #  4. Add video inference and webcam support
--- a/extra/lr_scheduler.py
+++ b/extra/lr_scheduler.py
@@ -7,9 +7,9 @@ class LR_Scheduler:
  def __init__(self, optimizer: Optimizer):
    self.optimizer = optimizer
    self.epoch_counter = Tensor([0], requires_grad=False)
-  
+
  def get_lr(self): pass
-  
+
  def step(self) -> None:
    self.epoch_counter.assign(self.epoch_counter + 1).realize()
    self.optimizer.lr.assign(self.get_lr()).realize()
@@ -19,7 +19,7 @@ class MultiStepLR(LR_Scheduler):
    super().__init__(optimizer)
    self.milestones = milestones
    self.gamma = gamma
-  
+
  def get_lr(self) -> Tensor:
    if self.epoch_counter.numpy()[0] not in self.milestones:
      return self.optimizer.lr
@@ -34,13 +34,13 @@ class ReduceLROnPlateau(LR_Scheduler):
    self.bad_epoch = 0

    if mode == "min": self.threshold *= -1
-  
+
  def is_better(self, current: float) -> bool:
    dynamic_threshold = self.best*(1+self.threshold) if self.threshold_mode == "rel" else self.best+self.threshold
    if self.mode == "min":
      return current < dynamic_threshold
    return current > dynamic_threshold
-  
+
  def step(self, current: float) -> None:
    self.epoch_counter.assign(self.epoch_counter + 1).realize()
    if self.is_better(current):
@@ -48,7 +48,7 @@ class ReduceLROnPlateau(LR_Scheduler):
      self.best = current
    else:
      self.bad_epoch += 1
-    
+
    if self.bad_epoch > self.patience:
      self.optimizer.lr *= self.factor
      self.bad_epoch = 0
@@ -74,12 +74,12 @@ class OneCycleLR(LR_Scheduler):
    self.pct_start = pct_start
    assert anneal_strategy == 'linear', 'only linear annealing supported'
    assert not cycle_momentum, 'cycle momentum not supported'
-    self.optimizer.lr.assign(self.get_lr()).realize() # update the initial LR 
+    self.optimizer.lr.assign(self.get_lr()).realize() # update the initial LR

  @staticmethod
  def _annealing_linear(start: Tensor, end: Tensor, pct: Tensor) -> Tensor: return ((end - start) * pct + start)

-  def get_lr(self) -> Tensor: 
+  def get_lr(self) -> Tensor:
    return (self.epoch_counter < self.total_steps*self.pct_start).where(
      self._annealing_linear(self.initial_lr, self.max_lr, self.epoch_counter/(self.total_steps*self.pct_start)),
      self._annealing_linear(self.max_lr, self.min_lr, (self.epoch_counter-(self.total_steps*self.pct_start))/(self.total_steps*(1-self.pct_start)))
--- a/extra/onnx.py
+++ b/extra/onnx.py
@@ -78,7 +78,7 @@ def get_run_onnx(onnx_model: ModelProto):
  attribute_dict = {}
  for num,n in enumerate(onnx_model.graph.node):
    attribute_dict[num] = attribute_to_dict(n.attribute)
-  
+
  onnx_model_version = onnx_model.opset_import[0].version

  def run_onnx(inputs={}, debug=False):
@@ -204,7 +204,7 @@ def get_run_onnx(onnx_model: ModelProto):
      assert len(n.output) <= len(ret), f"expected output size must be less than {len(ret)}, it's {n.output}"
      if debug: print([x.shape if isinstance(x, Tensor) else None for x in ret])
      if debug: print("outputs:")
-      for i in range(len(n.output)): 
+      for i in range(len(n.output)):
        if debug: print(f"\t{n.output[i]} - {ret[i]}")
        intermediate_tensors[n.output[i]] = ret[i]
      #print(ret[0].numpy().mean())
--- a/extra/onnx_ops.py
+++ b/extra/onnx_ops.py
@@ -209,7 +209,7 @@ def Or(x:Tensor, y:Tensor): return Where((x==y), x, Tensor.ones(*x.shape)).cast(
 def Xor(x:Tensor, y:Tensor): return Where((x==y), Tensor.zeros(*x.shape), Tensor.ones(*x.shape)).cast(dtypes.bool)
 def Not(x:Tensor): return Where((x==1), Tensor.zeros(*x.shape), Tensor.ones(*x.shape)).cast(dtypes.bool)

-def Trilu(x: Tensor, k: Union[Tensor, int]=0, upper=1): 
+def Trilu(x: Tensor, k: Union[Tensor, int]=0, upper=1):
  k = int(k.numpy().item()) if k is not 0 else 0 # onnx passes k as a tensor int64 with one element, default is 0
  return x.triu(k) if upper else x.tril(k)

@@ -242,13 +242,13 @@ def NegativeLogLikelihoodLoss(input, target, weight=None, ignore_index=None, red
    input = input.reshape((N, C, -1))
    target = target.reshape((N, -1))
  if weight is not None:
-    mask = target.unsqueeze(-1) == Tensor.arange(C,dtype=dtypes.int64).repeat((N, 1, 1)) 
+    mask = target.unsqueeze(-1) == Tensor.arange(C,dtype=dtypes.int64).repeat((N, 1, 1))
    weight = (mask * weight).sum(axis=-1)
  if ignore_index is not None:
    cond = (target == ignore_index)
-    weight = cond.where(0, weight) if weight is not None else cond.where(Tensor.zeros(*target.shape), 1) 
-  mask = target[:, None, :] ==  Tensor.arange(C).reshape([1, C] + [1]*(len(input.shape) -2)) 
-  loss = (-mask * input).sum(axis=1) * (1 if weight is None else weight)  
+    weight = cond.where(0, weight) if weight is not None else cond.where(Tensor.zeros(*target.shape), 1)
+  mask = target[:, None, :] ==  Tensor.arange(C).reshape([1, C] + [1]*(len(input.shape) -2))
+  loss = (-mask * input).sum(axis=1) * (1 if weight is None else weight)
  if reduction == "mean": return loss.mean() if weight is None else loss.sum() / weight.sum()
  elif reduction == "sum": return loss.sum()
  return loss.reshape(t_shape) if len(i_shape) != 3 else loss
@@ -259,7 +259,7 @@ def OneHot(indices, depth, values, axis=-1):
  if axis < 0: axis += rank + 1
  ls, rs = indices.shape[0:axis], indices.shape[axis: rank]
  cond = indices[:,None] == Tensor.arange(depth).reshape((1,) * len(ls) + (depth,) + (1,) * len(rs))
-  return cond.where(values[1], values[0]).cast(values.dtype) 
+  return cond.where(values[1], values[0]).cast(values.dtype)

 def Floor(x:Tensor): return x.floor()
 def Ceil(x:Tensor): return x.ceil()
--- a/extra/training.py
+++ b/extra/training.py
@@ -13,7 +13,7 @@ def sparse_categorical_crossentropy(out, Y):
  y = Tensor(y)
  return out.mul(y).mean()

-def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categorical_crossentropy, 
+def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categorical_crossentropy,
        transform=lambda x: x, target_transform=lambda x: x, noloss=False):
  Tensor.training = True
  losses, accuracies = [], []
@@ -41,9 +41,9 @@ def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categoric
      accuracies.append(accuracy)
      t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
  return [losses, accuracies]
-    

-def evaluate(model, X_test, Y_test, num_classes=None, BS=128, return_predict=False, transform=lambda x: x, 
+
+def evaluate(model, X_test, Y_test, num_classes=None, BS=128, return_predict=False, transform=lambda x: x,
             target_transform=lambda y: y):
  Tensor.training = False
  def numpy_eval(Y_test, num_classes):
--- a/models/mask_rcnn.py
+++ b/models/mask_rcnn.py
@@ -40,7 +40,7 @@ def topk(input_, k, dim=-1, largest=True, sorted=False):
  ind_part = np.argsort(input_, axis=dim)
  ind = np.take_along_axis(ind, ind_part, axis=dim)
  if largest: input_ *= -1
-  val = np.take_along_axis(input_, ind_part, axis=dim) 
+  val = np.take_along_axis(input_, ind_part, axis=dim)
  return Tensor(val), ind

 # This is very slow for large arrays, or indices
@@ -48,12 +48,12 @@ def _gather(array, indices):
  indices = indices.float().to(array.device)
  reshape_arg = [1]*array.ndim + [array.shape[-1]]
  return Tensor.where(
-    indices.unsqueeze(indices.ndim).expand(*indices.shape, array.shape[-1]) == Tensor.arange(array.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, array.shape[-1]), 
+    indices.unsqueeze(indices.ndim).expand(*indices.shape, array.shape[-1]) == Tensor.arange(array.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, array.shape[-1]),
    array, 0,
  ).sum(indices.ndim)

 # TODO: replace npgather with a faster gather using tinygrad only
-# NOTE: this blocks the gradient 
+# NOTE: this blocks the gradient
 def npgather(array,indices):
  if isinstance(array, Tensor): array = array.numpy()
  if isinstance(indices, Tensor): indices = indices.numpy()
@@ -98,7 +98,7 @@ def tensor_gather(tensor, indices):
  return ret


-class LastLevelMaxPool: 
+class LastLevelMaxPool:
  def __call__(self, x): return [Tensor.max_pool2d(x, 1, 2)]


@@ -853,7 +853,7 @@ def _bilinear_interpolate(
  w2 = outer_prod(hy, lx)
  w3 = outer_prod(ly, hx)
  w4 = outer_prod(ly, lx)
-  
+
  val = w1*v1 + w2*v2 + w3*v3 + w4*v4
  return val

@@ -861,41 +861,41 @@ def _bilinear_interpolate(
 def _roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned):
  orig_dtype = input.dtype
  _, _, height, width = input.shape
-  ph = Tensor.arange(pooled_height, device=input.device)  
-  pw = Tensor.arange(pooled_width, device=input.device) 
+  ph = Tensor.arange(pooled_height, device=input.device)
+  pw = Tensor.arange(pooled_width, device=input.device)

-  roi_batch_ind = rois[:, 0].cast(dtypes.int32).contiguous() 
+  roi_batch_ind = rois[:, 0].cast(dtypes.int32).contiguous()
  offset = 0.5 if aligned else 0.0
  roi_start_w = rois[:, 1] * spatial_scale - offset
  roi_start_h = rois[:, 2] * spatial_scale - offset
-  roi_end_w = rois[:, 3] * spatial_scale - offset 
+  roi_end_w = rois[:, 3] * spatial_scale - offset
  roi_end_h = rois[:, 4] * spatial_scale - offset

-  roi_width = roi_end_w - roi_start_w 
-  roi_height = roi_end_h - roi_start_h 
+  roi_width = roi_end_w - roi_start_w
+  roi_height = roi_end_h - roi_start_h
  if not aligned:
-    roi_width = roi_width.maximum(1.0) 
-    roi_height = roi_height.maximum(1.0) 
+    roi_width = roi_width.maximum(1.0)
+    roi_height = roi_height.maximum(1.0)

-  bin_size_h = roi_height / pooled_height  
-  bin_size_w = roi_width / pooled_width  
+  bin_size_h = roi_height / pooled_height
+  bin_size_w = roi_width / pooled_width

  exact_sampling = sampling_ratio > 0
-  roi_bin_grid_h = sampling_ratio if exact_sampling else (roi_height / pooled_height).ceil() 
+  roi_bin_grid_h = sampling_ratio if exact_sampling else (roi_height / pooled_height).ceil()
  roi_bin_grid_w = sampling_ratio if exact_sampling else (roi_width / pooled_width).ceil()

  if exact_sampling:
-    count = max(roi_bin_grid_h * roi_bin_grid_w, 1)  
-    iy = Tensor.arange(roi_bin_grid_h, device=input.device) 
-    ix = Tensor.arange(roi_bin_grid_w, device=input.device) 
+    count = max(roi_bin_grid_h * roi_bin_grid_w, 1)
+    iy = Tensor.arange(roi_bin_grid_h, device=input.device)
+    ix = Tensor.arange(roi_bin_grid_w, device=input.device)
    ymask = None
    xmask = None
  else:
    count = (roi_bin_grid_h * roi_bin_grid_w).maximum(1)
-    iy = Tensor.arange(height, device=input.device)  
-    ix = Tensor.arange(width, device=input.device)  
-    ymask = iy[None, :] < roi_bin_grid_h[:, None] 
-    xmask = ix[None, :] < roi_bin_grid_w[:, None] 
+    iy = Tensor.arange(height, device=input.device)
+    ix = Tensor.arange(width, device=input.device)
+    ymask = iy[None, :] < roi_bin_grid_h[:, None]
+    xmask = ix[None, :] < roi_bin_grid_w[:, None]

  def from_K(t):
    return t[:, None, None]
--- a/models/unet3d.py
+++ b/models/unet3d.py
@@ -30,7 +30,7 @@ class UNet3D:
    self.input_block = DownsampleBlock(in_channels, filters[0], stride=1)
    self.downsample = [DownsampleBlock(i, o) for i, o in zip(inp, out)]
    self.bottleneck = DownsampleBlock(filters[-1], filters[-1])
-    self.upsample = [UpsampleBlock(filters[-1], filters[-1])] + [UpsampleBlock(i, o) for i, o in zip(out[::-1], inp[::-1])] 
+    self.upsample = [UpsampleBlock(filters[-1], filters[-1])] + [UpsampleBlock(i, o) for i, o in zip(out[::-1], inp[::-1])]
    self.output = {"conv": nn.Conv2d(filters[0], n_class, kernel_size=(1, 1, 1))}

  def __call__(self, x):
@@ -44,7 +44,7 @@ class UNet3D:
      x = upsample(x, skip)
    x = self.output["conv"](x)
    return x
-    
+
  def load_from_pretrained(self):
    fn = Path(__file__).parent.parent / "weights" / "unet-3d.ckpt"
    download_file("https://zenodo.org/record/5597155/files/3dunet_kits19_pytorch.ptc?download=1", fn)
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -104,7 +104,7 @@ class TestConv(unittest.TestCase):
    x = x.conv2d(w, groups=32)
    out = x.numpy()
    Tensor.no_grad = False
-  
+
  def test_multiadd(self):
    w = Tensor.ones(32)
    x = Tensor.ones(32).relu()