655 lines
22 KiB
Python
655 lines
22 KiB
Python
from tinygrad.nn import Conv2d, BatchNorm2d
|
|
from tinygrad.tensor import Tensor
|
|
import numpy as np
|
|
from itertools import chain
|
|
from pathlib import Path
|
|
import cv2
|
|
from collections import defaultdict
|
|
import time, sys
|
|
from tinygrad.helpers import fetch
|
|
from tinygrad.nn.state import safe_load, load_state_dict
|
|
|
|
# Model architecture from https://github.com/ultralytics/ultralytics/issues/189
|
|
# The upsampling class has been taken from this pull request https://github.com/tinygrad/tinygrad/pull/784 by dc-dc-dc. Now 2(?) models use upsampling. (retinet and this)
|
|
|
|
|
|
# Pre processing image functions.
|
|
def compute_transform(
|
|
image, new_shape=(640, 640), auto=False, scaleFill=False, scaleup=True, stride=32
|
|
):
|
|
shape = image.shape[:2] # current shape [height, width]
|
|
new_shape = (new_shape, new_shape) if isinstance(new_shape, int) else new_shape
|
|
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
|
|
r = min(r, 1.0) if not scaleup else r
|
|
new_unpad = (int(round(shape[1] * r)), int(round(shape[0] * r)))
|
|
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
|
|
dw, dh = (np.mod(dw, stride), np.mod(dh, stride)) if auto else (0.0, 0.0)
|
|
new_unpad = (new_shape[1], new_shape[0]) if scaleFill else new_unpad
|
|
dw /= 2
|
|
dh /= 2
|
|
image = (
|
|
cv2.resize(image, new_unpad, interpolation=cv2.INTER_LINEAR)
|
|
if shape[::-1] != new_unpad
|
|
else image
|
|
)
|
|
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
|
|
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
|
|
image = cv2.copyMakeBorder(
|
|
image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
|
|
)
|
|
return image
|
|
|
|
|
|
def preprocess(im, imgsz=640, model_stride=32, model_pt=True):
|
|
same_shapes = all(x.shape == im[0].shape for x in im)
|
|
auto = same_shapes and model_pt
|
|
im = Tensor(
|
|
[
|
|
compute_transform(x, new_shape=imgsz, auto=auto, stride=model_stride)
|
|
for x in im
|
|
]
|
|
)
|
|
im = Tensor.stack(im) if im.shape[0] > 1 else im
|
|
im = im[..., ::-1].permute(0, 3, 1, 2) # BGR to RGB, BHWC to BCHW, (n, 3, h, w)
|
|
im /= 255 # 0 - 255 to 0.0 - 1.0
|
|
return im
|
|
|
|
|
|
# Post Processing functions
|
|
def box_area(box):
|
|
return (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1])
|
|
|
|
|
|
def box_iou(box1, box2):
|
|
lt = np.maximum(box1[:, None, :2], box2[:, :2])
|
|
rb = np.minimum(box1[:, None, 2:], box2[:, 2:])
|
|
wh = np.clip(rb - lt, 0, None)
|
|
inter = wh[:, :, 0] * wh[:, :, 1]
|
|
area1 = box_area(box1)[:, None]
|
|
area2 = box_area(box2)[None, :]
|
|
iou = inter / (area1 + area2 - inter)
|
|
return iou
|
|
|
|
|
|
def compute_nms(boxes, scores, iou_threshold):
|
|
order, keep = scores.argsort()[::-1], []
|
|
while order.size > 0:
|
|
i = order[0]
|
|
keep.append(i)
|
|
if order.size == 1:
|
|
break
|
|
iou = box_iou(boxes[i][None, :], boxes[order[1:]])
|
|
inds = np.where(iou.squeeze() <= iou_threshold)[0]
|
|
order = order[inds + 1]
|
|
return np.array(keep)
|
|
|
|
|
|
def non_max_suppression(
|
|
prediction,
|
|
conf_thres=0.25,
|
|
iou_thres=0.45,
|
|
agnostic=False,
|
|
max_det=300,
|
|
nc=0,
|
|
max_wh=7680,
|
|
):
|
|
prediction = prediction[0] if isinstance(prediction, (list, tuple)) else prediction
|
|
bs, nc = prediction.shape[0], nc or (prediction.shape[1] - 4)
|
|
xc = np.amax(prediction[:, 4 : 4 + nc], axis=1) > conf_thres
|
|
nm = prediction.shape[1] - nc - 4
|
|
output = [np.zeros((0, 6 + nm))] * bs
|
|
|
|
for xi, x in enumerate(prediction):
|
|
x = x.swapaxes(0, -1)[xc[xi]]
|
|
if not x.shape[0]:
|
|
continue
|
|
box, cls, mask = np.split(x, [4, 4 + nc], axis=1)
|
|
conf, j = np.max(cls, axis=1, keepdims=True), np.argmax(
|
|
cls, axis=1, keepdims=True
|
|
)
|
|
x = np.concatenate((xywh2xyxy(box), conf, j.astype(np.float32), mask), axis=1)
|
|
x = x[conf.ravel() > conf_thres]
|
|
if not x.shape[0]:
|
|
continue
|
|
x = x[np.argsort(-x[:, 4])]
|
|
c = x[:, 5:6] * (0 if agnostic else max_wh)
|
|
boxes, scores = x[:, :4] + c, x[:, 4]
|
|
i = compute_nms(boxes, scores, iou_thres)[:max_det]
|
|
output[xi] = x[i]
|
|
return output
|
|
|
|
|
|
def postprocess(preds, img, orig_imgs):
|
|
print("copying to CPU now for post processing")
|
|
# if you are on CPU, this causes an overflow runtime error. doesn't "seem" to make any difference in the predictions though.
|
|
# TODO: make non_max_suppression in tinygrad - to make this faster
|
|
preds = preds.numpy() if isinstance(preds, Tensor) else preds
|
|
preds = non_max_suppression(
|
|
prediction=preds, conf_thres=0.25, iou_thres=0.7, agnostic=False, max_det=300
|
|
)
|
|
all_preds = []
|
|
for i, pred in enumerate(preds):
|
|
orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs
|
|
if not isinstance(orig_imgs, Tensor):
|
|
pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
|
|
all_preds.append(pred)
|
|
return all_preds
|
|
|
|
|
|
def draw_bounding_boxes_and_save(
|
|
orig_img_paths, output_img_paths, all_predictions, class_labels, iou_threshold=0.5
|
|
):
|
|
color_dict = {
|
|
label: tuple(
|
|
(((i + 1) * 50) % 256, ((i + 1) * 100) % 256, ((i + 1) * 150) % 256)
|
|
)
|
|
for i, label in enumerate(class_labels)
|
|
}
|
|
font = cv2.FONT_HERSHEY_SIMPLEX
|
|
|
|
def is_bright_color(color):
|
|
r, g, b = color
|
|
brightness = (r * 299 + g * 587 + b * 114) / 1000
|
|
return brightness > 127
|
|
|
|
for img_idx, (orig_img_path, output_img_path, predictions) in enumerate(
|
|
zip(orig_img_paths, output_img_paths, all_predictions)
|
|
):
|
|
predictions = np.array(predictions)
|
|
orig_img = (
|
|
cv2.imread(orig_img_path)
|
|
if not isinstance(orig_img_path, np.ndarray)
|
|
else cv2.imdecode(orig_img_path, 1)
|
|
)
|
|
height, width, _ = orig_img.shape
|
|
box_thickness = int((height + width) / 400)
|
|
font_scale = (height + width) / 2500
|
|
|
|
grouped_preds = defaultdict(list)
|
|
object_count = defaultdict(int)
|
|
|
|
for pred_np in predictions:
|
|
grouped_preds[int(pred_np[-1])].append(pred_np)
|
|
|
|
def draw_box_and_label(pred, color):
|
|
x1, y1, x2, y2, conf, _ = pred
|
|
x1, y1, x2, y2 = map(int, (x1, y1, x2, y2))
|
|
cv2.rectangle(orig_img, (x1, y1), (x2, y2), color, box_thickness)
|
|
label = f"{class_labels[class_id]} {conf:.2f}"
|
|
text_size, _ = cv2.getTextSize(label, font, font_scale, 1)
|
|
label_y, bg_y = (
|
|
(y1 - 4, y1 - text_size[1] - 4)
|
|
if y1 - text_size[1] - 4 > 0
|
|
else (y1 + text_size[1], y1)
|
|
)
|
|
cv2.rectangle(
|
|
orig_img,
|
|
(x1, bg_y),
|
|
(x1 + text_size[0], bg_y + text_size[1]),
|
|
color,
|
|
-1,
|
|
)
|
|
font_color = (0, 0, 0) if is_bright_color(color) else (255, 255, 255)
|
|
cv2.putText(
|
|
orig_img,
|
|
label,
|
|
(x1, label_y),
|
|
font,
|
|
font_scale,
|
|
font_color,
|
|
1,
|
|
cv2.LINE_AA,
|
|
)
|
|
|
|
for class_id, pred_list in grouped_preds.items():
|
|
pred_list = np.array(pred_list)
|
|
while len(pred_list) > 0:
|
|
max_conf_idx = np.argmax(pred_list[:, 4])
|
|
max_conf_pred = pred_list[max_conf_idx]
|
|
pred_list = np.delete(pred_list, max_conf_idx, axis=0)
|
|
color = color_dict[class_labels[class_id]]
|
|
draw_box_and_label(max_conf_pred, color)
|
|
object_count[class_labels[class_id]] += 1
|
|
iou_scores = box_iou(np.array([max_conf_pred[:4]]), pred_list[:, :4])
|
|
low_iou_indices = np.where(iou_scores[0] < iou_threshold)[0]
|
|
pred_list = pred_list[low_iou_indices]
|
|
for low_conf_pred in pred_list:
|
|
draw_box_and_label(low_conf_pred, color)
|
|
|
|
print(f"Image {img_idx + 1}:")
|
|
print("Objects detected:")
|
|
for obj, count in object_count.items():
|
|
print(f"- {obj}: {count}")
|
|
|
|
cv2.imwrite(output_img_path, orig_img)
|
|
print(f"saved detections at {output_img_path}")
|
|
|
|
|
|
# utility functions for forward pass.
|
|
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
|
|
lt, rb = distance.chunk(2, dim)
|
|
x1y1 = anchor_points - lt
|
|
x2y2 = anchor_points + rb
|
|
if xywh:
|
|
c_xy = (x1y1 + x2y2) / 2
|
|
wh = x2y2 - x1y1
|
|
return c_xy.cat(wh, dim=1)
|
|
return x1y1.cat(x2y2, dim=1)
|
|
|
|
|
|
def make_anchors(feats, strides, grid_cell_offset=0.5):
|
|
anchor_points, stride_tensor = [], []
|
|
assert feats is not None
|
|
for i, stride in enumerate(strides):
|
|
_, _, h, w = feats[i].shape
|
|
sx = Tensor.arange(w) + grid_cell_offset
|
|
sy = Tensor.arange(h) + grid_cell_offset
|
|
|
|
# this is np.meshgrid but in tinygrad
|
|
sx = sx.reshape(1, -1).repeat([h, 1]).reshape(-1)
|
|
sy = sy.reshape(-1, 1).repeat([1, w]).reshape(-1)
|
|
|
|
anchor_points.append(Tensor.stack((sx, sy), -1).reshape(-1, 2))
|
|
stride_tensor.append(Tensor.full((h * w), stride))
|
|
anchor_points = anchor_points[0].cat(anchor_points[1], anchor_points[2])
|
|
stride_tensor = (
|
|
stride_tensor[0].cat(stride_tensor[1], stride_tensor[2]).unsqueeze(1)
|
|
)
|
|
return anchor_points, stride_tensor
|
|
|
|
|
|
# this function is from the original implementation
|
|
def autopad(k, p=None, d=1): # kernel, padding, dilation
|
|
if d > 1:
|
|
k = (
|
|
d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
|
|
) # actual kernel-size
|
|
if p is None:
|
|
p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
|
|
return p
|
|
|
|
|
|
def clip_boxes(boxes, shape):
|
|
boxes[..., [0, 2]] = np.clip(boxes[..., [0, 2]], 0, shape[1]) # x1, x2
|
|
boxes[..., [1, 3]] = np.clip(boxes[..., [1, 3]], 0, shape[0]) # y1, y2
|
|
return boxes
|
|
|
|
|
|
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
|
|
gain = (
|
|
ratio_pad
|
|
if ratio_pad
|
|
else min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
|
|
)
|
|
pad = (
|
|
(img1_shape[1] - img0_shape[1] * gain) / 2,
|
|
(img1_shape[0] - img0_shape[0] * gain) / 2,
|
|
)
|
|
boxes_np = boxes.numpy() if isinstance(boxes, Tensor) else boxes
|
|
boxes_np[..., [0, 2]] -= pad[0]
|
|
boxes_np[..., [1, 3]] -= pad[1]
|
|
boxes_np[..., :4] /= gain
|
|
boxes_np = clip_boxes(boxes_np, img0_shape)
|
|
return boxes_np
|
|
|
|
|
|
def xywh2xyxy(x):
|
|
xy = x[..., :2] # center x, y
|
|
wh = x[..., 2:4] # width, height
|
|
xy1 = xy - wh / 2 # top left x, y
|
|
xy2 = xy + wh / 2 # bottom right x, y
|
|
result = np.concatenate((xy1, xy2), axis=-1)
|
|
return Tensor(result) if isinstance(x, Tensor) else result
|
|
|
|
|
|
def get_variant_multiples(variant):
|
|
return {
|
|
"n": (0.33, 0.25, 2.0),
|
|
"s": (0.33, 0.50, 2.0),
|
|
"m": (0.67, 0.75, 1.5),
|
|
"l": (1.0, 1.0, 1.0),
|
|
"x": (1, 1.25, 1.0),
|
|
}.get(variant, None)
|
|
|
|
|
|
def label_predictions(all_predictions):
|
|
class_index_count = defaultdict(int)
|
|
for predictions in all_predictions:
|
|
predictions = np.array(predictions)
|
|
for pred_np in predictions:
|
|
class_id = int(pred_np[-1])
|
|
class_index_count[class_id] += 1
|
|
|
|
return dict(class_index_count)
|
|
|
|
|
|
# this is taken from https://github.com/tinygrad/tinygrad/pull/784/files by dc-dc-dc (Now 2 models use upsampling)
|
|
class Upsample:
|
|
def __init__(self, scale_factor: int, mode: str = "nearest") -> None:
|
|
assert mode == "nearest" # only mode supported for now
|
|
self.mode = mode
|
|
self.scale_factor = scale_factor
|
|
|
|
def __call__(self, x: Tensor) -> Tensor:
|
|
assert len(x.shape) > 2 and len(x.shape) <= 5
|
|
(b, c), _lens = x.shape[:2], len(x.shape[2:])
|
|
tmp = x.reshape([b, c, -1] + [1] * _lens) * Tensor.ones(
|
|
*[1, 1, 1] + [self.scale_factor] * _lens
|
|
)
|
|
return (
|
|
tmp.reshape(list(x.shape) + [self.scale_factor] * _lens)
|
|
.permute(
|
|
[0, 1]
|
|
+ list(
|
|
chain.from_iterable([[y + 2, y + 2 + _lens] for y in range(_lens)])
|
|
)
|
|
)
|
|
.reshape([b, c] + [x * self.scale_factor for x in x.shape[2:]])
|
|
)
|
|
|
|
|
|
class Conv_Block:
|
|
def __init__(
|
|
self, c1, c2, kernel_size=1, stride=1, groups=1, dilation=1, padding=None
|
|
):
|
|
self.conv = Conv2d(
|
|
c1,
|
|
c2,
|
|
kernel_size,
|
|
stride,
|
|
padding=autopad(kernel_size, padding, dilation),
|
|
bias=False,
|
|
groups=groups,
|
|
dilation=dilation,
|
|
)
|
|
self.bn = BatchNorm2d(c2, eps=0.001)
|
|
|
|
def __call__(self, x):
|
|
return self.bn(self.conv(x)).silu()
|
|
|
|
|
|
class Bottleneck:
|
|
def __init__(
|
|
self, c1, c2, shortcut: bool, g=1, kernels: list = (3, 3), channel_factor=0.5
|
|
):
|
|
c_ = int(c2 * channel_factor)
|
|
self.cv1 = Conv_Block(c1, c_, kernel_size=kernels[0], stride=1, padding=None)
|
|
self.cv2 = Conv_Block(
|
|
c_, c2, kernel_size=kernels[1], stride=1, padding=None, groups=g
|
|
)
|
|
self.residual = c1 == c2 and shortcut
|
|
|
|
def __call__(self, x):
|
|
return x + self.cv2(self.cv1(x)) if self.residual else self.cv2(self.cv1(x))
|
|
|
|
|
|
class C2f:
|
|
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
|
|
self.c = int(c2 * e)
|
|
self.cv1 = Conv_Block(
|
|
c1,
|
|
2 * self.c,
|
|
1,
|
|
)
|
|
self.cv2 = Conv_Block((2 + n) * self.c, c2, 1)
|
|
self.bottleneck = [
|
|
Bottleneck(
|
|
self.c,
|
|
self.c,
|
|
shortcut,
|
|
g,
|
|
kernels=[(3, 3), (3, 3)],
|
|
channel_factor=1.0,
|
|
)
|
|
for _ in range(n)
|
|
]
|
|
|
|
def __call__(self, x):
|
|
y = list(self.cv1(x).chunk(2, 1))
|
|
y.extend(m(y[-1]) for m in self.bottleneck)
|
|
z = y[0]
|
|
for i in y[1:]:
|
|
z = z.cat(i, dim=1)
|
|
return self.cv2(z)
|
|
|
|
|
|
class SPPF:
|
|
def __init__(self, c1, c2, k=5):
|
|
c_ = c1 // 2 # hidden channels
|
|
self.cv1 = Conv_Block(c1, c_, 1, 1, padding=None)
|
|
self.cv2 = Conv_Block(c_ * 4, c2, 1, 1, padding=None)
|
|
|
|
# TODO: this pads with 0s, whereas torch function pads with -infinity. This results in a < 2% difference in prediction which does not make a difference visually.
|
|
self.maxpool = lambda x: x.pad2d((k // 2, k // 2, k // 2, k // 2)).max_pool2d(
|
|
kernel_size=k, stride=1
|
|
)
|
|
|
|
def __call__(self, x):
|
|
x = self.cv1(x)
|
|
x2 = self.maxpool(x)
|
|
x3 = self.maxpool(x2)
|
|
x4 = self.maxpool(x3)
|
|
return self.cv2(x.cat(x2, x3, x4, dim=1))
|
|
|
|
|
|
class DFL:
|
|
def __init__(self, c1=16):
|
|
self.conv = Conv2d(c1, 1, 1, bias=False)
|
|
x = Tensor.arange(c1)
|
|
self.conv.weight.assign(x.reshape(1, c1, 1, 1))
|
|
self.c1 = c1
|
|
|
|
def __call__(self, x):
|
|
b, c, a = x.shape # batch, channels, anchors
|
|
return self.conv(
|
|
x.reshape(b, 4, self.c1, a).transpose(2, 1).softmax(1)
|
|
).reshape(b, 4, a)
|
|
|
|
|
|
# backbone
|
|
class Darknet:
|
|
def __init__(self, w, r, d):
|
|
self.b1 = [
|
|
Conv_Block(c1=3, c2=int(64 * w), kernel_size=3, stride=2, padding=1),
|
|
Conv_Block(int(64 * w), int(128 * w), kernel_size=3, stride=2, padding=1),
|
|
]
|
|
self.b2 = [
|
|
C2f(c1=int(128 * w), c2=int(128 * w), n=round(3 * d), shortcut=True),
|
|
Conv_Block(int(128 * w), int(256 * w), 3, 2, 1),
|
|
C2f(int(256 * w), int(256 * w), round(6 * d), True),
|
|
]
|
|
self.b3 = [
|
|
Conv_Block(int(256 * w), int(512 * w), kernel_size=3, stride=2, padding=1),
|
|
C2f(int(512 * w), int(512 * w), round(6 * d), True),
|
|
]
|
|
self.b4 = [
|
|
Conv_Block(
|
|
int(512 * w), int(512 * w * r), kernel_size=3, stride=2, padding=1
|
|
),
|
|
C2f(int(512 * w * r), int(512 * w * r), round(3 * d), True),
|
|
]
|
|
self.b5 = [SPPF(int(512 * w * r), int(512 * w * r), 5)]
|
|
|
|
def return_modules(self):
|
|
return [*self.b1, *self.b2, *self.b3, *self.b4, *self.b5]
|
|
|
|
def __call__(self, x):
|
|
x1 = x.sequential(self.b1)
|
|
x2 = x1.sequential(self.b2)
|
|
x3 = x2.sequential(self.b3)
|
|
x4 = x3.sequential(self.b4)
|
|
x5 = x4.sequential(self.b5)
|
|
return (x2, x3, x5)
|
|
|
|
|
|
# yolo fpn (neck)
|
|
class Yolov8NECK:
|
|
def __init__(self, w, r, d): # width_multiple, ratio_multiple, depth_multiple
|
|
self.up = Upsample(2, mode="nearest")
|
|
self.n1 = C2f(
|
|
c1=int(512 * w * (1 + r)), c2=int(512 * w), n=round(3 * d), shortcut=False
|
|
)
|
|
self.n2 = C2f(c1=int(768 * w), c2=int(256 * w), n=round(3 * d), shortcut=False)
|
|
self.n3 = Conv_Block(
|
|
c1=int(256 * w), c2=int(256 * w), kernel_size=3, stride=2, padding=1
|
|
)
|
|
self.n4 = C2f(c1=int(768 * w), c2=int(512 * w), n=round(3 * d), shortcut=False)
|
|
self.n5 = Conv_Block(
|
|
c1=int(512 * w), c2=int(512 * w), kernel_size=3, stride=2, padding=1
|
|
)
|
|
self.n6 = C2f(
|
|
c1=int(512 * w * (1 + r)),
|
|
c2=int(512 * w * r),
|
|
n=round(3 * d),
|
|
shortcut=False,
|
|
)
|
|
|
|
def return_modules(self):
|
|
return [self.n1, self.n2, self.n3, self.n4, self.n5, self.n6]
|
|
|
|
def __call__(self, p3, p4, p5):
|
|
x = self.n1(self.up(p5).cat(p4, dim=1))
|
|
head_1 = self.n2(self.up(x).cat(p3, dim=1))
|
|
head_2 = self.n4(self.n3(head_1).cat(x, dim=1))
|
|
head_3 = self.n6(self.n5(head_2).cat(p5, dim=1))
|
|
return [head_1, head_2, head_3]
|
|
|
|
|
|
# task specific head.
|
|
class DetectionHead:
|
|
def __init__(self, nc=80, filters=()):
|
|
self.ch = 16
|
|
self.nc = nc # number of classes
|
|
self.nl = len(filters)
|
|
self.no = nc + self.ch * 4 #
|
|
self.stride = [8, 16, 32]
|
|
c1 = max(filters[0], self.nc)
|
|
c2 = max((filters[0] // 4, self.ch * 4))
|
|
self.dfl = DFL(self.ch)
|
|
self.cv3 = [
|
|
[Conv_Block(x, c1, 3), Conv_Block(c1, c1, 3), Conv2d(c1, self.nc, 1)]
|
|
for x in filters
|
|
]
|
|
self.cv2 = [
|
|
[Conv_Block(x, c2, 3), Conv_Block(c2, c2, 3), Conv2d(c2, 4 * self.ch, 1)]
|
|
for x in filters
|
|
]
|
|
|
|
def __call__(self, x):
|
|
for i in range(self.nl):
|
|
x[i] = x[i].sequential(self.cv2[i]).cat(x[i].sequential(self.cv3[i]), dim=1)
|
|
self.anchors, self.strides = (
|
|
x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5)
|
|
)
|
|
y = [(i.reshape(x[0].shape[0], self.no, -1)) for i in x]
|
|
x_cat = y[0].cat(y[1], y[2], dim=2)
|
|
box, cls = x_cat[:, : self.ch * 4], x_cat[:, self.ch * 4 :]
|
|
dbox = (
|
|
dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1)
|
|
* self.strides
|
|
)
|
|
z = dbox.cat(cls.sigmoid(), dim=1)
|
|
return z
|
|
|
|
|
|
class YOLOv8:
|
|
def __init__(
|
|
self, w, r, d, num_classes
|
|
): # width_multiple, ratio_multiple, depth_multiple
|
|
self.net = Darknet(w, r, d)
|
|
self.fpn = Yolov8NECK(w, r, d)
|
|
self.head = DetectionHead(
|
|
num_classes, filters=(int(256 * w), int(512 * w), int(512 * w * r))
|
|
)
|
|
|
|
def __call__(self, x):
|
|
x = self.net(x)
|
|
x = self.fpn(*x)
|
|
return self.head(x)
|
|
|
|
def return_all_trainable_modules(self):
|
|
backbone_modules = [*range(10)]
|
|
yolov8neck_modules = [12, 15, 16, 18, 19, 21]
|
|
yolov8_head_weights = [(22, self.head)]
|
|
return [
|
|
*zip(backbone_modules, self.net.return_modules()),
|
|
*zip(yolov8neck_modules, self.fpn.return_modules()),
|
|
*yolov8_head_weights,
|
|
]
|
|
|
|
|
|
if __name__ == "__main__":
|
|
# usage : python3 yolov8.py "image_URL OR image_path" "v8 variant" (optional, n is default)
|
|
if len(sys.argv) < 2:
|
|
print("Error: Image URL or path not provided.")
|
|
sys.exit(1)
|
|
|
|
img_path = sys.argv[1]
|
|
yolo_variant = (
|
|
sys.argv[2]
|
|
if len(sys.argv) >= 3
|
|
else (
|
|
print(
|
|
"No variant given, so choosing 'n' as the default. Yolov8 has different variants, you can choose from ['n', 's', 'm', 'l', 'x']"
|
|
)
|
|
or "n"
|
|
)
|
|
)
|
|
print(f"running inference for YOLO version {yolo_variant}")
|
|
|
|
output_folder_path = Path("./outputs_yolov8")
|
|
output_folder_path.mkdir(parents=True, exist_ok=True)
|
|
# absolute image path or URL
|
|
image_location = [np.frombuffer(fetch(img_path).read_bytes(), np.uint8)]
|
|
image = [cv2.imdecode(image_location[0], 1)]
|
|
out_paths = [
|
|
(
|
|
output_folder_path / f"{Path(img_path).stem}_output{Path(img_path).suffix}"
|
|
).as_posix()
|
|
]
|
|
if not isinstance(image[0], np.ndarray):
|
|
print("Error in image loading. Check your image file.")
|
|
sys.exit(1)
|
|
pre_processed_image = preprocess(image)
|
|
|
|
# Different YOLOv8 variants use different w , r, and d multiples. For a list , refer to this yaml file (the scales section) https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/v8/yolov8.yaml
|
|
depth, width, ratio = get_variant_multiples(yolo_variant)
|
|
yolo_infer = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
|
|
|
|
state_dict = safe_load(
|
|
fetch(
|
|
f"https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{yolo_variant}.safetensors"
|
|
)
|
|
)
|
|
load_state_dict(yolo_infer, state_dict)
|
|
|
|
st = time.time()
|
|
predictions = yolo_infer(pre_processed_image)
|
|
print(f"did inference in {int(round(((time.time() - st) * 1000)))}ms")
|
|
|
|
post_predictions = postprocess(
|
|
preds=predictions, img=pre_processed_image, orig_imgs=image
|
|
)
|
|
|
|
# v8 and v3 have same 80 class names for Object Detection
|
|
class_labels = (
|
|
fetch(
|
|
"https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names"
|
|
)
|
|
.read_text()
|
|
.split("\n")
|
|
)
|
|
|
|
draw_bounding_boxes_and_save(
|
|
orig_img_paths=image_location,
|
|
output_img_paths=out_paths,
|
|
all_predictions=post_predictions,
|
|
class_labels=class_labels,
|
|
)
|
|
|
|
# TODO for later:
|
|
# 1. Fix SPPF minor difference due to maxpool
|
|
# 2. AST exp overflow warning while on cpu
|
|
# 3. Make NMS faster
|
|
# 4. Add video inference and webcam support
|