import numpy as np
import cv2 as cv
import argparse
import warnings
import os
class DaSiamRPNTracker:
    # Initializes the tracker: stores the hyper-parameters, generates the
    # anchor grid, runs the template (exemplar) branch once and bakes the
    # resulting correlation kernels into the main network.
    def __init__(self, im, target_pos, target_sz, net, kernel_r1, kernel_cls1):
        self.windowing = "cosine"
        self.exemplar_size = 127
        self.instance_size = 271
        self.total_stride = 8
        self.score_size = (self.instance_size -
                           self.exemplar_size) // self.total_stride + 1
        self.context_amount = 0.5
        self.ratios = [0.33, 0.5, 1, 2, 3]
        self.scales = [8, ]
        self.anchor_num = len(self.ratios) * len(self.scales)
        self.penalty_k = 0.055
        self.window_influence = 0.42
        self.lr = 0.295
        self.im_h = im.shape[0]
        self.im_w = im.shape[1]
        self.target_pos = target_pos
        self.target_sz = target_sz
        self.avg_chans = np.mean(im, axis=(0, 1))
        self.net = net
        self.score = []

        if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
            warnings.warn(
                "Initializing with a bounding box of that size may reduce prediction accuracy!",
                category=None, stacklevel=1, source=None)

        self.anchor = self.__generate_anchor()

        wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
        hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
        s_z = round(np.sqrt(wc_z * hc_z))

        z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
        z_crop = z_crop.transpose(2, 0, 1).reshape(
            1, 3, self.exemplar_size, self.exemplar_size).astype(np.float32)
        self.net.setInput(z_crop)
        z_f = self.net.forward('63')
        # Compute the template kernels and write them into the weights of
        # the corresponding layers of the main network.
        kernel_r1.setInput(z_f)
        r1 = kernel_r1.forward()
        kernel_cls1.setInput(z_f)
        cls1 = kernel_cls1.forward()
        r1 = r1.reshape(20, 256, 4, 4)
        cls1 = cls1.reshape(10, 256, 4, 4)
        self.net.setParam(self.net.getLayerId('65'), 0, r1)
        self.net.setParam(self.net.getLayerId('68'), 0, cls1)

        if self.windowing == "cosine":
            self.window = np.outer(np.hanning(
                self.score_size), np.hanning(self.score_size))
        elif self.windowing == "uniform":
            self.window = np.ones((self.score_size, self.score_size))
        self.window = np.tile(self.window.flatten(), self.anchor_num)
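    # Worked example of the shapes above (illustrative comment, not
    # executed): with instance_size = 271, exemplar_size = 127 and
    # total_stride = 8, score_size = (271 - 127) // 8 + 1 = 19, so the
    # cosine window is a 19x19 map tiled over anchor_num = 5 anchors,
    # giving 5 * 19 * 19 = 1805 window weights.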
    # Builds the (anchor_num * score_size^2, 4) grid of anchor boxes
    # [cx, cy, w, h] centered on the score map.
    def __generate_anchor(self):
        self.anchor = np.zeros((self.anchor_num, 4), dtype=np.float32)
        size = self.total_stride * self.total_stride
        count = 0
        for ratio in self.ratios:
            ws = int(np.sqrt(size / ratio))
            hs = int(ws * ratio)
            for scale in self.scales:
                wws = ws * scale
                hhs = hs * scale
                self.anchor[count] = [0, 0, wws, hhs]
                count += 1
        score_sz = int(self.score_size)
        self.anchor = np.tile(self.anchor, score_sz *
                              score_sz).reshape((-1, 4))
        # Integer division keeps the anchor grid symmetric around the
        # center of the search region (-72..72 for a 19x19 map, stride 8).
        ori = - (score_sz // 2) * self.total_stride
        xx, yy = np.meshgrid([ori + self.total_stride * dx for dx in range(score_sz)],
                             [ori + self.total_stride * dy for dy in range(score_sz)])
        xx, yy = np.tile(xx.flatten(), (self.anchor_num, 1)).flatten(), np.tile(
            yy.flatten(), (self.anchor_num, 1)).flatten()
        self.anchor[:, 0], self.anchor[:, 1] = xx.astype(
            np.float32), yy.astype(np.float32)
        return self.anchor
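    # Worked example (comment only): size = 8 * 8 = 64, so for
    # ratio = 0.33 and scale = 8 the base anchor is
    # ws = int(sqrt(64 / 0.33)) = 13, hs = int(13 * 0.33) = 4,
    # i.e. a 104x32 box; the five ratios yield five base anchors,
    # each replicated over the 19x19 score grid.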
    # Tracks the object in a new frame: crops the search region around the
    # previous position, evaluates the network and clips the updated state
    # to the image boundaries.
    def track(self, im):
        wc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
        hc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
        s_z = np.sqrt(wc_z * hc_z)
        scale_z = self.exemplar_size / s_z
        d_search = (self.instance_size - self.exemplar_size) / 2
        pad = d_search / scale_z
        s_x = round(s_z + 2 * pad)

        x_crop = self.__get_subwindow_tracking(im, self.instance_size, s_x)
        x_crop = x_crop.transpose(2, 0, 1).reshape(
            1, 3, self.instance_size, self.instance_size).astype(np.float32)
        self.score = self.__tracker_eval(x_crop, scale_z)
        self.target_pos[0] = max(0, min(self.im_w, self.target_pos[0]))
        self.target_pos[1] = max(0, min(self.im_h, self.target_pos[1]))
        self.target_sz[0] = max(10, min(self.im_w, self.target_sz[0]))
        self.target_sz[1] = max(10, min(self.im_h, self.target_sz[1]))
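    # Worked example (comment only): for target_sz = (100, 50),
    # wc_z = 50 + 0.5 * 150 = 125, hc_z = 100 + 0.5 * 150 = 175,
    # s_z = sqrt(125 * 175) ~ 147.9, scale_z = 127 / 147.9 ~ 0.86,
    # pad = 72 / 0.86 ~ 83.8 and s_x = round(147.9 + 2 * 83.8) = 316:
    # a 316x316 region is cropped and resized to 271x271.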
    # Runs the network on the search crop and converts the raw regression
    # and classification outputs into a new target state.
    def __tracker_eval(self, x_crop, scale_z):
        target_size = self.target_sz * scale_z
        self.net.setInput(x_crop)
        # '66' is the box-regression output, '68' the classification output.
        outNames = ['66', '68']
        delta, score = self.net.forward(outNames)
        delta = np.transpose(delta, (1, 2, 3, 0))
        delta = np.ascontiguousarray(delta, dtype=np.float32)
        delta = np.reshape(delta, (4, -1))
        score = np.transpose(score, (1, 2, 3, 0))
        score = np.ascontiguousarray(score, dtype=np.float32)
        score = np.reshape(score, (2, -1))
        score = self.__softmax(score)[1, :]

        # Decode the regression deltas relative to the anchors.
        delta[0, :] = delta[0, :] * self.anchor[:, 2] + self.anchor[:, 0]
        delta[1, :] = delta[1, :] * self.anchor[:, 3] + self.anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * self.anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * self.anchor[:, 3]

        def __change(r):
            return np.maximum(r, 1. / r)

        def __sz(w, h):
            pad = (w + h) * 0.5
            sz2 = (w + pad) * (h + pad)
            return np.sqrt(sz2)

        def __sz_wh(wh):
            pad = (wh[0] + wh[1]) * 0.5
            sz2 = (wh[0] + pad) * (wh[1] + pad)
            return np.sqrt(sz2)

        # Penalize large changes in scale (s_c) and aspect ratio (r_c),
        # then mix in the cosine window to suppress large displacements.
        s_c = __change(__sz(delta[2, :], delta[3, :]) / (__sz_wh(target_size)))
        r_c = __change(
            (target_size[0] / target_size[1]) / (delta[2, :] / delta[3, :]))
        penalty = np.exp(-(r_c * s_c - 1.) * self.penalty_k)
        pscore = penalty * score
        pscore = pscore * (1 - self.window_influence) + \
            self.window * self.window_influence
        best_pscore_id = np.argmax(pscore)
        target = delta[:, best_pscore_id] / scale_z
        target_size /= scale_z
        lr = penalty[best_pscore_id] * score[best_pscore_id] * self.lr

        res_x = target[0] + self.target_pos[0]
        res_y = target[1] + self.target_pos[1]
        res_w = target_size[0] * (1 - lr) + target[2] * lr
        res_h = target_size[1] * (1 - lr) + target[3] * lr
        self.target_pos = np.array([res_x, res_y])
        self.target_sz = np.array([res_w, res_h])
        return score[best_pscore_id]
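    # Note (comment only): the size update is a linear interpolation, e.g.
    # with lr = 0.3, old width 100 and predicted width 120 the new width is
    # 100 * 0.7 + 120 * 0.3 = 106, which damps jittery size estimates.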
    # Numerically stable softmax over the first axis.
    def __softmax(self, x):
        x_max = x.max(0)
        e_x = np.exp(x - x_max)
        y = e_x / e_x.sum(axis=0)
        return y
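    # Example (comment only): for a column [0, ln(3)] the softmax gives
    # [1/4, 3/4]; subtracting the column maximum first only guards against
    # overflow and does not change the result.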
    # Crops a square patch of side original_sz centered on the current
    # target position, padding out-of-image areas with the mean channel
    # values, and resizes it to model_size x model_size.
    def __get_subwindow_tracking(self, im, model_size, original_sz):
        im_sz = im.shape
        center = (original_sz + 1) / 2
        context_xmin = round(self.target_pos[0] - center)
        context_xmax = context_xmin + original_sz - 1
        context_ymin = round(self.target_pos[1] - center)
        context_ymax = context_ymin + original_sz - 1
        left_pad = int(max(0., -context_xmin))
        top_pad = int(max(0., -context_ymin))
        right_pad = int(max(0., context_xmax - im_sz[1] + 1))
        bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))
        context_xmin += left_pad
        context_xmax += left_pad
        context_ymin += top_pad
        context_ymax += top_pad

        r, c, k = im.shape
        if any([top_pad, bottom_pad, left_pad, right_pad]):
            # Build a padded canvas filled with the average color and paste
            # the frame into it before cropping.
            te_im = np.zeros((r + top_pad + bottom_pad, c +
                              left_pad + right_pad, k), np.uint8)
            te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
            if top_pad:
                te_im[0:top_pad, left_pad:left_pad + c, :] = self.avg_chans
            if bottom_pad:
                te_im[r + top_pad:, left_pad:left_pad + c, :] = self.avg_chans
            if left_pad:
                te_im[:, 0:left_pad, :] = self.avg_chans
            if right_pad:
                te_im[:, c + left_pad:, :] = self.avg_chans
            im_patch_original = te_im[int(context_ymin):int(
                context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
        else:
            im_patch_original = im[int(context_ymin):int(
                context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
        if not np.array_equal(model_size, original_sz):
            im_patch_original = cv.resize(
                im_patch_original, (model_size, model_size))
        return im_patch_original
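# Minimal usage sketch (comment only; assumes the three ONNX models are
# available locally, file names follow the defaults of main() below, and
# x, y, w, h come from a first-frame annotation):
#   net = cv.dnn.readNet("dasiamrpn_model.onnx")
#   kernel_r1 = cv.dnn.readNet("dasiamrpn_kernel_r1.onnx")
#   kernel_cls1 = cv.dnn.readNet("dasiamrpn_kernel_cls1.onnx")
#   tracker = DaSiamRPNTracker(first_frame, np.array([x, y]),
#                              np.array([w, h]), net, kernel_r1, kernel_cls1)
#   for frame in next_frames:
#       tracker.track(frame)
#       print(tracker.target_pos, tracker.target_sz)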
# Computes the intersection-over-union of two (xmin, ymin, w, h) boxes.
def get_iou(new, gt):
    new_xmin, new_ymin, new_w, new_h = new
    gt_xmin, gt_ymin, gt_w, gt_h = gt

    def get_max_coord(coord, size): return coord + size - 1.0
    new_xmax, new_ymax = get_max_coord(new_xmin, new_w), get_max_coord(
        new_ymin, new_h)
    gt_xmax, gt_ymax = get_max_coord(gt_xmin, gt_w), get_max_coord(
        gt_ymin, gt_h)
    dx = max(0, min(new_xmax, gt_xmax) - max(new_xmin, gt_xmin))
    dy = max(0, min(new_ymax, gt_ymax) - max(new_ymin, gt_ymin))
    area_of_overlap = dx * dy
    area_of_union = (new_xmax - new_xmin) * (new_ymax - new_ymin) + (
        gt_xmax - gt_xmin) * (gt_ymax - gt_ymin) - area_of_overlap
    iou = area_of_overlap / area_of_union if area_of_union != 0 else 0
    return iou
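# Worked example (comment only): for new = (0, 0, 10, 10) and
# gt = (5, 5, 10, 10), the boxes span [0, 9] and [5, 14] on each axis,
# so dx = dy = 9 - 5 = 4, overlap = 16, union = 81 + 81 - 16 = 146 and
# get_iou returns 16 / 146 ~ 0.11.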
# Computes the (optionally size-normalized) distance between box centers,
# used for the precision and normalized-precision metrics.
def get_pr(new, gt, is_norm):
    new_x, new_y, new_w, new_h = new
    gt_x, gt_y, gt_w, gt_h = gt

    def get_center(coord, size): return coord + (size + 1.0) / 2
    new_cx, new_cy, gt_cx, gt_cy = get_center(new_x, new_w), get_center(
        new_y, new_h), get_center(gt_x, gt_w), get_center(gt_y, gt_h)
    dx = new_cx - gt_cx
    dy = new_cy - gt_cy
    if is_norm:
        dx /= gt_w
        dy /= gt_h
    return np.sqrt(dx ** 2 + dy ** 2)
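# Worked example (comment only): for new = (12, 14, 10, 10) and
# gt = (10, 10, 10, 10), the center offsets are dx = 2 and dy = 4, so
# get_pr(..., is_norm=False) returns sqrt(4 + 16) ~ 4.47 pixels, while
# is_norm=True divides by the ground-truth size first and returns
# sqrt(0.04 + 0.16) ~ 0.45.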
def main():
    parser = argparse.ArgumentParser(
        description="Run LaSOT-based benchmark for DaSiamRPN tracker")
    parser.add_argument("--net", type=str, default="dasiamrpn_model.onnx",
                        help="Full path to onnx model of net")
    parser.add_argument("--kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx",
                        help="Full path to onnx model of kernel_r1")
    parser.add_argument("--kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx",
                        help="Full path to onnx model of kernel_cls1")
    parser.add_argument("--dataset", type=str,
                        help="Full path to LaSOT folder")
    parser.add_argument("--v", dest="visualization", action='store_true',
                        help="Show the tracking process")
    args = parser.parse_args()

    trackers = ["DaSiamRPN"]
    cx, cy, w, h = 0.0, 0.0, 0, 0
    net = cv.dnn.readNet(args.net)
    kernel_r1 = cv.dnn.readNet(args.kernel_r1)
    kernel_cls1 = cv.dnn.readNet(args.kernel_cls1)

    video_names = os.path.join(args.dataset, "testing_set.txt")
    with open(video_names, 'rt') as f:
        list_of_videos = f.read().rstrip('\n').split('\n')

    iou_avg = []
    pr_avg = []
    n_pr_avg = []

    for tracker_name in trackers:
        print("Tracker name: ", tracker_name)
        number_of_thresholds = 21
        iou_video = np.zeros(number_of_thresholds)
        pr_video = np.zeros(number_of_thresholds)
        n_pr_video = np.zeros(number_of_thresholds)
        iou_thr = np.linspace(0, 1, number_of_thresholds)
        pr_thr = np.linspace(0, 50, number_of_thresholds)
        n_pr_thr = np.linspace(0, 0.5, number_of_thresholds)

        for video_name in list_of_videos:
            init_once = False
            print("\tVideo name: " + str(video_name))
            gt_file = open(os.path.join(args.dataset, video_name,
                                        "groundtruth.txt"), "r")
            gt_bb = gt_file.readline().rstrip("\n").split(",")
            init_bb = tuple([float(b) for b in gt_bb])
            video_sequence = sorted(os.listdir(os.path.join(
                args.dataset, video_name, "img")))
            iou_values = []
            pr_values = []
            n_pr_values = []
            frame_counter = len(video_sequence)

            for number_of_the_frame, image in enumerate(video_sequence):
                frame = cv.imread(os.path.join(
                    args.dataset, video_name, "img", image))
                gt_bb = tuple([float(x) for x in gt_bb])
                # Skip frames where the object is absent (zero-sized box).
                if gt_bb[2] == 0 or gt_bb[3] == 0:
                    gt_bb = gt_file.readline().rstrip("\n").split(",")
                    frame_counter -= 1
                    continue
                # Re-initialize the tracker from ground truth every 250 frames.
                if ((number_of_the_frame + 1) % 250 == 0):
                    init_once = False
                    init_bb = gt_bb
                if not init_once:
                    target_pos, target_sz = np.array(
                        [init_bb[0], init_bb[1]]), np.array(
                        [init_bb[2], init_bb[3]])
                    tracker = DaSiamRPNTracker(
                        frame, target_pos, target_sz, net, kernel_r1, kernel_cls1)
                    init_once = True
                tracker.track(frame)
                w, h = tracker.target_sz
                cx, cy = tracker.target_pos
                new_bb = (cx, cy, w, h)

                if args.visualization:
                    new_x, new_y, new_w, new_h = list(map(int, new_bb))
                    cv.rectangle(frame, (new_x, new_y), ((
                        new_x + new_w), (new_y + new_h)), (200, 0, 0))
                    cv.imshow("Tracking", frame)
                    cv.waitKey(1)

                iou_values.append(get_iou(new_bb, gt_bb))
                pr_values.append(get_pr(new_bb, gt_bb, is_norm=False))
                n_pr_values.append(get_pr(new_bb, gt_bb, is_norm=True))
                gt_bb = gt_file.readline().rstrip("\n").split(",")
            gt_file.close()

            # Per-video success / precision curves: the fraction of frames
            # whose metric passes each threshold.
            iou_video += np.fromiter((sum(
                i >= thr for i in iou_values) / frame_counter
                for thr in iou_thr), dtype=float)
            pr_video += np.fromiter((sum(
                i <= thr for i in pr_values) / frame_counter
                for thr in pr_thr), dtype=float)
            n_pr_video += np.fromiter((sum(
                i <= thr for i in n_pr_values) / frame_counter
                for thr in n_pr_thr), dtype=float)

        iou_mean_avg = np.array(iou_video) / len(list_of_videos)
        pr_mean_avg = np.array(pr_video) / len(list_of_videos)
        n_pr_mean_avg = np.array(n_pr_video) / len(list_of_videos)
        # The normalized area under each averaged curve is the final score.
        iou = np.trapz(iou_mean_avg, x=iou_thr) / iou_thr[-1]
        pr = np.trapz(pr_mean_avg, x=pr_thr) / pr_thr[-1]
        n_pr = np.trapz(n_pr_mean_avg, x=n_pr_thr) / n_pr_thr[-1]
        iou_avg.append('%.4f' % iou)
        pr_avg.append('%.4f' % pr)
        n_pr_avg.append('%.4f' % n_pr)

    titles = ["Names:", "IoU:", "Precision:", "N.Precision:"]
    data = [titles] + list(zip(trackers, iou_avg, pr_avg, n_pr_avg))
    for number, for_tracker in enumerate(data):
        line = '|'.join(str(x).ljust(20) for x in for_tracker)
        print(line)
        if number == 0:
            print('-' * len(line))


if __name__ == "__main__":
    main()
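Example invocation (a sketch; `dasiamrpn_benchmark.py` is an assumed file name for the script above, and the three ONNX models are expected in the working directory by default):

    python3 dasiamrpn_benchmark.py --dataset /path/to/LaSOT --v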
Added a Python 3 script with a benchmark for trackers; it evaluates IoU, precision and normalized precision on the LaSOT testing set.
LaSOT paper: https://arxiv.org/abs/1809.07845
TrackingNet paper: https://arxiv.org/abs/1803.10794
TrackingNet repo: https://github.com/SilvioGiancola/TrackingNet-devkit/blob/master/metrics.py
A particular re-initialization rate (measured in frames) was used for each tracker; in the script above, DaSiamRPN is re-initialized from ground truth every 250 frames.
UPD 07.07.2020:
Current values for the LaSOT dataset (testing part) on Ubuntu 18.04:
GOTURN still has some memory issues; the issue has been reported.
DaSiamRPN results for the LaSOT dataset (testing part) on Ubuntu 18.04:
Version of the benchmark for DaSiamRPN:
UPD 29.07.2020:
Results for GOTURN:
Links to fixes for GOTURN will be provided here soon.
UPD 14.08.2020:
Link to PR with fixes for GOTURN tracker.
Table with all results:
UPD 07.09.2020:
The pull request with the GOTURN fixes has been merged together with the test.