/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 * Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 * Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * The name of the copyright holders may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the Intel Corporation or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

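/*
 * Headers assumed by the code below. The original include list is not part of
 * this excerpt, so this is a reconstruction based on the APIs used (FLT_MAX /
 * DBL_MIN, the math calls, AVLFG, AVFifo, the frame queue, and the libavfilter
 * OpenCL helpers); the exact upstream list may differ.
 */
#include <float.h>
#include <math.h>

#include "libavutil/common.h"
#include "libavutil/fifo.h"
#include "libavutil/lfg.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"

#include "avfilter.h"
#include "filters.h"
#include "framequeue.h"
#include "internal.h"
#include "opencl.h"
#include "transform.h"
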
/*
This filter matches feature points between frames (dealing with outliers) and then
uses the matches to estimate an affine transform between frames. This transform is
decomposed into various values (translation, scale, rotation) and the values are
summed relative to the start of the video to obtain an absolute camera position
for each frame. This "camera path" is then smoothed via a gaussian filter, resulting
in a new path that is turned back into an affine transform and applied to each
frame to render it.

High-level overview:

All of the work to extract motion data from frames occurs in queue_frame. Motion data
is buffered in a smoothing window, so queue_frame simply computes the absolute camera
positions and places them in ringbuffers.

filter_frame is responsible for looking at the absolute camera positions currently
in the ringbuffers, applying the gaussian filter, and then transforming the frames.
*/

// Number of bits for BRIEF descriptors
#define BREIFN 512
// Size of the patch from which a BRIEF descriptor is extracted
// This is the size used in OpenCV
#define BRIEF_PATCH_SIZE 31
#define BRIEF_PATCH_SIZE_HALF (BRIEF_PATCH_SIZE / 2)

#define MATCHES_CONTIG_SIZE 2000

#define ROUNDED_UP_DIV(a, b) (((a) + ((b) - 1)) / (b))

typedef struct PointPair {
    // Previous frame
    cl_float2 p1;
    // Current frame
    cl_float2 p2;
} PointPair;

typedef struct MotionVector {
    PointPair p;
    // Used to mark vectors as potential outliers
    cl_int should_consider;
} MotionVector;

// Denotes the indices for the different types of motion in the ringbuffers array
enum RingbufferIndices {
    RingbufX,
    RingbufY,
    RingbufRot,
    RingbufScaleX,
    RingbufScaleY,

    // Should always be last
    RingbufCount
};

// Struct that holds data for drawing point match debug data
typedef struct DebugMatches {
    MotionVector *matches;
    // The points used to calculate the affine transform for a frame
    MotionVector model_matches[3];

    int num_matches;
    // For cases where we couldn't calculate a model
    int num_model_matches;
} DebugMatches;

// Groups together the ringbuffers that store absolute distortion / position values
// for each frame
typedef struct AbsoluteFrameMotion {
    // Array with the various ringbuffers, indexed via the RingbufferIndices enum
    AVFifo *ringbuffers[RingbufCount];

    // Offset to get to the current frame being processed
    // (not in bytes)
    int curr_frame_offset;
    // Keeps track of where the start and end of contiguous motion data is (to
    // deal with cases where no motion data is found between two frames)
    int data_start_offset;
    int data_end_offset;

    AVFifo *debug_matches;
} AbsoluteFrameMotion;

// Takes care of freeing the arrays within the DebugMatches inside of the
// debug_matches ringbuffer and then freeing the buffer itself.
static void free_debug_matches(AbsoluteFrameMotion *afm) {
    DebugMatches dm;

    if (!afm->debug_matches) {
        return;
    }

    while (av_fifo_read(afm->debug_matches, &dm, 1) >= 0) {
        av_freep(&dm.matches);
    }

    av_fifo_freep2(&afm->debug_matches);
}

// Stores the translation, scale, rotation, and skew deltas between two frames
typedef struct FrameDelta {
    cl_float2 translation;
    float rotation;
    cl_float2 scale;
    cl_float2 skew;
} FrameDelta;

typedef struct SimilarityMatrix {
    // The 2x3 similarity matrix
    double matrix[6];
} SimilarityMatrix;

typedef struct CropInfo {
    // The top left corner of the bounding box for the crop
    cl_float2 top_left;
    // The bottom right corner of the bounding box for the crop
    cl_float2 bottom_right;
} CropInfo;

// Returned from function that determines start and end values for iteration
// around the current frame in a ringbuffer
typedef struct IterIndices {
    int start;
    int end;
} IterIndices;

typedef struct DeshakeOpenCLContext {
    OpenCLFilterContext ocf;
    // Whether or not the above `OpenCLFilterContext` has been initialized
    int initialized;

    // These variables are used in the activate callback
    int64_t duration;
    int eof;

    // State for random number generation
    AVLFG alfg;

    // FIFO frame queue used to buffer future frames for processing
    FFFrameQueue fq;
    // Ringbuffers for frame positions
    AbsoluteFrameMotion abs_motion;

    // The number of frames' motion to consider before and after the frame we are
    // smoothing
    int smooth_window;
    // The number of the frame we are currently processing
    int curr_frame;

    // Stores a 1d array of normalised gaussian kernel values for convolution
    float *gauss_kernel;

    // Buffer for error values used in RANSAC code
    float *ransac_err;

    // Information regarding how to crop the smoothed luminance (or RGB) planes
    CropInfo crop_y;
    // Information regarding how to crop the smoothed chroma planes
    CropInfo crop_uv;

    // Whether or not we are processing YUV input (as opposed to RGB)
    int is_yuv;
    // The underlying format of the hardware surfaces
    int sw_format;

    // Buffer to copy `matches` into for the CPU to work with
    MotionVector *matches_host;
    MotionVector *matches_contig_host;

    MotionVector *inliers;

    cl_command_queue command_queue;
    cl_kernel kernel_grayscale;
    cl_kernel kernel_harris_response;
    cl_kernel kernel_refine_features;
    cl_kernel kernel_brief_descriptors;
    cl_kernel kernel_match_descriptors;
    cl_kernel kernel_transform;
    cl_kernel kernel_crop_upscale;

    // Stores a frame converted to grayscale
    cl_mem grayscale;
    // Stores the harris response for a frame (measure of "cornerness" for each pixel)
    cl_mem harris_buf;

    // Detected features after non-maximum suppression and sub-pixel refinement
    cl_mem refined_features;
    // Saved from the previous frame
    cl_mem prev_refined_features;

    // BRIEF sampling pattern that is randomly initialized
    cl_mem brief_pattern;
    // Feature point descriptors for the current frame
    cl_mem descriptors;
    // Feature point descriptors for the previous frame
    cl_mem prev_descriptors;
    // Vectors between points in current and previous frame
    cl_mem matches;
    cl_mem matches_contig;
    // Holds the matrix to transform luminance (or RGB) with
    cl_mem transform_y;
    // Holds the matrix to transform chroma with
    cl_mem transform_uv;

    // Configurable options

    int tripod_mode;
    int debug_on;
    int should_crop;

    // Whether or not feature points should be refined at a sub-pixel level
    cl_int refine_features;
    // If the user sets a value other than the default, 0, this percentage is
    // translated into a sigma value ranging from 0.5 to 40.0
    float smooth_percent;
    // This number is multiplied by the video frame rate to determine the size
    // of the smooth window
    float smooth_window_multiplier;

    // Debug stuff

    cl_kernel kernel_draw_debug_info;
    cl_mem debug_matches;
    cl_mem debug_model_matches;

    // These store the total time spent executing the different kernels in nanoseconds
    unsigned long long grayscale_time;
    unsigned long long harris_response_time;
    unsigned long long refine_features_time;
    unsigned long long brief_descriptors_time;
    unsigned long long match_descriptors_time;
    unsigned long long transform_time;
    unsigned long long crop_upscale_time;

    // Time spent copying matched features from the device to the host
    unsigned long long read_buf_time;
} DeshakeOpenCLContext;

// Returns a random uniformly-distributed number in [low, high)
static int rand_in(int low, int high, AVLFG *alfg) {
    return (av_lfg_get(alfg) % (high - low)) + low;
}

// Returns the average execution time for an event given the total time and the
// number of frames processed, in milliseconds.
static double averaged_event_time_ms(unsigned long long total_time, int num_frames) {
    return (double)total_time / (double)num_frames / 1000000.0;
}

// The following code is loosely ported from OpenCV

// Estimates affine transform from 3 point pairs
// model is a 2x3 matrix:
//      a b c
//      d e f
static void run_estimate_kernel(const MotionVector *point_pairs, double *model)
{
    // src points
    double x1 = point_pairs[0].p.p1.s[0];
    double y1 = point_pairs[0].p.p1.s[1];
    double x2 = point_pairs[1].p.p1.s[0];
    double y2 = point_pairs[1].p.p1.s[1];
    double x3 = point_pairs[2].p.p1.s[0];
    double y3 = point_pairs[2].p.p1.s[1];

    // dest points
    double X1 = point_pairs[0].p.p2.s[0];
    double Y1 = point_pairs[0].p.p2.s[1];
    double X2 = point_pairs[1].p.p2.s[0];
    double Y2 = point_pairs[1].p.p2.s[1];
    double X3 = point_pairs[2].p.p2.s[0];
    double Y3 = point_pairs[2].p.p2.s[1];

    double d = 1.0 / ( x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2) );

    model[0] = d * ( X1*(y2-y3) + X2*(y3-y1) + X3*(y1-y2) );
    model[1] = d * ( X1*(x3-x2) + X2*(x1-x3) + X3*(x2-x1) );
    model[2] = d * ( X1*(x2*y3 - x3*y2) + X2*(x3*y1 - x1*y3) + X3*(x1*y2 - x2*y1) );

    model[3] = d * ( Y1*(y2-y3) + Y2*(y3-y1) + Y3*(y1-y2) );
    model[4] = d * ( Y1*(x3-x2) + Y2*(x1-x3) + Y3*(x2-x1) );
    model[5] = d * ( Y1*(x2*y3 - x3*y2) + Y2*(x3*y1 - x1*y3) + Y3*(x1*y2 - x2*y1) );
}
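
/*
 * Sanity check (illustrative example, not part of the original source): for
 * the three pairs (0,0)->(2,3), (1,0)->(3,3), (0,1)->(2,4) the solver above
 * yields model = [1 0 2, 0 1 3], i.e. identity rotation/scale plus a
 * translation of (2, 3): the denominator reduces to d = 1, and the X/Y sums
 * collapse to the matrix coefficients directly.
 */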
// Checks that the 3 points in the given array are not collinear
static int points_not_collinear(const cl_float2 **points)
{
    int j, k, i = 2;

    for (j = 0; j < i; j++) {
        double dx1 = points[j]->s[0] - points[i]->s[0];
        double dy1 = points[j]->s[1] - points[i]->s[1];

        for (k = 0; k < j; k++) {
            double dx2 = points[k]->s[0] - points[i]->s[0];
            double dy2 = points[k]->s[1] - points[i]->s[1];

            // Assuming a 3840 x 2160 video with a point at (0, 0) and one at
            // (3839, 2159), this prevents a third point from being within roughly
            // 0.5 of a pixel of the line connecting the two on both axes
            if (fabs(dx2*dy1 - dy2*dx1) <= 1.0) {
                return 0;
            }
        }
    }

    return 1;
}
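
/*
 * Worked example (illustrative, not part of the original source): with points
 * (0,0), (100,0) and (200,1), the j = 1, k = 0 iteration computes dx1 = -100,
 * dy1 = -1, dx2 = -200, dy2 = -1, so |dx2*dy1 - dy2*dx1| = |200 - 100| = 100,
 * which exceeds 1.0 and the subset passes; moving the third point onto the
 * line, at (200,0), drives the cross product to 0 and the subset is rejected
 * as collinear.
 */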
// Checks a subset of 3 point pairs to make sure that the points are not collinear
// and not too close to each other
static int check_subset(const MotionVector *pairs_subset)
{
    const cl_float2 *prev_points[] = {
        &pairs_subset[0].p.p1,
        &pairs_subset[1].p.p1,
        &pairs_subset[2].p.p1
    };

    const cl_float2 *curr_points[] = {
        &pairs_subset[0].p.p2,
        &pairs_subset[1].p.p2,
        &pairs_subset[2].p.p2
    };

    return points_not_collinear(prev_points) && points_not_collinear(curr_points);
}

// Selects a random subset of 3 points from point_pairs and places them in pairs_subset
static int get_subset(
    AVLFG *alfg,
    const MotionVector *point_pairs,
    const int num_point_pairs,
    MotionVector *pairs_subset,
    int max_attempts
) {
    int idx[3];
    int i = 0, j, iters = 0;

    for (; iters < max_attempts; iters++) {
        for (i = 0; i < 3 && iters < max_attempts;) {
            int idx_i = 0;

            for (;;) {
                idx_i = idx[i] = rand_in(0, num_point_pairs, alfg);

                for (j = 0; j < i; j++) {
                    if (idx_i == idx[j]) {
                        break;
                    }
                }

                if (j == i) {
                    break;
                }
            }

            pairs_subset[i] = point_pairs[idx[i]];
            i++;
        }

        if (i == 3 && !check_subset(pairs_subset)) {
            continue;
        }
        break;
    }

    return i == 3 && iters < max_attempts;
}

// Computes the error for each of the given points based on the given model.
static void compute_error(
    const MotionVector *point_pairs,
    const int num_point_pairs,
    const double *model,
    float *err
) {
    double F0 = model[0], F1 = model[1], F2 = model[2];
    double F3 = model[3], F4 = model[4], F5 = model[5];

    for (int i = 0; i < num_point_pairs; i++) {
        const cl_float2 *f = &point_pairs[i].p.p1;
        const cl_float2 *t = &point_pairs[i].p.p2;

        double a = F0*f->s[0] + F1*f->s[1] + F2 - t->s[0];
        double b = F3*f->s[0] + F4*f->s[1] + F5 - t->s[1];

        err[i] = (float)(a*a + b*b);
    }
}

// Determines which of the given point matches are inliers for the given model
// based on the specified threshold.
//
// err must be an array of num_point_pairs length
static int find_inliers(
    MotionVector *point_pairs,
    const int num_point_pairs,
    const double *model,
    float *err,
    double thresh
) {
    float t = (float)(thresh * thresh);
    int i, n = num_point_pairs, num_inliers = 0;

    compute_error(point_pairs, num_point_pairs, model, err);

    for (i = 0; i < n; i++) {
        if (err[i] <= t) {
            // This is an inlier
            point_pairs[i].should_consider = 1;
            num_inliers += 1;
        } else {
            point_pairs[i].should_consider = 0;
        }
    }

    return num_inliers;
}

// Determines the number of iterations required to achieve the desired confidence level.
//
// The equation used to determine the number of iterations to do is:
// 1 - confidence = (1 - inlier_probability^num_points)^num_iters
//
// Solving for num_iters:
//
// num_iters = log(1 - confidence) / log(1 - inlier_probability^num_points)
//
// A more in-depth explanation can be found at https://en.wikipedia.org/wiki/Random_sample_consensus
// under the 'Parameters' heading
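//
// Worked example (illustrative): with confidence = 0.99 and an estimated
// outlier fraction of 0.5, the probability that one random 3-point sample is
// all inliers is 0.5^3 = 0.125, so
//
//   num_iters = log(1 - 0.99) / log(1 - 0.125) = -4.605 / -0.1335 ~= 35
//
// i.e. roughly 35 random samples are needed before a clean sample is drawn
// with 99% confidence.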
static int ransac_update_num_iters(double confidence, double num_outliers, int max_iters)
{
    double num, denom;

    confidence = av_clipd(confidence, 0.0, 1.0);
    num_outliers = av_clipd(num_outliers, 0.0, 1.0);

    // avoid inf's & nan's
    num = FFMAX(1.0 - confidence, DBL_MIN);
    denom = 1.0 - pow(1.0 - num_outliers, 3);
    if (denom < DBL_MIN) {
        return 0;
    }

    num = log(num);
    denom = log(denom);

    return denom >= 0 || -num >= max_iters * (-denom) ? max_iters : (int)round(num / denom);
}

// Estimates an affine transform between the given pairs of points using RANdom
// SAmple Consensus
static int estimate_affine_2d(
    DeshakeOpenCLContext *deshake_ctx,
    MotionVector *point_pairs,
    DebugMatches *debug_matches,
    const int num_point_pairs,
    double *model_out,
    const double threshold,
    const int max_iters,
    const double confidence
) {
    int result = 0;
    double best_model[6], model[6];
    MotionVector pairs_subset[3], best_pairs[3];

    int iter, niters = FFMAX(max_iters, 1);
    int good_count, max_good_count = 0;

    // We need at least 3 points to build a model from
    if (num_point_pairs < 3) {
        return 0;
    } else if (num_point_pairs == 3) {
        // There are only 3 points, so RANSAC doesn't apply here
        run_estimate_kernel(point_pairs, model_out);

        for (int i = 0; i < 3; ++i) {
            point_pairs[i].should_consider = 1;
        }

        return 1;
    }

    for (iter = 0; iter < niters; ++iter) {
        int found = get_subset(&deshake_ctx->alfg, point_pairs, num_point_pairs, pairs_subset, 10000);

        if (!found) {
            if (iter == 0) {
                return 0;
            }

            break;
        }

        run_estimate_kernel(pairs_subset, model);
        good_count = find_inliers(point_pairs, num_point_pairs, model, deshake_ctx->ransac_err, threshold);

        if (good_count > FFMAX(max_good_count, 2)) {
            for (int mi = 0; mi < 6; ++mi) {
                best_model[mi] = model[mi];
            }

            for (int pi = 0; pi < 3; pi++) {
                best_pairs[pi] = pairs_subset[pi];
            }

            max_good_count = good_count;
            niters = ransac_update_num_iters(
                confidence,
                (double)(num_point_pairs - good_count) / num_point_pairs,
                niters
            );
        }
    }

    if (max_good_count > 0) {
        for (int mi = 0; mi < 6; ++mi) {
            model_out[mi] = best_model[mi];
        }

        for (int pi = 0; pi < 3; ++pi) {
            debug_matches->model_matches[pi] = best_pairs[pi];
        }
        debug_matches->num_model_matches = 3;

        // Find the inliers again for the best model for debugging
        find_inliers(point_pairs, num_point_pairs, best_model, deshake_ctx->ransac_err, threshold);
        result = 1;
    }

    return result;
}
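
/*
 * Note on the parameters used at the call site (illustrative): queue_frame
 * below invokes this function with threshold = 10.0, so find_inliers compares
 * the squared reprojection error against 10^2 = 100, i.e. a match counts as
 * an inlier when the transformed point lands within 10 pixels of its matched
 * position.
 */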
598 // "Wiggles" the first point in best_pairs around a tiny bit in order to decrease the
599 // total error
604 const int num_inliers,
605 float best_err,
606 double *model_out
607 ) {
608 float move_x_val = 0.01;
609 float move_y_val = 0.01;
610 int move_x = 1;
611 float old_move_x_val = 0;
612 double model[6];
613 int last_changed = 0;
614
615 for (int iters = 0; iters < 200; iters++) {
616 float total_err = 0;
617
618 if (move_x) {
619 best_pairs[0].
p.
p2.s[0] += move_x_val;
620 } else {
621 best_pairs[0].
p.
p2.s[0] += move_y_val;
622 }
623
626
627 for (int j = 0; j < num_inliers; j++) {
629 }
630
631 if (total_err < best_err) {
632 for (
int mi = 0;
mi < 6; ++
mi) {
633 model_out[
mi] = model[
mi];
634 }
635
636 best_err = total_err;
637 last_changed = iters;
638 } else {
639 // Undo the change
640 if (move_x) {
641 best_pairs[0].
p.
p2.s[0] -= move_x_val;
642 } else {
643 best_pairs[0].
p.
p2.s[0] -= move_y_val;
644 }
645
646 if (iters - last_changed > 4) {
647 // We've already improved the model as much as we can
648 break;
649 }
650
651 old_move_x_val = move_x_val;
652
653 if (move_x) {
654 move_x_val *= -1;
655 } else {
656 move_y_val *= -1;
657 }
658
659 if (old_move_x_val < 0) {
660 move_x = 0;
661 } else {
662 move_x = 1;
663 }
664 }
665 }
666 }
667
// Uses a process similar to that of RANSAC to find a transform that minimizes
// the total error for a set of point matches determined to be inliers
//
// (Pick random subsets, compute model, find total error, iterate until error
// is minimized.)
static int minimize_error(
    DeshakeOpenCLContext *deshake_ctx,
    MotionVector *inliers,
    DebugMatches *debug_matches,
    const int num_inliers,
    double *model_out,
    const int max_iters
) {
    int result = 0;
    float best_err = FLT_MAX;
    double best_model[6], model[6];
    MotionVector pairs_subset[3], best_pairs[3];

    for (int i = 0; i < max_iters; i++) {
        float total_err = 0;
        int found = get_subset(&deshake_ctx->alfg, inliers, num_inliers, pairs_subset, 10000);

        if (!found) {
            if (i == 0) {
                return 0;
            }

            break;
        }

        run_estimate_kernel(pairs_subset, model);
        compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);

        for (int j = 0; j < num_inliers; j++) {
            total_err += deshake_ctx->ransac_err[j];
        }

        if (i == 0 || total_err < best_err) {
            for (int mi = 0; mi < 6; ++mi) {
                best_model[mi] = model[mi];
            }

            for (int pi = 0; pi < 3; pi++) {
                best_pairs[pi] = pairs_subset[pi];
            }

            best_err = total_err;
        }
    }

    for (int mi = 0; mi < 6; ++mi) {
        model_out[mi] = best_model[mi];
    }

    for (int pi = 0; pi < 3; ++pi) {
        debug_matches->model_matches[pi] = best_pairs[pi];
    }
    debug_matches->num_model_matches = 3;

    optimize_model(deshake_ctx, best_pairs, inliers, num_inliers, best_err, model_out);
    result = 1;

    return result;
}

// End code from OpenCV

// Decomposes a similarity matrix into translation, rotation, scale, and skew
//
// See http://frederic-wang.fr/decomposition-of-2d-transform-matrices.html
static FrameDelta decompose_transform(double *model)
{
    FrameDelta ret;

    double a = model[0];
    double c = model[1];
    double e = model[2];
    double b = model[3];
    double d = model[4];
    double f = model[5];
    double delta = a * d - b * c;

    memset(&ret, 0, sizeof(ret));

    ret.translation.s[0] = e;
    ret.translation.s[1] = f;

    // This is the QR method
    if (a != 0 || b != 0) {
        double r = hypot(a, b);

        ret.rotation = FFSIGN(b) * acos(a / r);
        ret.scale.s[0] = r;
        ret.scale.s[1] = delta / r;
        ret.skew.s[0] = atan((a * c + b * d) / (r * r));
        ret.skew.s[1] = 0;
    } else if (c != 0 || d != 0) {
        double s = sqrt(c * c + d * d);

        ret.rotation = M_PI / 2 - FFSIGN(d) * acos(-c / s);
        ret.scale.s[0] = delta / s;
        ret.scale.s[1] = s;
        ret.skew.s[0] = 0;
        ret.skew.s[1] = atan((a * c + b * d) / (s * s));
    } // otherwise there is only translation

    return ret;
}
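
/*
 * Worked example (illustrative): a pure 30-degree rotation gives
 * model = [cos30, -sin30, 0, sin30, cos30, 0], so a = 0.866, b = 0.5,
 * r = hypot(a, b) = 1, rotation = FFSIGN(b) * acos(a / r) = pi/6,
 * delta = a*d - b*c = 1, scale = (r, delta / r) = (1, 1), and the skew term
 * atan((a*c + b*d) / (r*r)) = atan(0) = 0, recovering the original angle with
 * unit scale and no skew.
 */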
// Move valid vectors from the 2d buffer into a 1d buffer where they are contiguous
static int make_vectors_contig(
    DeshakeOpenCLContext *deshake_ctx,
    int size_y,
    int size_x
) {
    int num_vectors = 0;

    for (int i = 0; i < size_y; ++i) {
        for (int j = 0; j < size_x; ++j) {
            MotionVector v = deshake_ctx->matches_host[j + i * size_x];

            if (v.should_consider) {
                deshake_ctx->matches_contig_host[num_vectors] = v;
                ++num_vectors;
            }

            // Make sure we do not exceed the amount of space we allocated for these vectors
            if (num_vectors == MATCHES_CONTIG_SIZE - 1) {
                return num_vectors;
            }
        }
    }
    return num_vectors;
}

// Returns the gaussian kernel value for the given x coordinate and sigma value
static float gaussian_for(int x, float sigma) {
    return 1.0f / expf(((float)x * (float)x) / (2.0f * sigma * sigma));
}

// Makes a normalized gaussian kernel of the given length for the given sigma
// and places it in gauss_kernel
static void make_gauss_kernel(float *gauss_kernel, int length, float sigma)
{
    float gauss_sum = 0;
    int window_half = length / 2;

    for (int i = 0; i < length; ++i) {
        float val = gaussian_for(i - window_half, sigma);

        gauss_sum += val;
        gauss_kernel[i] = val;
    }

    // Normalize the gaussian values
    for (int i = 0; i < length; ++i) {
        gauss_kernel[i] /= gauss_sum;
    }
}
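
/*
 * Worked example (illustrative): gaussian_for computes
 * 1 / exp(x^2 / (2 * sigma^2)), which equals exp(-x^2 / (2 * sigma^2)), so
 * for length = 3 and sigma = 1 the raw taps are exp(-0.5), 1, exp(-0.5)
 * (~0.607, 1, 0.607); after normalization the kernel becomes approximately
 * {0.274, 0.452, 0.274}, summing to 1.
 */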
// Returns indices to start and end iteration at in order to iterate over a window
// of length size centered at the current frame in a ringbuffer
//
// Always returns numbers that result in a window of length size, even if that
// means specifying negative indices or indices past the end of the values in the
// ringbuffers. Make sure you clip indices appropriately within your loop.
static IterIndices start_end_for(DeshakeOpenCLContext *deshake_ctx, int length) {
    IterIndices indices;

    indices.start = deshake_ctx->abs_motion.curr_frame_offset - (length / 2);
    indices.end   = deshake_ctx->abs_motion.curr_frame_offset + (length / 2) + (length % 2);

    return indices;
}
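
/*
 * Worked example (illustrative, assuming the start/end expressions above):
 * with curr_frame_offset = 10 and length = 7, start = 10 - 3 = 7 and
 * end = 10 + 3 + 1 = 14, so iterating with
 * `for (int i = indices.start; i < indices.end; i++)` visits offsets 7..13,
 * a 7-value window centered on offset 10.
 */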
// Sets val to the value in the given ringbuffer at the given offset, taking care of
// clipping the offset into the appropriate range
static void ringbuf_float_at(
    DeshakeOpenCLContext *deshake_ctx,
    AVFifo *values,
    float *val,
    int offset
) {
    int clip_start, clip_end, offset_clipped;
    if (deshake_ctx->abs_motion.data_end_offset != -1) {
        clip_end = deshake_ctx->abs_motion.data_end_offset;
    } else {
        // This expression represents the last valid index in the buffer,
        // which we use repeatedly at the end of the video.
        clip_end = deshake_ctx->smooth_window - av_fifo_can_write(values) - 1;
    }

    if (deshake_ctx->abs_motion.data_start_offset != -1) {
        clip_start = deshake_ctx->abs_motion.data_start_offset;
    } else {
        // Negative indices will occur at the start of the video, and we want
        // them to be clipped to 0 in order to repeatedly use the position of
        // the first frame.
        clip_start = 0;
    }

    offset_clipped = av_clip(
        offset,
        clip_start,
        clip_end
    );

    av_fifo_peek(values, val, 1, offset_clipped);
}

// Returns smoothed current frame value of the given buffer of floats based on the
// given Gaussian kernel and its length (also the window length, centered around the
// current frame) and the "maximum value" of the motion.
//
// This "maximum value" should be the width / height of the image in the case of
// translation and an empirically chosen constant for rotation / scale.
//
// The sigma used to generate the final gaussian kernel that smooths the camera
// path is either derived from a user setting (deshake_ctx->smooth_percent) or
// adaptively chosen.
static float smooth(
    DeshakeOpenCLContext *deshake_ctx,
    float *gauss_kernel,
    int length,
    float max_val,
    AVFifo *values
) {
    float new_large_s = 0, new_small_s = 0, new_best = 0, old, diff_between,
          percent_of_max, inverted_percent;
    IterIndices indices = start_end_for(deshake_ctx, length);
    float large_sigma = 40.0f;
    float small_sigma = 2.0f;
    float best_sigma;

    if (deshake_ctx->smooth_percent) {
        best_sigma = (large_sigma - 0.5f) * deshake_ctx->smooth_percent + 0.5f;
    } else {
        // Strategy to adaptively smooth trajectory:
        //
        // 1. Smooth path with large and small sigma values
        // 2. Take the absolute value of the difference between them
        // 3. Get a percentage by putting the difference over the "max value"
        // 4. Invert the percentage
        // 5. Calculate a new sigma value weighted towards the larger sigma value
        // 6. Determine final smoothed trajectory value using that sigma

        make_gauss_kernel(gauss_kernel, length, large_sigma);
        for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
            ringbuf_float_at(deshake_ctx, values, &old, i);
            new_large_s += old * gauss_kernel[j];
        }

        make_gauss_kernel(gauss_kernel, length, small_sigma);
        for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
            ringbuf_float_at(deshake_ctx, values, &old, i);
            new_small_s += old * gauss_kernel[j];
        }

        diff_between = fabsf(new_large_s - new_small_s);
        percent_of_max = diff_between / max_val;
        inverted_percent = 1 - percent_of_max;
        best_sigma = large_sigma * powf(inverted_percent, 40);
    }

    make_gauss_kernel(gauss_kernel, length, best_sigma);
    for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
        ringbuf_float_at(deshake_ctx, values, &old, i);
        new_best += old * gauss_kernel[j];
    }

    return new_best;
}
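
/*
 * Worked example of the adaptive path (illustrative): if the large- and
 * small-sigma estimates differ by 5% of max_val, then inverted_percent = 0.95
 * and best_sigma = 40 * 0.95^40 ~= 40 * 0.129 ~= 5.1, so even modest
 * disagreement between the two estimates pulls the final sigma well below the
 * large value of 40, preserving deliberate camera motion while still
 * smoothing jitter.
 */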
// Returns the position of the given point after the transform is applied
static cl_float2 transformed_point(float x, float y, float *transform) {
    cl_float2 ret;

    ret.s[0] = x * transform[0] + y * transform[1] + transform[2];
    ret.s[1] = x * transform[3] + y * transform[4] + transform[5];

    return ret;
}

// Creates an affine transform that scales from the center of a frame
static void transform_center_scale(
    float x_shift,
    float y_shift,
    float angle,
    float scale_x,
    float scale_y,
    float center_w,
    float center_h,
    float *matrix
) {
    cl_float2 center_s;
    float center_s_w, center_s_h;

    ff_get_matrix(
        0,
        0,
        0,
        scale_x,
        scale_y,
        matrix
    );

    center_s = transformed_point(center_w, center_h, matrix);
    center_s_w = center_w - center_s.s[0];
    center_s_h = center_h - center_s.s[1];

    ff_get_matrix(
        x_shift + center_s_w,
        y_shift + center_s_h,
        angle,
        scale_x,
        scale_y,
        matrix
    );
}

// Determines the crop necessary to eliminate black borders from a smoothed frame
// and updates target crop accordingly
static void update_needed_crop(
    CropInfo *crop,
    float *transform,
    float frame_width,
    float frame_height
) {
    float new_width, new_height, adjusted_width, adjusted_height, adjusted_x, adjusted_y;

    cl_float2 top_left = transformed_point(0, 0, transform);
    cl_float2 top_right = transformed_point(frame_width, 0, transform);
    cl_float2 bottom_left = transformed_point(0, frame_height, transform);
    cl_float2 bottom_right = transformed_point(frame_width, frame_height, transform);
    float ar_h = frame_height / frame_width;
    float ar_w = frame_width / frame_height;

    if (crop->bottom_right.s[0] == 0) {
        // The crop hasn't been set to the original size of the plane
        crop->bottom_right.s[0] = frame_width;
        crop->bottom_right.s[1] = frame_height;
    }

    crop->top_left.s[0] = FFMAX3(
        crop->top_left.s[0],
        top_left.s[0],
        bottom_left.s[0]
    );

    crop->top_left.s[1] = FFMAX3(
        crop->top_left.s[1],
        top_left.s[1],
        top_right.s[1]
    );

    crop->bottom_right.s[0] = FFMIN3(
        crop->bottom_right.s[0],
        bottom_right.s[0],
        top_right.s[0]
    );

    crop->bottom_right.s[1] = FFMIN3(
        crop->bottom_right.s[1],
        bottom_right.s[1],
        bottom_left.s[1]
    );

    // Make sure our potentially new bounding box has the same aspect ratio
    new_height = crop->bottom_right.s[1] - crop->top_left.s[1];
    new_width = crop->bottom_right.s[0] - crop->top_left.s[0];

    adjusted_width = new_height * ar_w;
    adjusted_x = crop->bottom_right.s[0] - adjusted_width;

    if (adjusted_x >= crop->top_left.s[0]) {
        crop->top_left.s[0] = adjusted_x;
    } else {
        adjusted_height = new_width * ar_h;
        adjusted_y = crop->bottom_right.s[1] - adjusted_height;
        crop->top_left.s[1] = adjusted_y;
    }
}

static av_cold void deshake_opencl_uninit(AVFilterContext *avctx)
{
    DeshakeOpenCLContext *ctx = avctx->priv;
    cl_int cle;

    for (int i = 0; i < RingbufCount; i++)
        av_fifo_freep2(&ctx->abs_motion.ringbuffers[i]);

    if (ctx->debug_on)
        free_debug_matches(&ctx->abs_motion);

    if (ctx->gauss_kernel)
        av_freep(&ctx->gauss_kernel);

    if (ctx->ransac_err)
        av_freep(&ctx->ransac_err);

    if (ctx->matches_host)
        av_freep(&ctx->matches_host);

    if (ctx->matches_contig_host)
        av_freep(&ctx->matches_contig_host);

    if (ctx->inliers)
        av_freep(&ctx->inliers);

    ff_framequeue_free(&ctx->fq);

    CL_RELEASE_KERNEL(ctx->kernel_grayscale);
    CL_RELEASE_KERNEL(ctx->kernel_harris_response);
    CL_RELEASE_KERNEL(ctx->kernel_refine_features);
    CL_RELEASE_KERNEL(ctx->kernel_brief_descriptors);
    CL_RELEASE_KERNEL(ctx->kernel_match_descriptors);
    CL_RELEASE_KERNEL(ctx->kernel_transform);
    CL_RELEASE_KERNEL(ctx->kernel_crop_upscale);
    if (ctx->debug_on)
        CL_RELEASE_KERNEL(ctx->kernel_draw_debug_info);

    CL_RELEASE_QUEUE(ctx->command_queue);

    CL_RELEASE_MEMORY(ctx->grayscale);
    CL_RELEASE_MEMORY(ctx->harris_buf);
    CL_RELEASE_MEMORY(ctx->refined_features);
    CL_RELEASE_MEMORY(ctx->prev_refined_features);
    CL_RELEASE_MEMORY(ctx->brief_pattern);
    CL_RELEASE_MEMORY(ctx->descriptors);
    CL_RELEASE_MEMORY(ctx->prev_descriptors);
    CL_RELEASE_MEMORY(ctx->matches);
    CL_RELEASE_MEMORY(ctx->matches_contig);
    CL_RELEASE_MEMORY(ctx->transform_y);
    CL_RELEASE_MEMORY(ctx->transform_uv);
    if (ctx->debug_on) {
        CL_RELEASE_MEMORY(ctx->debug_matches);
        CL_RELEASE_MEMORY(ctx->debug_model_matches);
    }

    ff_opencl_filter_uninit(avctx);
}

static int deshake_opencl_init(AVFilterContext *avctx)
{
    DeshakeOpenCLContext *ctx = avctx->priv;
    AVFilterLink *outlink = avctx->outputs[0];
    AVFilterLink *inlink = avctx->inputs[0];
    // Pointer to the host-side pattern buffer to be initialized and then copied
    // to the GPU
    PointPair *pattern_host = NULL;
    cl_int cle;
    int err;
    cl_ulong8 zeroed_ulong8;
    FFFrameQueueGlobal fqg;
    cl_image_format grayscale_format;
    cl_image_desc grayscale_desc;
    cl_command_queue_properties queue_props;

    const enum AVPixelFormat disallowed_formats[14] = {
        // (the list of 14 unsupported RGB-family formats is elided in this excerpt)
    };

    // Number of elements for an array
    const int image_grid_32 = ROUNDED_UP_DIV(outlink->h, 32) * ROUNDED_UP_DIV(outlink->w, 32);

    const int descriptor_buf_size = image_grid_32 * (BREIFN / 8);
    const int features_buf_size = image_grid_32 * sizeof(cl_float2);

    const AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext *)inlink->hw_frames_ctx->data;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hw_frames_ctx->sw_format);

    av_lfg_init(&ctx->alfg, 234342534);

    ff_framequeue_global_init(&fqg);
    ff_framequeue_init(&ctx->fq, &fqg);
    ctx->eof = 0;
    ctx->smooth_window = (int)(av_q2d(avctx->inputs[0]->frame_rate) * ctx->smooth_window_multiplier);
    ctx->curr_frame = 0;

    memset(&zeroed_ulong8, 0, sizeof(cl_ulong8));

    ctx->gauss_kernel = av_malloc_array(ctx->smooth_window, sizeof(float));
    if (!ctx->gauss_kernel) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->ransac_err = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(float));
    if (!ctx->ransac_err) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    for (int i = 0; i < RingbufCount; i++) {
        ctx->abs_motion.ringbuffers[i] = av_fifo_alloc2(ctx->smooth_window,
                                                        sizeof(float), 0);

        if (!ctx->abs_motion.ringbuffers[i]) {
            err = AVERROR(ENOMEM);
            goto fail;
        }
    }

    if (ctx->debug_on) {
        ctx->abs_motion.debug_matches = av_fifo_alloc2(
            ctx->smooth_window / 2,
            sizeof(DebugMatches), 0
        );

        if (!ctx->abs_motion.debug_matches) {
            err = AVERROR(ENOMEM);
            goto fail;
        }
    }

    ctx->abs_motion.curr_frame_offset = 0;
    ctx->abs_motion.data_start_offset = -1;
    ctx->abs_motion.data_end_offset = -1;

    pattern_host = av_malloc_array(BREIFN, sizeof(PointPair));
    if (!pattern_host) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->matches_host = av_malloc_array(image_grid_32, sizeof(MotionVector));
    if (!ctx->matches_host) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->matches_contig_host = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
    if (!ctx->matches_contig_host) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->inliers = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
    if (!ctx->inliers) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    // Initializing the patch pattern for building BRIEF descriptors with
    // uniformly distributed random numbers
    for (int i = 0; i < BREIFN; ++i) {
        PointPair pair;

        for (int j = 0; j < 2; ++j) {
            pair.p1.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
            pair.p2.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
        }

        pattern_host[i] = pair;
    }

    for (int i = 0; i < 14; i++) {
        if (ctx->sw_format == disallowed_formats[i]) {
            err = AVERROR(ENOSYS);
            goto fail;
        }
    }

    if (desc->flags & AV_PIX_FMT_FLAG_RGB) {
        ctx->is_yuv = 0;
    } else {
        ctx->is_yuv = 1;
    }
    ctx->sw_format = hw_frames_ctx->sw_format;

    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_deshake, 1);
    if (err < 0)
        goto fail;

    if (ctx->debug_on) {
        queue_props = CL_QUEUE_PROFILING_ENABLE;
    } else {
        queue_props = 0;
    }
    ctx->command_queue = clCreateCommandQueue(
        ctx->ocf.hwctx->context,
        ctx->ocf.hwctx->device_id,
        queue_props,
        &cle
    );
    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL command queue %d.\n", cle);

    CL_CREATE_KERNEL(ctx, grayscale);
    CL_CREATE_KERNEL(ctx, harris_response);
    CL_CREATE_KERNEL(ctx, refine_features);
    CL_CREATE_KERNEL(ctx, brief_descriptors);
    CL_CREATE_KERNEL(ctx, match_descriptors);
    CL_CREATE_KERNEL(ctx, transform);
    CL_CREATE_KERNEL(ctx, crop_upscale);
    if (ctx->debug_on)
        CL_CREATE_KERNEL(ctx, draw_debug_info);

    grayscale_format.image_channel_order = CL_R;
    grayscale_format.image_channel_data_type = CL_FLOAT;

    grayscale_desc = (cl_image_desc) {
        .image_type = CL_MEM_OBJECT_IMAGE2D,
        .image_width = outlink->w,
        .image_height = outlink->h,
        .image_depth = 0,
        .image_array_size = 0,
        .image_row_pitch = 0,
        .image_slice_pitch = 0,
        .num_mip_levels = 0,
        .num_samples = 0,
        .buffer = NULL,
    };

    ctx->grayscale = clCreateImage(
        ctx->ocf.hwctx->context,
        0,
        &grayscale_format,
        &grayscale_desc,
        NULL,
        &cle
    );
    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create grayscale image: %d.\n", cle);

    CL_CREATE_BUFFER(ctx, harris_buf, outlink->h * outlink->w * sizeof(float));
    CL_CREATE_BUFFER(ctx, refined_features, features_buf_size);
    CL_CREATE_BUFFER(ctx, prev_refined_features, features_buf_size);
    CL_CREATE_BUFFER_FLAGS(
        ctx,
        brief_pattern,
        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        BREIFN * sizeof(PointPair),
        pattern_host
    );
    CL_CREATE_BUFFER(ctx, descriptors, descriptor_buf_size);
    CL_CREATE_BUFFER(ctx, prev_descriptors, descriptor_buf_size);
    CL_CREATE_BUFFER(ctx, matches, image_grid_32 * sizeof(MotionVector));
    CL_CREATE_BUFFER(ctx, matches_contig, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
    CL_CREATE_BUFFER(ctx, transform_y, 9 * sizeof(float));
    CL_CREATE_BUFFER(ctx, transform_uv, 9 * sizeof(float));
    if (ctx->debug_on) {
        CL_CREATE_BUFFER(ctx, debug_matches, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
        CL_CREATE_BUFFER(ctx, debug_model_matches, 3 * sizeof(MotionVector));
    }

    ctx->initialized = 1;
    av_freep(&pattern_host);

    return 0;

fail:
    av_freep(&pattern_host);
    return err;
}

// Logs debug information about the transform data
static void transform_debug(AVFilterContext *avctx, float *new_vals, float *old_vals, int curr_frame) {
    av_log(avctx, AV_LOG_VERBOSE,
        "Frame %d:\n"
        "\tframe moved from: %f x, %f y\n"
        "\t              to: %f x, %f y\n"
        "\t    rotated from: %f degrees\n"
        "\t              to: %f degrees\n"
        "\t     scaled from: %f x, %f y\n"
        "\t              to: %f x, %f y\n"
        "\n"
        "\tframe moved by: %f x, %f y\n"
        "\t    rotated by: %f degrees\n"
        "\t     scaled by: %f x, %f y\n",
        curr_frame,
        old_vals[RingbufX], old_vals[RingbufY],
        new_vals[RingbufX], new_vals[RingbufY],
        old_vals[RingbufRot] * (180.0 / M_PI),
        new_vals[RingbufRot] * (180.0 / M_PI),
        old_vals[RingbufScaleX], old_vals[RingbufScaleY],
        new_vals[RingbufScaleX], new_vals[RingbufScaleY],
        old_vals[RingbufX] - new_vals[RingbufX],
        old_vals[RingbufY] - new_vals[RingbufY],
        (old_vals[RingbufRot] - new_vals[RingbufRot]) * (180.0 / M_PI),
        new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
        new_vals[RingbufScaleY] / old_vals[RingbufScaleY]
    );
}

// Uses the buffered motion information to determine a transform that smooths the
// given frame and applies it
static int filter_frame(AVFilterLink *link, AVFrame *input_frame)
{
    AVFilterContext *avctx = link->dst;
    AVFilterLink *outlink = avctx->outputs[0];
    DeshakeOpenCLContext *deshake_ctx = avctx->priv;
    AVFrame *cropped_frame = NULL, *transformed_frame = NULL;
    int err;
    cl_int cle;
    float new_vals[RingbufCount];
    float old_vals[RingbufCount];
    // Luma (in the case of YUV) transform, or just the transform in the case of RGB
    float transform_y[9];
    // Chroma transform
    float transform_uv[9];
    // Luma crop transform (or RGB)
    float transform_crop_y[9];
    // Chroma crop transform
    float transform_crop_uv[9];
    float transform_debug_rgb[9];
    size_t global_work[2];
    int64_t duration;
    cl_mem src, transformed, dst;
    cl_mem transforms[3];
    CropInfo crops[3];
    cl_event transform_event, crop_upscale_event;
    DebugMatches debug_matches;
    cl_int num_model_matches;

    const float center_w = (float)input_frame->width / 2;
    const float center_h = (float)input_frame->height / 2;

    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(deshake_ctx->sw_format);
    const int chroma_width  = AV_CEIL_RSHIFT(input_frame->width, desc->log2_chroma_w);
    const int chroma_height = AV_CEIL_RSHIFT(input_frame->height, desc->log2_chroma_h);

    const float center_w_chroma = (float)chroma_width / 2;
    const float center_h_chroma = (float)chroma_height / 2;

    const float luma_w_over_chroma_w = ((float)input_frame->width / (float)chroma_width);
    const float luma_h_over_chroma_h = ((float)input_frame->height / (float)chroma_height);

    if (deshake_ctx->debug_on) {
        av_fifo_read(
            deshake_ctx->abs_motion.debug_matches,
            &debug_matches, 1);
    }

    if (input_frame->duration) {
        duration = input_frame->duration;
    } else {
        duration = av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base);
    }
    deshake_ctx->duration = input_frame->pts + duration;

    // Get the absolute transform data for this frame
    for (int i = 0; i < RingbufCount; i++) {
        av_fifo_peek(deshake_ctx->abs_motion.ringbuffers[i],
                     &old_vals[i], 1,
                     deshake_ctx->abs_motion.curr_frame_offset);
    }

    if (deshake_ctx->tripod_mode) {
        // If tripod mode is turned on we simply undo all motion relative to the
        // first frame

        new_vals[RingbufX] = 0.0f;
        new_vals[RingbufY] = 0.0f;
        new_vals[RingbufRot] = 0.0f;
        new_vals[RingbufScaleX] = 1.0f;
        new_vals[RingbufScaleY] = 1.0f;
    } else {
        // Tripod mode is off and we need to smooth a moving camera

        new_vals[RingbufX] = smooth(
            deshake_ctx,
            deshake_ctx->gauss_kernel,
            deshake_ctx->smooth_window,
            input_frame->width,
            deshake_ctx->abs_motion.ringbuffers[RingbufX]
        );
        new_vals[RingbufY] = smooth(
            deshake_ctx,
            deshake_ctx->gauss_kernel,
            deshake_ctx->smooth_window,
            input_frame->height,
            deshake_ctx->abs_motion.ringbuffers[RingbufY]
        );
        new_vals[RingbufRot] = smooth(
            deshake_ctx,
            deshake_ctx->gauss_kernel,
            deshake_ctx->smooth_window,
            M_PI / 4,
            deshake_ctx->abs_motion.ringbuffers[RingbufRot]
        );
        new_vals[RingbufScaleX] = smooth(
            deshake_ctx,
            deshake_ctx->gauss_kernel,
            deshake_ctx->smooth_window,
            2.0f,
            deshake_ctx->abs_motion.ringbuffers[RingbufScaleX]
        );
        new_vals[RingbufScaleY] = smooth(
            deshake_ctx,
            deshake_ctx->gauss_kernel,
            deshake_ctx->smooth_window,
            2.0f,
            deshake_ctx->abs_motion.ringbuffers[RingbufScaleY]
        );
    }

    transform_center_scale(
        old_vals[RingbufX] - new_vals[RingbufX],
        old_vals[RingbufY] - new_vals[RingbufY],
        old_vals[RingbufRot] - new_vals[RingbufRot],
        new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
        new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
        center_w,
        center_h,
        transform_y
    );

    transform_center_scale(
        (old_vals[RingbufX] - new_vals[RingbufX]) / luma_w_over_chroma_w,
        (old_vals[RingbufY] - new_vals[RingbufY]) / luma_h_over_chroma_h,
        old_vals[RingbufRot] - new_vals[RingbufRot],
        new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
        new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
        center_w_chroma,
        center_h_chroma,
        transform_uv
    );

    transforms[0] = deshake_ctx->transform_y;
    transforms[1] = transforms[2] = deshake_ctx->transform_uv;

    CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_y, NULL);
    CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_uv, 9 * sizeof(float), transform_uv, NULL);

    cropped_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!cropped_frame) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    transformed_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!transformed_frame) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    for (int p = 0; p < FF_ARRAY_ELEMS(transformed_frame->data); p++) {
        // Transform all of the planes appropriately
        src = (cl_mem)input_frame->data[p];
        transformed = (cl_mem)transformed_frame->data[p];

        if (!transformed)
            break;

        err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
        if (err < 0)
            goto fail;

        CL_RUN_KERNEL_WITH_ARGS(
            deshake_ctx->command_queue,
            deshake_ctx->kernel_transform,
            global_work,
            NULL,
            &transform_event,
            { sizeof(cl_mem), &src },
            { sizeof(cl_mem), &transformed },
            { sizeof(cl_mem), &transforms[p] },
        );
    }

    if (deshake_ctx->debug_on && !deshake_ctx->is_yuv && debug_matches.num_matches > 0) {
        CL_BLOCKING_WRITE_BUFFER(
            deshake_ctx->command_queue,
            deshake_ctx->debug_matches,
            debug_matches.num_matches * sizeof(MotionVector),
            debug_matches.matches,
            NULL
        );

        CL_BLOCKING_WRITE_BUFFER(
            deshake_ctx->command_queue,
            deshake_ctx->debug_model_matches,
            debug_matches.num_model_matches * sizeof(MotionVector),
            debug_matches.model_matches,
            NULL
        );

        num_model_matches = debug_matches.num_model_matches;

        // Invert the transform
        transform_center_scale(
            new_vals[RingbufX] - old_vals[RingbufX],
            new_vals[RingbufY] - old_vals[RingbufY],
            new_vals[RingbufRot] - old_vals[RingbufRot],
            old_vals[RingbufScaleX] / new_vals[RingbufScaleX],
            old_vals[RingbufScaleY] / new_vals[RingbufScaleY],
            center_w,
            center_h,
            transform_debug_rgb
        );

        CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_debug_rgb, NULL);

        transformed = (cl_mem)transformed_frame->data[0];
        CL_RUN_KERNEL_WITH_ARGS(
            deshake_ctx->command_queue,
            deshake_ctx->kernel_draw_debug_info,
            global_work,
            NULL,
            NULL,
            { sizeof(cl_mem), &transformed },
            { sizeof(cl_mem), &deshake_ctx->debug_matches },
            { sizeof(cl_mem), &deshake_ctx->debug_model_matches },
            { sizeof(cl_int), &num_model_matches },
            { sizeof(cl_mem), &deshake_ctx->transform_y },
        );
    }

    if (deshake_ctx->should_crop) {
        // Generate transforms for cropping
        transform_center_scale(
            (old_vals[RingbufX] - new_vals[RingbufX]) / 5,
            (old_vals[RingbufY] - new_vals[RingbufY]) / 5,
            (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
            new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
            new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
            center_w,
            center_h,
            transform_crop_y
        );
        update_needed_crop(&deshake_ctx->crop_y, transform_crop_y,
                           input_frame->width, input_frame->height);

        transform_center_scale(
            (old_vals[RingbufX] - new_vals[RingbufX]) / (5 * luma_w_over_chroma_w),
            (old_vals[RingbufY] - new_vals[RingbufY]) / (5 * luma_h_over_chroma_h),
            (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
            new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
            new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
            center_w_chroma,
            center_h_chroma,
            transform_crop_uv
        );
        update_needed_crop(&deshake_ctx->crop_uv, transform_crop_uv,
                           chroma_width, chroma_height);

        crops[0] = deshake_ctx->crop_y;
        crops[1] = crops[2] = deshake_ctx->crop_uv;

        for (int p = 0; p < FF_ARRAY_ELEMS(cropped_frame->data); p++) {
            // Crop all of the planes appropriately
            dst = (cl_mem)cropped_frame->data[p];
            transformed = (cl_mem)transformed_frame->data[p];

            if (!dst)
                break;

            err = ff_opencl_filter_work_size_from_image(avctx, global_work, cropped_frame, p, 0);
            if (err < 0)
                goto fail;

            CL_RUN_KERNEL_WITH_ARGS(
                deshake_ctx->command_queue,
                deshake_ctx->kernel_crop_upscale,
                global_work,
                NULL,
                &crop_upscale_event,
                { sizeof(cl_mem), &transformed },
                { sizeof(cl_mem), &dst },
                { sizeof(cl_float2), &crops[p].top_left },
                { sizeof(cl_float2), &crops[p].bottom_right },
            );
        }
    }

    if (deshake_ctx->curr_frame < deshake_ctx->smooth_window / 2) {
        // This means we are somewhere at the start of the video. We need to
        // increment the current frame offset until it reaches the center of
        // the ringbuffers (as the current frame will be located there for
        // the rest of the video).
        //
        // The end of the video is taken care of by draining motion data
        // one-by-one out of the buffer, causing the (at that point fixed)
        // offset to move towards later frames' data.
        ++deshake_ctx->abs_motion.curr_frame_offset;
    }

    if (deshake_ctx->abs_motion.data_end_offset != -1) {
        // Keep the end offset in sync with the frame it's supposed to be
        // positioned at
        --deshake_ctx->abs_motion.data_end_offset;

        if (deshake_ctx->abs_motion.data_end_offset == deshake_ctx->abs_motion.curr_frame_offset - 1) {
            // The end offset would be the start of the new video sequence; flip to
            // start offset
            deshake_ctx->abs_motion.data_end_offset = -1;
            deshake_ctx->abs_motion.data_start_offset = deshake_ctx->abs_motion.curr_frame_offset;
        }
    } else if (deshake_ctx->abs_motion.data_start_offset != -1) {
        // Keep the start offset in sync with the frame it's supposed to be
        // positioned at
        --deshake_ctx->abs_motion.data_start_offset;
    }

    if (deshake_ctx->debug_on) {
        transform_debug(avctx, new_vals, old_vals, deshake_ctx->curr_frame);
    }

    if (deshake_ctx->should_crop) {
        err = av_frame_copy_props(cropped_frame, input_frame);
        if (err < 0)
            goto fail;

        av_frame_free(&transformed_frame);
        av_frame_free(&input_frame);

        return ff_filter_frame(outlink, cropped_frame);

    } else {
        err = av_frame_copy_props(transformed_frame, input_frame);
        if (err < 0)
            goto fail;

        av_frame_free(&cropped_frame);
        av_frame_free(&input_frame);

        return ff_filter_frame(outlink, transformed_frame);
    }

fail:
    clFinish(deshake_ctx->command_queue);

    av_frame_free(&transformed_frame);
    av_frame_free(&cropped_frame);
    av_frame_free(&input_frame);

    return err;
}

// Add the given frame to the frame queue to eventually be processed.
//
// Also determines the motion from the previous frame and updates the stored
// motion information accordingly.
static int queue_frame(AVFilterLink *link, AVFrame *input_frame)
{
    AVFilterContext *avctx = link->dst;
    DeshakeOpenCLContext *deshake_ctx = avctx->priv;
    int err;
    int num_vectors;
    int num_inliers = 0;
    cl_int cle;
    FrameDelta relative;
    SimilarityMatrix model;
    size_t global_work[2];
    size_t harris_global_work[2];
    size_t grid_32_global_work[2];
    int grid_32_h, grid_32_w;
    size_t local_work[2];
    cl_mem src, temp;
    float prev_vals[5];
    float new_vals[5];
    cl_event grayscale_event, harris_response_event, refine_features_event,
             brief_event, match_descriptors_event, read_buf_event;
    DebugMatches debug_matches;

    num_vectors = 0;

    local_work[0] = 8;
    local_work[1] = 8;

    err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, 0, 0);
    if (err < 0)
        goto fail;

    err = ff_opencl_filter_work_size_from_image(avctx, harris_global_work, input_frame, 0, 8);
    if (err < 0)
        goto fail;

    err = ff_opencl_filter_work_size_from_image(avctx, grid_32_global_work, input_frame, 0, 32);
    if (err < 0)
        goto fail;

    // We want a single work-item for each 32x32 block of pixels in the input frame
    grid_32_global_work[0] /= 32;
    grid_32_global_work[1] /= 32;

    grid_32_h = ROUNDED_UP_DIV(input_frame->height, 32);
    grid_32_w = ROUNDED_UP_DIV(input_frame->width, 32);

    if (deshake_ctx->is_yuv) {
        deshake_ctx->grayscale = (cl_mem)input_frame->data[0];
    } else {
        src = (cl_mem)input_frame->data[0];

        CL_RUN_KERNEL_WITH_ARGS(
            deshake_ctx->command_queue,
            deshake_ctx->kernel_grayscale,
            global_work,
            NULL,
            &grayscale_event,
            { sizeof(cl_mem), &src },
            { sizeof(cl_mem), &deshake_ctx->grayscale }
        );
    }

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_harris_response,
        harris_global_work,
        local_work,
        &harris_response_event,
        { sizeof(cl_mem), &deshake_ctx->grayscale },
        { sizeof(cl_mem), &deshake_ctx->harris_buf }
    );

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_refine_features,
        grid_32_global_work,
        NULL,
        &refine_features_event,
        { sizeof(cl_mem), &deshake_ctx->grayscale },
        { sizeof(cl_mem), &deshake_ctx->harris_buf },
        { sizeof(cl_mem), &deshake_ctx->refined_features },
        { sizeof(cl_int), &deshake_ctx->refine_features }
    );

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_brief_descriptors,
        grid_32_global_work,
        NULL,
        &brief_event,
        { sizeof(cl_mem), &deshake_ctx->grayscale },
        { sizeof(cl_mem), &deshake_ctx->refined_features },
        { sizeof(cl_mem), &deshake_ctx->descriptors },
        { sizeof(cl_mem), &deshake_ctx->brief_pattern}
    );

    if (!deshake_ctx->curr_frame) {
        // This is the first frame we've been given to queue, meaning there is
        // no previous frame to match descriptors to

        goto no_motion_data;
    }

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_match_descriptors,
        grid_32_global_work,
        NULL,
        &match_descriptors_event,
        { sizeof(cl_mem), &deshake_ctx->prev_refined_features },
        { sizeof(cl_mem), &deshake_ctx->refined_features },
        { sizeof(cl_mem), &deshake_ctx->descriptors },
        { sizeof(cl_mem), &deshake_ctx->prev_descriptors },
        { sizeof(cl_mem), &deshake_ctx->matches }
    );

    cle = clEnqueueReadBuffer(
        deshake_ctx->command_queue,
        deshake_ctx->matches,
        CL_TRUE,
        0,
        grid_32_h * grid_32_w * sizeof(MotionVector),
        deshake_ctx->matches_host,
        0,
        NULL,
        &read_buf_event
    );
    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read matches buffer: %d.\n", cle);

    num_vectors = make_vectors_contig(deshake_ctx, grid_32_h, grid_32_w);

    if (num_vectors < 10) {
        // Not enough matches to get reliable motion data for this frame
        //
        // From this point on all data is relative to this frame rather than the
        // original frame. We have to make sure that we don't mix values that were
        // relative to the original frame with the new values relative to this
        // frame when doing the gaussian smoothing. We keep track of where the old
        // values end using this data_end_offset field in order to accomplish
        // that goal.
        //
        // If no motion data is present for multiple frames in a short window of
        // time, we leave the end where it was to avoid mixing 0s in with the
        // old data (and just treat them all as part of the new values)
        if (deshake_ctx->abs_motion.data_end_offset == -1) {
            deshake_ctx->abs_motion.data_end_offset =
                av_fifo_can_read(deshake_ctx->abs_motion.ringbuffers[RingbufX]) - 1;
        }

        goto no_motion_data;
    }

    if (!estimate_affine_2d(
        deshake_ctx,
        deshake_ctx->matches_contig_host,
        &debug_matches,
        num_vectors,
        model.matrix,
        10.0,
        3000,
        0.999999999999
    )) {
        goto no_motion_data;
    }

    for (int i = 0; i < num_vectors; i++) {
        if (deshake_ctx->matches_contig_host[i].should_consider) {
            deshake_ctx->inliers[num_inliers] = deshake_ctx->matches_contig_host[i];
            num_inliers++;
        }
    }

    if (!minimize_error(
        deshake_ctx,
        deshake_ctx->inliers,
        &debug_matches,
        num_inliers,
        model.matrix,
        400
    )) {
        goto no_motion_data;
    }


    relative = decompose_transform(model.matrix);

    // Get the absolute transform data for the previous frame
    for (int i = 0; i < RingbufCount; i++) {
        av_fifo_peek(
            deshake_ctx->abs_motion.ringbuffers[i],
            &prev_vals[i], 1,
            av_fifo_can_read(deshake_ctx->abs_motion.ringbuffers[i]) - 1);
    }

    new_vals[RingbufX]      = prev_vals[RingbufX] + relative.translation.s[0];
    new_vals[RingbufY]      = prev_vals[RingbufY] + relative.translation.s[1];
    new_vals[RingbufRot]    = prev_vals[RingbufRot] + relative.rotation;
    new_vals[RingbufScaleX] = prev_vals[RingbufScaleX] / relative.scale.s[0];
    new_vals[RingbufScaleY] = prev_vals[RingbufScaleY] / relative.scale.s[1];

    if (deshake_ctx->debug_on) {
        if (!deshake_ctx->is_yuv) {
            // (grayscale kernel timing accumulation elided in this excerpt)
        }
        // (remaining kernel and read-buffer timing accumulation elided in this excerpt)
    }

    goto end;

no_motion_data:
    new_vals[RingbufX]      = 0.0f;
    new_vals[RingbufY]      = 0.0f;
    new_vals[RingbufRot]    = 0.0f;
    new_vals[RingbufScaleX] = 1.0f;
    new_vals[RingbufScaleY] = 1.0f;

    for (int i = 0; i < num_vectors; i++) {
        deshake_ctx->matches_contig_host[i].should_consider = 0;
    }
    debug_matches.num_model_matches = 0;

    if (deshake_ctx->debug_on) {
        av_log(avctx, AV_LOG_VERBOSE,
            "\n[ALERT] No motion data found in queue_frame, motion reset to 0\n\n"
        );
    }

    goto end;

end:
    // Swap the descriptor buffers (we don't need the previous frame's descriptors
    // again so we will use that space for the next frame's descriptors)
    temp = deshake_ctx->prev_descriptors;
    deshake_ctx->prev_descriptors = deshake_ctx->descriptors;
    deshake_ctx->descriptors = temp;

    // Same for the refined features
    temp = deshake_ctx->prev_refined_features;
    deshake_ctx->prev_refined_features = deshake_ctx->refined_features;
    deshake_ctx->refined_features = temp;

    if (deshake_ctx->debug_on) {
        if (num_vectors == 0) {
            debug_matches.matches = NULL;
        } else {
            debug_matches.matches = av_malloc_array(num_vectors, sizeof(MotionVector));

            if (!debug_matches.matches) {
                err = AVERROR(ENOMEM);
                goto fail;
            }
        }

        for (int i = 0; i < num_vectors; i++) {
            debug_matches.matches[i] = deshake_ctx->matches_contig_host[i];
        }
        debug_matches.num_matches = num_vectors;

        av_fifo_write(
            deshake_ctx->abs_motion.debug_matches,
            &debug_matches, 1);
    }

    for (int i = 0; i < RingbufCount; i++) {
        av_fifo_write(deshake_ctx->abs_motion.ringbuffers[i], &new_vals[i], 1);
    }

    return ff_framequeue_add(&deshake_ctx->fq, input_frame);

fail:
    clFinish(deshake_ctx->command_queue);
    av_frame_free(&input_frame);
    return err;
}

static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    DeshakeOpenCLContext *deshake_ctx = ctx->priv;
    AVFrame *frame = NULL;
    int ret, status;
    int64_t pts;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    if (!deshake_ctx->eof) {
        ret = ff_inlink_consume_frame(inlink, &frame);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            if (!frame->hw_frames_ctx)
                return AVERROR(EINVAL);

            if (!deshake_ctx->initialized) {
                ret = deshake_opencl_init(ctx);
                if (ret < 0)
                    return ret;
            }

            // If there is no more space in the ringbuffers, remove the oldest
            // values to make room for the new ones
            if (av_fifo_can_write(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) {
                for (int i = 0; i < RingbufCount; i++) {
                    av_fifo_drain2(deshake_ctx->abs_motion.ringbuffers[i], 1);
                }
            }
            ret = queue_frame(inlink, frame);
            if (ret < 0)
                return ret;
            if (ret >= 0) {
                // See if we have enough buffered frames to process one
                //
                // "enough" is half the smooth window of queued frames into the future
                if (ff_framequeue_queued_frames(&deshake_ctx->fq) >= deshake_ctx->smooth_window / 2) {
                    return filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
                }
            }
        }
    }

    if (!deshake_ctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            deshake_ctx->eof = 1;
        }
    }

    if (deshake_ctx->eof) {
        // Finish processing the rest of the frames in the queue.
        while (ff_framequeue_queued_frames(&deshake_ctx->fq) != 0) {
            for (int i = 0; i < RingbufCount; i++) {
                av_fifo_drain2(deshake_ctx->abs_motion.ringbuffers[i], 1);
            }

            ret = filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
            if (ret < 0) {
                return ret;
            }
        }

        if (deshake_ctx->debug_on) {
            av_log(ctx, AV_LOG_VERBOSE,
                "Average kernel execution times:\n"
                "\t        grayscale: %0.3f ms\n"
                "\t  harris_response: %0.3f ms\n"
                "\t  refine_features: %0.3f ms\n"
                "\tbrief_descriptors: %0.3f ms\n"
                "\tmatch_descriptors: %0.3f ms\n"
                "\t        transform: %0.3f ms\n"
                "\t     crop_upscale: %0.3f ms\n"
                "Average buffer read times:\n"
                "\t     features buf: %0.3f ms\n",
                averaged_event_time_ms(deshake_ctx->grayscale_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->harris_response_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->refine_features_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->brief_descriptors_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->match_descriptors_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->transform_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->crop_upscale_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->read_buf_time, deshake_ctx->curr_frame)
            );
        }

        ff_outlink_set_status(outlink, AVERROR_EOF, deshake_ctx->duration);
        return 0;
    }

    if (!deshake_ctx->eof) {
        FF_FILTER_FORWARD_WANTED(outlink, inlink);
    }

    return FFERROR_NOT_READY;
}

static const AVFilterPad deshake_opencl_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = &ff_opencl_filter_config_input,
    },
};

static const AVFilterPad deshake_opencl_outputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = &ff_opencl_filter_config_output,
    },
};

#define OFFSET(x) offsetof(DeshakeOpenCLContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM

static const AVOption deshake_opencl_options[] = {
    {
        "tripod", "simulates a tripod by preventing any camera movement whatsoever "
        "from the original frame",
        OFFSET(tripod_mode), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
    },
    {
        "debug", "turn on additional debugging information",
        OFFSET(debug_on), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
    },
    {
        "adaptive_crop", "attempt to subtly crop borders to reduce mirrored content",
        OFFSET(should_crop), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
    },
    {
        "refine_features", "refine feature point locations at a sub-pixel level",
        OFFSET(refine_features), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
    },
    {
        "smooth_strength", "smoothing strength (0 attempts to adaptively determine optimal strength)",
        OFFSET(smooth_percent), AV_OPT_TYPE_FLOAT, {.dbl = 0.0}, 0.0, 1.0, FLAGS
    },
    {
        "smooth_window_multiplier", "multiplier for number of frames to buffer for motion data",
        OFFSET(smooth_window_multiplier), AV_OPT_TYPE_FLOAT, {.dbl = 2.0}, 0.1, 10.0, FLAGS
    },
    { NULL }
};

AVFILTER_DEFINE_CLASS(deshake_opencl);

const FFFilter ff_vf_deshake_opencl = {
    .p.name         = "deshake_opencl",
    .p.description  = NULL_IF_CONFIG_SMALL("Feature-point based video stabilization filter"),
    .p.priv_class   = &deshake_opencl_class,
    .p.flags        = AVFILTER_FLAG_HWDEVICE,
    .priv_size      = sizeof(DeshakeOpenCLContext),
    .init           = &ff_opencl_filter_init,
    .uninit         = &deshake_opencl_uninit,
    .activate       = activate,
    FILTER_INPUTS(deshake_opencl_inputs),
    FILTER_OUTPUTS(deshake_opencl_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_OPENCL),
};
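
/*
 * Example usage (illustrative; exact device and format arguments depend on
 * the local OpenCL setup):
 *
 *   ffmpeg -init_hw_device opencl=gpu:0.0 -filter_hw_device gpu -i in.mp4 \
 *       -vf 'format=nv12,hwupload,deshake_opencl=tripod=0:debug=0,hwdownload,format=nv12' \
 *       out.mp4
 */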