1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 *
18 * Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
19 * Copyright (C) 2009, Willow Garage Inc., all rights reserved.
20 * Copyright (C) 2013, OpenCV Foundation, all rights reserved.
21 * Third party copyrights are property of their respective owners.
22 *
23 * Redistribution and use in source and binary forms, with or without modification,
24 * are permitted provided that the following conditions are met:
25 *
26 * * Redistribution's of source code must retain the above copyright notice,
27 * this list of conditions and the following disclaimer.
28 *
29 * * Redistribution's in binary form must reproduce the above copyright notice,
30 * this list of conditions and the following disclaimer in the documentation
31 * and/or other materials provided with the distribution.
32 *
33 * * The name of the copyright holders may not be used to endorse or promote products
34 * derived from this software without specific prior written permission.
35 *
36 * This software is provided by the copyright holders and contributors "as is" and
37 * any express or implied warranties, including, but not limited to, the implied
38 * warranties of merchantability and fitness for a particular purpose are disclaimed.
39 * In no event shall the Intel Corporation or contributors be liable for any direct,
40 * indirect, incidental, special, exemplary, or consequential damages
41 * (including, but not limited to, procurement of substitute goods or services;
42 * loss of use, data, or profits; or business interruption) however caused
43 * and on any theory of liability, whether in contract, strict liability,
44 * or tort (including negligence or otherwise) arising in any way out of
45 * the use of this software, even if advised of the possibility of such damage.
46 */
47
66
67 /*
68 This filter matches feature points between frames (dealing with outliers) and then
69 uses the matches to estimate an affine transform between frames. This transform is
70 decomposed into various values (translation, scale, rotation) and the values are
71 summed relative to the start of the video to obtain an absolute camera position
72 for each frame. This "camera path" is then smoothed via a gaussian filter, resulting
73 in a new path that is turned back into an affine transform and applied to each
74 frame to render it.
75
76 High-level overview:
77
78 All of the work to extract motion data from frames occurs in queue_frame. Motion data
79 is buffered in a smoothing window, so queue_frame simply computes the absolute camera
80 positions and places them in ringbuffers.
81
82 filter_frame is responsible for looking at the absolute camera positions currently
83 in the ringbuffers, applying the gaussian filter, and then transforming the frames.
84 */
85
86 // Number of bits for BRIEF descriptors
88 // Size of the patch from which a BRIEF descriptor is extracted
89 // This is the size used in OpenCV
90 #define BRIEF_PATCH_SIZE 31
91 #define BRIEF_PATCH_SIZE_HALF (BRIEF_PATCH_SIZE / 2)
92
93 #define MATCHES_CONTIG_SIZE 2000
94
// Integer ceiling division: number of size-b chunks needed to hold a items.
// Both arguments are fully parenthesized so that expression arguments
// (e.g. ROUNDED_UP_DIV(x + 1, n * 2)) expand correctly.
#define ROUNDED_UP_DIV(a, b) (((a) + ((b) - 1)) / (b))
96
98 // Previous frame
100 // Current frame
103
106 // Used to mark vectors as potential outliers
109
110 // Denotes the indices for the different types of motion in the ringbuffers array
117
118 // Should always be last
120 };
121
122 // Struct that holds data for drawing point match debug data
125 // The points used to calculate the affine transform for a frame
127
129 // For cases where we couldn't calculate a model
132
133 // Groups together the ringbuffers that store absolute distortion / position values
134 // for each frame
136 // Array with the various ringbuffers, indexed via the RingbufferIndices enum
138
139 // Offset to get to the current frame being processed
140 // (not in bytes)
142 // Keeps track of where the start and end of contiguous motion data is (to
143 // deal with cases where no motion data is found between two frames)
146
149
150 // Takes care of freeing the arrays within the DebugMatches inside of the
151 // debug_matches ringbuffer and then freeing the buffer itself.
154
156 return;
157 }
158
161
163 }
164
165 // Stores the translation, scale, rotation, and skew deltas between two frames
172
174 // The 2x3 similarity matrix
177
179 // The top left corner of the bounding box for the crop
181 // The bottom right corner of the bounding box for the crop
184
185 // Returned from function that determines start and end values for iteration
186 // around the current frame in a ringbuffer
191
194 // Whether or not the above `OpenCLFilterContext` has been initialized
196
197 // These variables are used in the activate callback
200
201 // State for random number generation
203
204 // FIFO frame queue used to buffer future frames for processing
206 // Ringbuffers for frame positions
208
209 // The number of frames' motion to consider before and after the frame we are
210 // smoothing
212 // The number of the frame we are currently processing
214
215 // Stores a 1d array of normalised gaussian kernel values for convolution
217
218 // Buffer for error values used in RANSAC code
220
221 // Information regarding how to crop the smoothed luminance (or RGB) planes
223 // Information regarding how to crop the smoothed chroma planes
225
226 // Whether or not we are processing YUV input (as opposed to RGB)
228 // The underlying format of the hardware surfaces
230
231 // Buffer to copy `matches` into for the CPU to work with
234
236
245
246 // Stores a frame converted to grayscale
248 // Stores the harris response for a frame (measure of "cornerness" for each pixel)
250
251 // Detected features after non-maximum suppression and sub-pixel refinement
253 // Saved from the previous frame
255
256 // BRIEF sampling pattern that is randomly initialized
258 // Feature point descriptors for the current frame
260 // Feature point descriptors for the previous frame
262 // Vectors between points in current and previous frame
265 // Holds the matrix to transform luminance (or RGB) with
267 // Holds the matrix to transform chroma with
269
270 // Configurable options
271
275
276 // Whether or not feature points should be refined at a sub-pixel level
278 // If the user sets a value other than the default, 0, this percentage is
279 // translated into a sigma value ranging from 0.5 to 40.0
281 // This number is multiplied by the video frame rate to determine the size
282 // of the smooth window
284
285 // Debug stuff
286
290
291 // These store the total time spent executing the different kernels in nanoseconds
299
300 // Time spent copying matched features from the device to the host
303
304 // Returns a random uniformly-distributed number in [low, high]
306 return (
av_lfg_get(alfg) % (high - low)) + low;
307 }
308
// Returns the average execution time in milliseconds for an event, given the
// total time spent (in nanoseconds) and the number of frames processed.
static double averaged_event_time_ms(unsigned long long total_time, int num_frames) {
    // ns -> ms conversion: divide by 1e6
    return (double)total_time / (double)num_frames / 1000000.0;
}
314
315 // The following code is loosely ported from OpenCV
316
317 // Estimates affine transform from 3 point pairs
318 // model is a 2x3 matrix:
319 // a b c
320 // d e f
322 {
323 // src points
324 double x1 = point_pairs[0].
p.
p1.s[0];
325 double y1 = point_pairs[0].
p.
p1.s[1];
326 double x2 = point_pairs[1].
p.
p1.s[0];
327 double y2 = point_pairs[1].
p.
p1.s[1];
328 double x3 = point_pairs[2].
p.
p1.s[0];
329 double y3 = point_pairs[2].
p.
p1.s[1];
330
331 // dest points
332 double X1 = point_pairs[0].
p.
p2.s[0];
333 double Y1 = point_pairs[0].
p.
p2.s[1];
334 double X2 = point_pairs[1].
p.
p2.s[0];
335 double Y2 = point_pairs[1].
p.
p2.s[1];
336 double X3 = point_pairs[2].
p.
p2.s[0];
337 double Y3 = point_pairs[2].
p.
p2.s[1];
338
339 double d = 1.0 / ( x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2) );
340
341 model[0] =
d * ( X1*(y2-y3) + X2*(y3-y1) + X3*(y1-y2) );
342 model[1] =
d * ( X1*(x3-x2) + X2*(x1-x3) + X3*(x2-x1) );
343 model[2] =
d * ( X1*(x2*y3 - x3*y2) + X2*(x3*y1 - x1*y3) + X3*(x1*y2 - x2*y1) );
344
345 model[3] =
d * ( Y1*(y2-y3) + Y2*(y3-y1) + Y3*(y1-y2) );
346 model[4] =
d * ( Y1*(x3-x2) + Y2*(x1-x3) + Y3*(x2-x1) );
347 model[5] =
d * ( Y1*(x2*y3 - x3*y2) + Y2*(x3*y1 - x1*y3) + Y3*(x1*y2 - x2*y1) );
348 }
349
350 // Checks that the 3 points in the given array are not collinear
352 {
354
355 for (j = 0; j <
i; j++) {
356 double dx1 = points[j]->s[0] - points[
i]->s[0];
357 double dy1 = points[j]->s[1] - points[
i]->s[1];
358
359 for (k = 0; k < j; k++) {
360 double dx2 = points[k]->s[0] - points[
i]->s[0];
361 double dy2 = points[k]->s[1] - points[
i]->s[1];
362
363 // Assuming a 3840 x 2160 video with a point at (0, 0) and one at
364 // (3839, 2159), this prevents a third point from being within roughly
365 // 0.5 of a pixel of the line connecting the two on both axes
366 if (
fabs(dx2*dy1 - dy2*dx1) <= 1.0) {
367 return 0;
368 }
369 }
370 }
371
372 return 1;
373 }
374
375 // Checks a subset of 3 point pairs to make sure that the points are not collinear
376 // and not too close to each other
378 {
379 const cl_float2 *prev_points[] = {
380 &pairs_subset[0].
p.
p1,
381 &pairs_subset[1].
p.
p1,
382 &pairs_subset[2].
p.
p1
383 };
384
385 const cl_float2 *curr_points[] = {
386 &pairs_subset[0].
p.
p2,
387 &pairs_subset[1].
p.
p2,
388 &pairs_subset[2].
p.
p2
389 };
390
392 }
393
394 // Selects a random subset of 3 points from point_pairs and places them in pairs_subset
398 const int num_point_pairs,
400 int max_attempts
401 ) {
402 int idx[3];
403 int i = 0, j, iters = 0;
404
405 for (; iters < max_attempts; iters++) {
406 for (
i = 0;
i < 3 && iters < max_attempts;) {
407 int idx_i = 0;
408
409 for (;;) {
410 idx_i = idx[
i] =
rand_in(0, num_point_pairs, alfg);
411
412 for (j = 0; j <
i; j++) {
413 if (idx_i == idx[j]) {
414 break;
415 }
416 }
417
419 break;
420 }
421 }
422
423 pairs_subset[
i] = point_pairs[idx[
i]];
425 }
426
428 continue;
429 }
430 break;
431 }
432
433 return i == 3 && iters < max_attempts;
434 }
435
436 // Computes the error for each of the given points based on the given model.
439 const int num_point_pairs,
440 const double *model,
441 float *err
442 ) {
443 double F0 = model[0],
F1 = model[1],
F2 = model[2];
444 double F3 = model[3], F4 = model[4], F5 = model[5];
445
446 for (
int i = 0;
i < num_point_pairs;
i++) {
447 const cl_float2 *
f = &point_pairs[
i].
p.
p1;
448 const cl_float2 *t = &point_pairs[
i].
p.
p2;
449
450 double a = F0*
f->s[0] +
F1*
f->s[1] +
F2 - t->s[0];
451 double b =
F3*
f->s[0] + F4*
f->s[1] + F5 - t->s[1];
452
454 }
455 }
456
457 // Determines which of the given point matches are inliers for the given model
458 // based on the specified threshold.
459 //
460 // err must be an array of num_point_pairs length
463 const int num_point_pairs,
464 const double *model,
465 float *err,
466 double thresh
467 ) {
468 float t = (
float)(thresh * thresh);
469 int i, n = num_point_pairs, num_inliers = 0;
470
472
473 for (
i = 0;
i < n;
i++) {
475 // This is an inlier
477 num_inliers += 1;
478 } else {
480 }
481 }
482
483 return num_inliers;
484 }
485
// Determines the number of iterations required to achieve the desired confidence level.
//
// The equation used to determine the number of iterations to do is:
// 1 - confidence = (1 - inlier_probability^num_points)^num_iters
//
// Solving for num_iters:
//
// num_iters = log(1 - confidence) / log(1 - inlier_probability^num_points)
//
// A more in-depth explanation can be found at https://en.wikipedia.org/wiki/Random_sample_consensus
// under the 'Parameters' heading
static int ransac_update_num_iters(double confidence, double num_outliers, int max_iters)
{
    double num, denom;

    confidence = av_clipd(confidence, 0.0, 1.0);
    num_outliers = av_clipd(num_outliers, 0.0, 1.0);

    // avoid inf's & nan's
    num = FFMAX(1.0 - confidence, DBL_MIN);
    // probability that a sample of 3 points is all inliers
    denom = 1.0 - pow(1.0 - num_outliers, 3);
    if (denom < DBL_MIN) {
        return 0;
    }

    num = log(num);
    denom = log(denom);

    // Clamp to max_iters when the estimate would exceed it (or is non-finite)
    return denom >= 0 || -num >= max_iters * (-denom) ? max_iters : (int)round(num / denom);
}
516
517 // Estimates an affine transform between the given pairs of points using RANdom
518 // SAmple Consensus
523 const int num_point_pairs,
524 double *model_out,
525 const double threshold,
526 const int max_iters,
527 const double confidence
528 ) {
530 double best_model[6], model[6];
532
533 int iter, niters =
FFMAX(max_iters, 1);
534 int good_count, max_good_count = 0;
535
536 // We need at least 3 points to build a model from
537 if (num_point_pairs < 3) {
538 return 0;
539 } else if (num_point_pairs == 3) {
540 // There are only 3 points, so RANSAC doesn't apply here
542
543 for (
int i = 0;
i < 3; ++
i) {
545 }
546
547 return 1;
548 }
549
550 for (iter = 0; iter < niters; ++iter) {
551 int found =
get_subset(&deshake_ctx->
alfg, point_pairs, num_point_pairs, pairs_subset, 10000);
552
553 if (!found) {
554 if (iter == 0) {
555 return 0;
556 }
557
558 break;
559 }
560
563
564 if (good_count >
FFMAX(max_good_count, 2)) {
565 for (
int mi = 0;
mi < 6; ++
mi) {
566 best_model[
mi] = model[
mi];
567 }
568
569 for (int pi = 0; pi < 3; pi++) {
570 best_pairs[pi] = pairs_subset[pi];
571 }
572
573 max_good_count = good_count;
575 confidence,
576 (double)(num_point_pairs - good_count) / num_point_pairs,
577 niters
578 );
579 }
580 }
581
582 if (max_good_count > 0) {
583 for (
int mi = 0;
mi < 6; ++
mi) {
584 model_out[
mi] = best_model[
mi];
585 }
586
587 for (int pi = 0; pi < 3; ++pi) {
589 }
591
592 // Find the inliers again for the best model for debugging
595 }
596
598 }
599
600 // "Wiggles" the first point in best_pairs around a tiny bit in order to decrease the
601 // total error
606 const int num_inliers,
607 float best_err,
608 double *model_out
609 ) {
610 float move_x_val = 0.01;
611 float move_y_val = 0.01;
612 int move_x = 1;
613 float old_move_x_val = 0;
614 double model[6];
615 int last_changed = 0;
616
617 for (int iters = 0; iters < 200; iters++) {
618 float total_err = 0;
619
620 if (move_x) {
621 best_pairs[0].
p.
p2.s[0] += move_x_val;
622 } else {
623 best_pairs[0].
p.
p2.s[0] += move_y_val;
624 }
625
628
629 for (int j = 0; j < num_inliers; j++) {
631 }
632
633 if (total_err < best_err) {
634 for (
int mi = 0;
mi < 6; ++
mi) {
635 model_out[
mi] = model[
mi];
636 }
637
638 best_err = total_err;
639 last_changed = iters;
640 } else {
641 // Undo the change
642 if (move_x) {
643 best_pairs[0].
p.
p2.s[0] -= move_x_val;
644 } else {
645 best_pairs[0].
p.
p2.s[0] -= move_y_val;
646 }
647
648 if (iters - last_changed > 4) {
649 // We've already improved the model as much as we can
650 break;
651 }
652
653 old_move_x_val = move_x_val;
654
655 if (move_x) {
656 move_x_val *= -1;
657 } else {
658 move_y_val *= -1;
659 }
660
661 if (old_move_x_val < 0) {
662 move_x = 0;
663 } else {
664 move_x = 1;
665 }
666 }
667 }
668 }
669
670 // Uses a process similar to that of RANSAC to find a transform that minimizes
671 // the total error for a set of point matches determined to be inliers
672 //
673 // (Pick random subsets, compute model, find total error, iterate until error
674 // is minimized.)
679 const int num_inliers,
680 double *model_out,
681 const int max_iters
682 ) {
684 float best_err = FLT_MAX;
685 double best_model[6], model[6];
687
688 for (
int i = 0;
i < max_iters;
i++) {
689 float total_err = 0;
690 int found =
get_subset(&deshake_ctx->
alfg, inliers, num_inliers, pairs_subset, 10000);
691
692 if (!found) {
694 return 0;
695 }
696
697 break;
698 }
699
702
703 for (int j = 0; j < num_inliers; j++) {
705 }
706
707 if (total_err < best_err) {
708 for (
int mi = 0;
mi < 6; ++
mi) {
709 best_model[
mi] = model[
mi];
710 }
711
712 for (int pi = 0; pi < 3; pi++) {
713 best_pairs[pi] = pairs_subset[pi];
714 }
715
716 best_err = total_err;
717 }
718 }
719
720 for (
int mi = 0;
mi < 6; ++
mi) {
721 model_out[
mi] = best_model[
mi];
722 }
723
724 for (int pi = 0; pi < 3; ++pi) {
726 }
729
730 optimize_model(deshake_ctx, best_pairs, inliers, num_inliers, best_err, model_out);
732 }
733
734 // End code from OpenCV
735
736 // Decomposes a similarity matrix into translation, rotation, scale, and skew
737 //
738 // See http://frederic-wang.fr/decomposition-of-2d-transform-matrices.html
740 {
742
745 double e = model[2];
750
751 memset(&
ret, 0,
sizeof(
ret));
752
753 ret.translation.s[0] = e;
754 ret.translation.s[1] =
f;
755
756 // This is the QR method
757 if (
a != 0 ||
b != 0) {
759
763 ret.skew.s[0] = atan((
a *
c +
b *
d) / (
r *
r));
765 }
else if (
c != 0 ||
d != 0) {
766 double s = sqrt(
c *
c +
d *
d);
767
772 ret.skew.s[1] = atan((
a *
c +
b *
d) / (
s *
s));
773 } // otherwise there is only translation
774
776 }
777
778 // Move valid vectors from the 2d buffer into a 1d buffer where they are contiguous
781 int size_y,
782 int size_x
783 ) {
784 int num_vectors = 0;
785
786 for (
int i = 0;
i < size_y; ++
i) {
787 for (int j = 0; j < size_x; ++j) {
789
792 ++num_vectors;
793 }
794
795 // Make sure we do not exceed the amount of space we allocated for these vectors
797 return num_vectors;
798 }
799 }
800 }
801 return num_vectors;
802 }
803
804 // Returns the gaussian kernel value for the given x coordinate and sigma value
806 return 1.0f /
expf(((
float)x * (
float)x) / (2.0
f * sigma * sigma));
807 }
808
// Makes a normalized gaussian kernel of the given length for the given sigma
// and places it in gauss_kernel. The kernel is centered on the middle element
// and its values sum to 1.
static void make_gauss_kernel(float *gauss_kernel, float length, float sigma)
{
    float gauss_sum = 0;
    int window_half = length / 2;

    for (int i = 0; i < length; ++i) {
        float val = gaussian_for(i - window_half, sigma);

        gauss_sum += val;
        gauss_kernel[i] = val;
    }

    // Normalize the gaussian values
    for (int i = 0; i < length; ++i) {
        gauss_kernel[i] /= gauss_sum;
    }
}
828
829 // Returns indices to start and end iteration at in order to iterate over a window
830 // of length size centered at the current frame in a ringbuffer
831 //
832 // Always returns numbers that result in a window of length size, even if that
833 // means specifying negative indices or indices past the end of the values in the
834 // ringbuffers. Make sure you clip indices appropriately within your loop.
837
840
841 return indices;
842 }
843
844 // Sets val to the value in the given ringbuffer at the given offset, taking care of
845 // clipping the offset into the appropriate range
851 ) {
852 int clip_start, clip_end, offset_clipped;
855 } else {
856 // This expression represents the last valid index in the buffer,
857 // which we use repeatedly at the end of the video.
859 }
860
863 } else {
864 // Negative indices will occur at the start of the video, and we want
865 // them to be clipped to 0 in order to repeatedly use the position of
866 // the first frame.
867 clip_start = 0;
868 }
869
872 clip_start,
873 clip_end
874 );
875
877 }
878
879 // Returns smoothed current frame value of the given buffer of floats based on the
880 // given Gaussian kernel and its length (also the window length, centered around the
881 // current frame) and the "maximum value" of the motion.
882 //
883 // This "maximum value" should be the width / height of the image in the case of
884 // translation and an empirically chosen constant for rotation / scale.
885 //
886 // The sigma chosen to generate the final gaussian kernel with used to smooth the
887 // camera path is either hardcoded (set by user, deshake_ctx->smooth_percent) or
888 // adaptively chosen.
891 float *gauss_kernel,
892 int length,
893 float max_val,
895 ) {
896 float new_large_s = 0, new_small_s = 0, new_best = 0, old, diff_between,
897 percent_of_max, inverted_percent;
899 float large_sigma = 40.0f;
900 float small_sigma = 2.0f;
901 float best_sigma;
902
904 best_sigma = (large_sigma - 0.5f) * deshake_ctx->
smooth_percent + 0.5f;
905 } else {
906 // Strategy to adaptively smooth trajectory:
907 //
908 // 1. Smooth path with large and small sigma values
909 // 2. Take the absolute value of the difference between them
910 // 3. Get a percentage by putting the difference over the "max value"
911 // 4, Invert the percentage
912 // 5. Calculate a new sigma value weighted towards the larger sigma value
913 // 6. Determine final smoothed trajectory value using that sigma
914
916 for (
int i = indices.
start, j = 0;
i < indices.
end; ++
i, ++j) {
918 new_large_s += old * gauss_kernel[j];
919 }
920
922 for (
int i = indices.
start, j = 0;
i < indices.
end; ++
i, ++j) {
924 new_small_s += old * gauss_kernel[j];
925 }
926
927 diff_between =
fabsf(new_large_s - new_small_s);
928 percent_of_max = diff_between / max_val;
929 inverted_percent = 1 - percent_of_max;
930 best_sigma = large_sigma *
powf(inverted_percent, 40);
931 }
932
934 for (
int i = indices.
start, j = 0;
i < indices.
end; ++
i, ++j) {
936 new_best += old * gauss_kernel[j];
937 }
938
939 return new_best;
940 }
941
942 // Returns the position of the given point after the transform is applied
945
948
950 }
951
952 // Creates an affine transform that scales from the center of a frame
954 float x_shift,
955 float y_shift,
956 float angle,
957 float scale_x,
958 float scale_y,
959 float center_w,
960 float center_h,
962 ) {
963 cl_float2 center_s;
964 float center_s_w, center_s_h;
965
967 0,
968 0,
969 0,
970 scale_x,
971 scale_y,
973 );
974
976 center_s_w = center_w - center_s.s[0];
977 center_s_h = center_h - center_s.s[1];
978
980 x_shift + center_s_w,
981 y_shift + center_s_h,
982 angle,
983 scale_x,
984 scale_y,
986 );
987 }
988
989 // Determines the crop necessary to eliminate black borders from a smoothed frame
990 // and updates target crop accordingly
994 float frame_width,
995 float frame_height
996 ) {
997 float new_width, new_height, adjusted_width, adjusted_height, adjusted_x, adjusted_y;
998
1003 float ar_h = frame_height / frame_width;
1004 float ar_w = frame_width / frame_height;
1005
1007 // The crop hasn't been set to the original size of the plane
1010 }
1011
1014 top_left.s[0],
1015 bottom_left.s[0]
1016 );
1017
1020 top_left.s[1],
1021 top_right.s[1]
1022 );
1023
1026 bottom_right.s[0],
1027 top_right.s[0]
1028 );
1029
1032 bottom_right.s[1],
1033 bottom_left.s[1]
1034 );
1035
1036 // Make sure our potentially new bounding box has the same aspect ratio
1039
1040 adjusted_width = new_height * ar_w;
1042
1043 if (adjusted_x >= crop->
top_left.s[0]) {
1045 } else {
1046 adjusted_height = new_width * ar_h;
1047 adjusted_y = crop->
bottom_right.s[1] - adjusted_height;
1049 }
1050 }
1051
1053 {
1055 cl_int cle;
1056
1059
1062
1063 if (
ctx->gauss_kernel)
1065
1066 if (
ctx->ransac_err)
1068
1069 if (
ctx->matches_host)
1071
1072 if (
ctx->matches_contig_host)
1074
1077
1079
1088
1090
1103 if (
ctx->debug_on) {
1106 }
1107
1109 }
1110
1112 {
1116 // Pointer to the host-side pattern buffer to be initialized and then copied
1117 // to the GPU
1119 cl_int cle;
1120 int err;
1121 cl_ulong8 zeroed_ulong8;
1123 cl_image_format grayscale_format;
1124 cl_image_desc grayscale_desc;
1125 cl_command_queue_properties queue_props;
1126
1142 };
1143
1144 // Number of elements for an array
1146
1147 const int descriptor_buf_size = image_grid_32 * (
BREIFN / 8);
1148 const int features_buf_size = image_grid_32 * sizeof(cl_float2);
1149
1152
1155
1160 ctx->curr_frame = 0;
1161
1162 memset(&zeroed_ulong8, 0, sizeof(cl_ulong8));
1163
1165 if (!
ctx->gauss_kernel) {
1168 }
1169
1171 if (!
ctx->ransac_err) {
1174 }
1175
1178 sizeof(float), 0);
1179
1180 if (!
ctx->abs_motion.ringbuffers[
i]) {
1183 }
1184 }
1185
1186 if (
ctx->debug_on) {
1188 ctx->smooth_window / 2,
1190 );
1191
1192 if (!
ctx->abs_motion.debug_matches) {
1195 }
1196 }
1197
1198 ctx->abs_motion.curr_frame_offset = 0;
1199 ctx->abs_motion.data_start_offset = -1;
1200 ctx->abs_motion.data_end_offset = -1;
1201
1203 if (!pattern_host) {
1206 }
1207
1209 if (!
ctx->matches_host) {
1212 }
1213
1215 if (!
ctx->matches_contig_host) {
1218 }
1219
1221 if (!
ctx->inliers) {
1224 }
1225
1226 // Initializing the patch pattern for building BRIEF descriptors with
1230
1231 for (int j = 0; j < 2; ++j) {
1234 }
1235
1236 pattern_host[
i] = pair;
1237 }
1238
1239 for (
int i = 0;
i < 14;
i++) {
1240 if (
ctx->sw_format == disallowed_formats[
i]) {
1244 }
1245 }
1246
1249 } else {
1251 }
1252 ctx->sw_format = hw_frames_ctx->sw_format;
1253
1255 if (err < 0)
1257
1258 if (
ctx->debug_on) {
1259 queue_props = CL_QUEUE_PROFILING_ENABLE;
1260 } else {
1261 queue_props = 0;
1262 }
1263 ctx->command_queue = clCreateCommandQueue(
1264 ctx->ocf.hwctx->context,
1265 ctx->ocf.hwctx->device_id,
1266 queue_props,
1267 &cle
1268 );
1270
1280
1282 grayscale_format.image_channel_order = CL_R;
1283 grayscale_format.image_channel_data_type = CL_FLOAT;
1284
1285 grayscale_desc = (cl_image_desc) {
1286 .image_type = CL_MEM_OBJECT_IMAGE2D,
1287 .image_width = outlink->
w,
1288 .image_height = outlink->
h,
1289 .image_depth = 0,
1290 .image_array_size = 0,
1291 .image_row_pitch = 0,
1292 .image_slice_pitch = 0,
1293 .num_mip_levels = 0,
1294 .num_samples = 0,
1296 };
1297
1298 ctx->grayscale = clCreateImage(
1299 ctx->ocf.hwctx->context,
1300 0,
1301 &grayscale_format,
1302 &grayscale_desc,
1304 &cle
1305 );
1307 }
1308
1314 brief_pattern,
1315 CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1317 pattern_host
1318 );
1325 if (
ctx->debug_on) {
1328 }
1329
1330 ctx->initialized = 1;
1332
1333 return 0;
1334
1337 return err;
1338 }
1339
1340 // Logs debug information about the transform data
1343 "Frame %d:\n"
1344 "\tframe moved from: %f x, %f y\n"
1345 "\t to: %f x, %f y\n"
1346 "\t rotated from: %f degrees\n"
1347 "\t to: %f degrees\n"
1348 "\t scaled from: %f x, %f y\n"
1349 "\t to: %f x, %f y\n"
1350 "\n"
1351 "\tframe moved by: %f x, %f y\n"
1352 "\t rotated by: %f degrees\n"
1353 "\t scaled by: %f x, %f y\n",
1354 curr_frame,
1364 );
1365 }
1366
1367 // Uses the buffered motion information to determine a transform that smooths the
1368 // given frame and applies it
1370 {
1375 int err;
1376 cl_int cle;
1379 // Luma (in the case of YUV) transform, or just the transform in the case of RGB
1380 float transform_y[9];
1381 // Chroma transform
1382 float transform_uv[9];
1383 // Luma crop transform (or RGB)
1384 float transform_crop_y[9];
1385 // Chroma crop transform
1386 float transform_crop_uv[9];
1387 float transform_debug_rgb[9];
1388 size_t global_work[2];
1390 cl_mem
src, transformed, dst;
1393 cl_event transform_event, crop_upscale_event;
1395 cl_int num_model_matches;
1396
1397 const float center_w = (
float)input_frame->
width / 2;
1398 const float center_h = (
float)input_frame->
height / 2;
1399
1403
1404 const float center_w_chroma = (
float)chroma_width / 2;
1405 const float center_h_chroma = (
float)chroma_height / 2;
1406
1407 const float luma_w_over_chroma_w = ((
float)input_frame->
width / (
float)chroma_width);
1408 const float luma_h_over_chroma_h = ((
float)input_frame->
height / (
float)chroma_height);
1409
1413 &debug_matches, 1);
1414 }
1415
1416 #if FF_API_PKT_DURATION
1420 } else
1422 #endif
1425 } else {
1427 }
1429
1430 // Get the absolute transform data for this frame
1435 }
1436
1438 // If tripod mode is turned on we simply undo all motion relative to the
1439 // first frame
1440
1446 } else {
1447 // Tripod mode is off and we need to smooth a moving camera
1448
1450 deshake_ctx,
1455 );
1457 deshake_ctx,
1462 );
1464 deshake_ctx,
1469 );
1471 deshake_ctx,
1474 2.0f,
1476 );
1478 deshake_ctx,
1481 2.0f,
1483 );
1484 }
1485
1492 center_w,
1493 center_h,
1494 transform_y
1495 );
1496
1503 center_w_chroma,
1504 center_h_chroma,
1505 transform_uv
1506 );
1507
1510
1513
1515 if (!cropped_frame) {
1518 }
1519
1521 if (!transformed_frame) {
1524 }
1525
1528
1529 for (
int p = 0; p <
FF_ARRAY_ELEMS(transformed_frame->data); p++) {
1530 // Transform all of the planes appropriately
1531 src = (cl_mem)input_frame->
data[p];
1532 transformed = (cl_mem)transformed_frame->data[p];
1533
1534 if (!transformed)
1535 break;
1536
1538 if (err < 0)
1540
1544 global_work,
1546 &transform_event,
1547 { sizeof(cl_mem), &src },
1548 { sizeof(cl_mem), &transformed },
1549 { sizeof(cl_mem), &transforms[p] },
1550 );
1551 }
1552
1560 );
1561
1568 );
1569
1571
1572 // Invert the transform
1579 center_w,
1580 center_h,
1581 transform_debug_rgb
1582 );
1583
1585
1586 transformed = (cl_mem)transformed_frame->data[0];
1593 { sizeof(cl_mem), &transformed },
1596 { sizeof(cl_int), &num_model_matches },
1598 );
1599 }
1600
1602 // Generate transforms for cropping
1609 center_w,
1610 center_h,
1611 transform_crop_y
1612 );
1614
1621 center_w_chroma,
1622 center_h_chroma,
1623 transform_crop_uv
1624 );
1626
1627 crops[0] = deshake_ctx->
crop_y;
1628 crops[1] = crops[2] = deshake_ctx->
crop_uv;
1629
1631 // Crop all of the planes appropriately
1632 dst = (cl_mem)cropped_frame->
data[p];
1633 transformed = (cl_mem)transformed_frame->data[p];
1634
1635 if (!dst)
1636 break;
1637
1639 if (err < 0)
1641
1645 global_work,
1647 &crop_upscale_event,
1648 { sizeof(cl_mem), &transformed },
1649 { sizeof(cl_mem), &dst },
1650 { sizeof(cl_float2), &crops[p].top_left },
1651 { sizeof(cl_float2), &crops[p].bottom_right },
1652 );
1653 }
1654 }
1655
1657 // This means we are somewhere at the start of the video. We need to
1658 // increment the current frame offset until it reaches the center of
1659 // the ringbuffers (as the current frame will be located there for
1660 // the rest of the video).
1661 //
1662 // The end of the video is taken care of by draining motion data
1663 // one-by-one out of the buffer, causing the (at that point fixed)
1664 // offset to move towards later frames' data.
1666 }
1667
1669 // Keep the end offset in sync with the frame it's supposed to be
1670 // positioned at
1672
1674 // The end offset would be the start of the new video sequence; flip to
1675 // start offset
1678 }
1680 // Keep the start offset in sync with the frame it's supposed to be
1681 // positioned at
1683 }
1684
1689 }
1690 }
1691
1693
1696
1699 if (err < 0)
1701
1705
1706 } else {
1708 if (err < 0)
1710
1714 }
1715
1718
1722
1726 return err;
1727 }
1728
1729 // Add the given frame to the frame queue to eventually be processed.
1730 //
1731 // Also determines the motion from the previous frame and updates the stored
1732 // motion information accordingly.
1734 {
1737 int err;
1738 int num_vectors;
1739 int num_inliers = 0;
1740 cl_int cle;
1743 size_t global_work[2];
1744 size_t harris_global_work[2];
1745 size_t grid_32_global_work[2];
1746 int grid_32_h, grid_32_w;
1747 size_t local_work[2];
1749 float prev_vals[5];
1750 float new_vals[5];
1751 cl_event grayscale_event, harris_response_event, refine_features_event,
1752 brief_event, match_descriptors_event, read_buf_event;
1754
1755 num_vectors = 0;
1756
1757 local_work[0] = 8;
1758 local_work[1] = 8;
1759
1761 if (err < 0)
1763
1765 if (err < 0)
1767
1769 if (err < 0)
1771
1772 // We want a single work-item for each 32x32 block of pixels in the input frame
1773 grid_32_global_work[0] /= 32;
1774 grid_32_global_work[1] /= 32;
1775
1778
1779 if (deshake_ctx->
is_yuv) {
1781 } else {
1782 src = (cl_mem)input_frame->
data[0];
1783
1787 global_work,
1789 &grayscale_event,
1790 {
sizeof(cl_mem), &
src },
1791 {
sizeof(cl_mem), &deshake_ctx->
grayscale }
1792 );
1793 }
1794
1796 deshake_ctx->command_queue,
1797 deshake_ctx->kernel_harris_response,
1798 harris_global_work,
1799 local_work,
1800 &harris_response_event,
1801 { sizeof(cl_mem), &deshake_ctx->grayscale },
1802 { sizeof(cl_mem), &deshake_ctx->harris_buf }
1803 );
1804
1806 deshake_ctx->command_queue,
1807 deshake_ctx->kernel_refine_features,
1808 grid_32_global_work,
1810 &refine_features_event,
1811 { sizeof(cl_mem), &deshake_ctx->grayscale },
1812 { sizeof(cl_mem), &deshake_ctx->harris_buf },
1813 { sizeof(cl_mem), &deshake_ctx->refined_features },
1814 { sizeof(cl_int), &deshake_ctx->refine_features }
1815 );
1816
1818 deshake_ctx->command_queue,
1819 deshake_ctx->kernel_brief_descriptors,
1820 grid_32_global_work,
1822 &brief_event,
1823 { sizeof(cl_mem), &deshake_ctx->grayscale },
1824 { sizeof(cl_mem), &deshake_ctx->refined_features },
1825 { sizeof(cl_mem), &deshake_ctx->descriptors },
1826 { sizeof(cl_mem), &deshake_ctx->brief_pattern}
1827 );
1828
1830 // This is the first frame we've been given to queue, meaning there is
1831 // no previous frame to match descriptors to
1832
1833 goto no_motion_data;
1834 }
1835
1837 deshake_ctx->command_queue,
1838 deshake_ctx->kernel_match_descriptors,
1839 grid_32_global_work,
1841 &match_descriptors_event,
1842 { sizeof(cl_mem), &deshake_ctx->prev_refined_features },
1843 { sizeof(cl_mem), &deshake_ctx->refined_features },
1844 { sizeof(cl_mem), &deshake_ctx->descriptors },
1845 { sizeof(cl_mem), &deshake_ctx->prev_descriptors },
1846 { sizeof(cl_mem), &deshake_ctx->matches }
1847 );
1848
1849 cle = clEnqueueReadBuffer(
1850 deshake_ctx->command_queue,
1851 deshake_ctx->matches,
1852 CL_TRUE,
1853 0,
1855 deshake_ctx->matches_host,
1856 0,
1858 &read_buf_event
1859 );
1861
1863
1864 if (num_vectors < 10) {
1865 // Not enough matches to get reliable motion data for this frame
1866 //
1867 // From this point on all data is relative to this frame rather than the
1868 // original frame. We have to make sure that we don't mix values that were
1869 // relative to the original frame with the new values relative to this
1870 // frame when doing the gaussian smoothing. We keep track of where the old
1871 // values end using this data_end_offset field in order to accomplish
1872 // that goal.
1873 //
1874 // If no motion data is present for multiple frames in a short window of
1875 // time, we leave the end where it was to avoid mixing 0s in with the
1876 // old data (and just treat them all as part of the new values)
1877 if (deshake_ctx->abs_motion.data_end_offset == -1) {
1878 deshake_ctx->abs_motion.data_end_offset =
1880 }
1881
1882 goto no_motion_data;
1883 }
1884
1886 deshake_ctx,
1887 deshake_ctx->matches_contig_host,
1888 &debug_matches,
1889 num_vectors,
1890 model.matrix,
1891 10.0,
1892 3000,
1893 0.999999999999
1894 )) {
1895 goto no_motion_data;
1896 }
1897
1898 for (
int i = 0;
i < num_vectors;
i++) {
1899 if (deshake_ctx->matches_contig_host[
i].should_consider) {
1900 deshake_ctx->inliers[num_inliers] = deshake_ctx->matches_contig_host[
i];
1901 num_inliers++;
1902 }
1903 }
1904
1906 deshake_ctx,
1907 deshake_ctx->inliers,
1908 &debug_matches,
1909 num_inliers,
1910 model.matrix,
1911 400
1912 )) {
1913 goto no_motion_data;
1914 }
1915
1916
1918
1919 // Get the absolute transform data for the previous frame
1922 deshake_ctx->abs_motion.ringbuffers[
i],
1925 }
1926
1932
1933 if (deshake_ctx->debug_on) {
1934 if (!deshake_ctx->is_yuv) {
1936 }
1942 }
1943
1944 goto end;
1945
1946 no_motion_data:
1952
1953 for (
int i = 0;
i < num_vectors;
i++) {
1954 deshake_ctx->matches_contig_host[
i].should_consider = 0;
1955 }
1956 debug_matches.num_model_matches = 0;
1957
1958 if (deshake_ctx->debug_on) {
1960 "\n[ALERT] No motion data found in queue_frame, motion reset to 0\n\n"
1961 );
1962 }
1963
1964 goto end;
1965
1966 end:
1967 // Swap the descriptor buffers (we don't need the previous frame's descriptors
1968 // again so we will use that space for the next frame's descriptors)
1969 temp = deshake_ctx->prev_descriptors;
1970 deshake_ctx->prev_descriptors = deshake_ctx->descriptors;
1971 deshake_ctx->descriptors =
temp;
1972
1973 // Same for the refined features
1974 temp = deshake_ctx->prev_refined_features;
1975 deshake_ctx->prev_refined_features = deshake_ctx->refined_features;
1976 deshake_ctx->refined_features =
temp;
1977
1978 if (deshake_ctx->debug_on) {
1979 if (num_vectors == 0) {
1980 debug_matches.matches =
NULL;
1981 } else {
1983
1984 if (!debug_matches.matches) {
1987 }
1988 }
1989
1990 for (
int i = 0;
i < num_vectors;
i++) {
1991 debug_matches.matches[
i] = deshake_ctx->matches_contig_host[
i];
1992 }
1993 debug_matches.num_matches = num_vectors;
1994
1996 deshake_ctx->abs_motion.debug_matches,
1997 &debug_matches, 1);
1998 }
1999
2001 av_fifo_write(deshake_ctx->abs_motion.ringbuffers[
i], &new_vals[
i], 1);
2002 }
2003
2005
2007 clFinish(deshake_ctx->command_queue);
2009 return err;
2010 }
2011
2013 {
2020
2022
2023 if (!deshake_ctx->
eof) {
2028 if (!
frame->hw_frames_ctx)
2030
2035 }
2036
2037 // If there is no more space in the ringbuffers, remove the oldest
2038 // values to make room for the new ones
2042 }
2043 }
2048 // See if we have enough buffered frames to process one
2049 //
2050 // "enough" is half the smooth window of queued frames into the future
2053 }
2054 }
2055 }
2056 }
2057
2060 deshake_ctx->
eof = 1;
2061 }
2062 }
2063
2064 if (deshake_ctx->
eof) {
2065 // Finish processing the rest of the frames in the queue.
2069 }
2070
2074 }
2075 }
2076
2079 "Average kernel execution times:\n"
2080 "\t grayscale: %0.3f ms\n"
2081 "\t harris_response: %0.3f ms\n"
2082 "\t refine_features: %0.3f ms\n"
2083 "\tbrief_descriptors: %0.3f ms\n"
2084 "\tmatch_descriptors: %0.3f ms\n"
2085 "\t transform: %0.3f ms\n"
2086 "\t crop_upscale: %0.3f ms\n"
2087 "Average buffer read times:\n"
2088 "\t features buf: %0.3f ms\n",
2097 );
2098 }
2099
2101 return 0;
2102 }
2103
2104 if (!deshake_ctx->
eof) {
2106 }
2107
2109 }
2110
2112 {
2116 },
2117 };
2118
2120 {
2124 },
2125 };
2126
2127 #define OFFSET(x) offsetof(DeshakeOpenCLContext, x)
2128 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
2129
2131 {
2132 "tripod", "simulates a tripod by preventing any camera movement whatsoever "
2133 "from the original frame",
2135 },
2136 {
2137 "debug", "turn on additional debugging information",
2139 },
2140 {
2141 "adaptive_crop", "attempt to subtly crop borders to reduce mirrored content",
2143 },
2144 {
2145 "refine_features", "refine feature point locations at a sub-pixel level",
2147 },
2148 {
2149 "smooth_strength", "smoothing strength (0 attempts to adaptively determine optimal strength)",
2151 },
2152 {
2153 "smooth_window_multiplier", "multiplier for number of frames to buffer for motion data",
2155 },
2157 };
2158
2160
2162 .
name =
"deshake_opencl",
2165 .priv_class = &deshake_opencl_class,
2173 };