1 /*
2 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3 * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4 * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
22
23 /**
24 * @file
25 * Fast Simple Post-processing filter
26 * This implementation is based on an algorithm described in
27 * "Aria Nosratinia Embedded Post-Processing for
28 * Enhancement of Compressed Images (1999)"
29 * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
30 * Further, by splitting the (I)DCT into horizontal/vertical passes, one of
31 * them can be performed once per block, not per pixel. This allows for much
32 * higher speed.
33 *
34 * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
35 * project, and ported by Arwa Arif for FFmpeg.
36 */
37
44
49
50 #define OFFSET(x) offsetof(FSPPContext, x)
51 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
58 };
59
61
63 { 0, 48, 12, 60, 3, 51, 15, 63, },
64 { 32, 16, 44, 28, 35, 19, 47, 31, },
65 { 8, 56, 4, 52, 11, 59, 7, 55, },
66 { 40, 24, 36, 20, 43, 27, 39, 23, },
67 { 2, 50, 14, 62, 1, 49, 13, 61, },
68 { 34, 18, 46, 30, 33, 17, 45, 29, },
69 { 10, 58, 6, 54, 9, 57, 5, 53, },
70 { 42, 26, 38, 22, 41, 25, 37, 21, },
71 };
72
74 // values (296) can't be too high
75 // -it causes too big quant dependence
76 // or maybe overflow(check), which results in some flashing
77 71, 296, 295, 237, 71, 40, 38, 19,
78 245, 193, 185, 121, 102, 73, 53, 27,
79 158, 129, 141, 107, 97, 73, 50, 26,
80 102, 116, 109, 98, 82, 66, 45, 23,
81 71, 94, 95, 81, 70, 56, 38, 20,
82 56, 77, 74, 66, 56, 44, 30, 15,
83 38, 53, 50, 45, 38, 30, 21, 11,
84 20, 27, 26, 23, 20, 15, 11, 5
85 };
86
87 //This func reads from 1 slice, 1 and clears 0 & 1
89 ptrdiff_t dst_stride, ptrdiff_t src_stride,
90 ptrdiff_t
width, ptrdiff_t
height, ptrdiff_t log2_scale)
91 {
92 int y, x;
93 #define STORE(pos) \
94 temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
95 src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
96 if (temp & 0x100) temp = ~(temp >> 31); \
97 dst[x + pos] = temp;
98
99 for (y = 0; y <
height; y++) {
100 const uint8_t *d =
dither[y];
101 for (x = 0; x <
width; x += 8) {
111 }
114 }
115 }
116
117 //This func reads from 2 slices, 0 & 2 and clears 2-nd
119 ptrdiff_t dst_stride, ptrdiff_t src_stride,
120 ptrdiff_t
width, ptrdiff_t
height, ptrdiff_t log2_scale)
121 {
122 int y, x;
123 #define STORE2(pos) \
124 temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
125 src[x + pos + 16 * src_stride] = 0; \
126 if (temp & 0x100) temp = ~(temp >> 31); \
127 dst[x + pos] = temp;
128
129 for (y = 0; y <
height; y++) {
130 const uint8_t *d =
dither[y];
131 for (x = 0; x <
width; x += 8) {
141 }
144 }
145 }
146
147 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr,
int q)
148 {
150 for (
a = 0;
a < 64;
a++)
151 thr_adr[
a] = q * thr_adr_noq[
a];
152 }
153
155 int dst_stride, int src_stride,
157 uint8_t *qp_store, int qp_stride, int is_luma)
158 {
159 int x, x0, y, es, qy, t;
160
163 const int qpsh = 4 - p->
hsub * !is_luma;
164 const int qpsv = 4 - p->
vsub * !is_luma;
165
167 int16_t *
block = (int16_t *)block_align;
168 int16_t *block3 = (int16_t *)(block_align + 4 * 8 *
BLOCKSZ);
169
170 memset(block3, 0, 4 * 8 *
BLOCKSZ);
171
173
174 for (y = 0; y <
height; y++) {
177 for (x = 0; x < 8; x++) {
180 }
181 }
182
183 for (y = 0; y < 8; y++) {
186 }
187 //FIXME (try edge emu)
188
189 for (y = 8; y < 24; y++)
191
193 const int y1 = y - 8 +
step;
//l5-7 l4-6;
194 qy = y - 4;
195
197 if (qy < 0) qy = 0;
198
199 qy = (qy >> qpsv) * qp_stride;
201
204
207 else
208 for (x = 0; x < 8 * (
BLOCKSZ - 1); x += 8) {
209 t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
210
211 if (t < 0) t = 0; //t always < width-2
212
213 t = qp_store[qy + (t >> qpsh)];
215
218 }
221 memmove(block3, block3 + (
BLOCKSZ - 1) * 64, 6 * 8 *
sizeof(int16_t));
222 }
223
224 es =
width + 8 - x0;
// 8, ...
225 if (es > 8)
227
229 if (es > 3)
231
232 if (!(y1 & 7) && y1) {
233 if (y1 & 8)
236 else
239 }
240 }
241
242 if (y & 7) { // height % 8 != 0
243 if (y & 8)
246 else
249 }
250 }
251
253 {
254 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
258
259 int16_t *dataptr;
260 int16_t *wsptr;
261 int16_t *threshold;
262 int ctr;
263
266
267 for (; cnt > 0; cnt -= 2) { //start positions
268 threshold = (int16_t *)thr_adr;//threshold_mtx
269 for (ctr =
DCTSIZE; ctr > 0; ctr--) {
270 // Process columns from input, add to output.
273
276
279
282
283 // Even part of FDCT
284
285 tmp10 = tmp0 + tmp3;
286 tmp13 = tmp0 - tmp3;
287 tmp11 = tmp1 + tmp2;
288 tmp12 = tmp1 - tmp2;
289
290 d0 = tmp10 + tmp11;
291 d4 = tmp10 - tmp11;
292
294 d2 = tmp13 + z1;
295 d6 = tmp13 - z1;
296
297 // Even part of IDCT
298
303 tmp0 += 2;
304 tmp10 = (tmp0 + tmp2) >> 2;
305 tmp11 = (tmp0 - tmp2) >> 2;
306
307 tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
309
310 tmp0 = tmp10 + tmp13; //->temps
311 tmp3 = tmp10 - tmp13; //->temps
312 tmp1 = tmp11 + tmp12; //->temps
313 tmp2 = tmp11 - tmp12; //->temps
314
315 // Odd part of FDCT
316
317 tmp10 = tmp4 + tmp5;
318 tmp11 = tmp5 + tmp6;
319 tmp12 = tmp6 + tmp7;
320
325
326 z11 = tmp7 + z3;
327 z13 = tmp7 - z3;
328
329 d5 = z13 + z2;
330 d3 = z13 - z2;
331 d1 = z11 + z4;
332 d7 = z11 - z4;
333
334 // Odd part of IDCT
335
340
341 //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
342 z13 = tmp6 + tmp5;
343 z10 = (tmp6 - tmp5) << 1;
344 z11 = tmp4 + tmp7;
345 z12 = (tmp4 - tmp7) << 1;
346
347 tmp7 = (z11 + z13) >> 2; //+2 !
352
353 tmp6 = tmp12 - tmp7;
354 tmp5 = tmp11 - tmp6;
355 tmp4 = tmp10 + tmp5;
356
357 wsptr[
DCTSIZE * 0] += (tmp0 + tmp7);
358 wsptr[
DCTSIZE * 1] += (tmp1 + tmp6);
359 wsptr[
DCTSIZE * 2] += (tmp2 + tmp5);
360 wsptr[
DCTSIZE * 3] += (tmp3 - tmp4);
361 wsptr[
DCTSIZE * 4] += (tmp3 + tmp4);
362 wsptr[
DCTSIZE * 5] += (tmp2 - tmp5);
363 wsptr[
DCTSIZE * 6] = (tmp1 - tmp6);
364 wsptr[
DCTSIZE * 7] = (tmp0 - tmp7);
365 //
366 dataptr++; //next column
367 wsptr++;
368 threshold++;
369 }
370 dataptr += 8; //skip each second start pos
371 wsptr += 8;
372 }
373 }
374
375 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride,
int cnt)
376 {
377 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
380 int16_t *outptr;
381 int16_t *wsptr;
382
383 cnt *= 4;
384 wsptr = workspace;
385 outptr = output_adr;
386 for (; cnt > 0; cnt--) {
387 // Even part
388 //Simd version reads 4x4 block and transposes it
389 tmp10 = wsptr[2] + wsptr[3];
390 tmp11 = wsptr[2] - wsptr[3];
391
392 tmp13 = wsptr[0] + wsptr[1];
394
395 tmp0 = tmp10 + tmp13; //->temps
396 tmp3 = tmp10 - tmp13; //->temps
397 tmp1 = tmp11 + tmp12;
398 tmp2 = tmp11 - tmp12;
399
400 // Odd part
401 //Also transpose, with previous:
402 // ---- ---- ||||
403 // ---- ---- idct ||||
404 // ---- ---- ---> ||||
405 // ---- ---- ||||
406 z13 = wsptr[4] + wsptr[5];
407 z10 = wsptr[4] - wsptr[5];
408 z11 = wsptr[6] + wsptr[7];
409 z12 = wsptr[6] - wsptr[7];
410
411 tmp7 = z11 + z13;
413
417
418 tmp6 = (tmp12 << 3) - tmp7;
419 tmp5 = (tmp11 << 3) - tmp6;
420 tmp4 = (tmp10 << 3) + tmp5;
421
422 // Final output stage: descale and write column
423 outptr[0 * output_stride] +=
DESCALE(tmp0 + tmp7, 3);
424 outptr[1 * output_stride] +=
DESCALE(tmp1 + tmp6, 3);
425 outptr[2 * output_stride] +=
DESCALE(tmp2 + tmp5, 3);
426 outptr[3 * output_stride] +=
DESCALE(tmp3 - tmp4, 3);
427 outptr[4 * output_stride] +=
DESCALE(tmp3 + tmp4, 3);
428 outptr[5 * output_stride] +=
DESCALE(tmp2 - tmp5, 3);
429 outptr[6 * output_stride] +=
DESCALE(tmp1 - tmp6, 3);
//no += ?
430 outptr[7 * output_stride] +=
DESCALE(tmp0 - tmp7, 3);
//no += ?
431 outptr++;
432
433 wsptr +=
DCTSIZE;
// advance pointer to next row
434 }
435 }
436
437 static void row_fdct_c(int16_t *
data,
const uint8_t *pixels, ptrdiff_t line_size,
int cnt)
438 {
439 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
442 int16_t *dataptr;
443
444 cnt *= 4;
445 // Pass 1: process rows.
446
448 for (; cnt > 0; cnt--) {
449 tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
450 tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
451 tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
452 tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
453 tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
454 tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
455 tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
456 tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
457
458 // Even part
459
460 tmp10 = tmp0 + tmp3;
461 tmp13 = tmp0 - tmp3;
462 tmp11 = tmp1 + tmp2;
463 tmp12 = tmp1 - tmp2;
464 //Even columns are written first, this leads to different order of columns
465 //in column_fidct(), but they are processed independently, so all ok.
466 //Later in the row_idct() columns readed at the same order.
467 dataptr[2] = tmp10 + tmp11;
468 dataptr[3] = tmp10 - tmp11;
469
471 dataptr[0] = tmp13 + z1;
472 dataptr[1] = tmp13 - z1;
473
474 // Odd part
475
476 tmp10 = (tmp4 + tmp5) << 2;
477 tmp11 = (tmp5 + tmp6) << 2;
478 tmp12 = (tmp6 + tmp7) << 2;
479
484
485 z11 = tmp7 + z3;
486 z13 = tmp7 - z3;
487
488 dataptr[4] = z13 + z2;
489 dataptr[5] = z13 - z2;
490 dataptr[6] = z11 + z4;
491 dataptr[7] = z11 - z4;
492
493 pixels++; // advance pointer to next column
495 }
496 }
497
506 };
507
509 {
514
517
521
524
531
532 #if ARCH_X86
534 #endif
535
536 return 0;
537 }
538
540 {
545
546 int qp_stride = 0;
547 int8_t *qp_table =
NULL;
550 int custom_threshold_m[64];
551
553
554 for (
i = 0;
i < 64;
i++)
//FIXME: tune custom_threshold[] and remove this !
556
557 for (
i = 0;
i < 8;
i++) {
559 |(((uint64_t)custom_threshold_m[
i * 8 + 6]) << 16)
560 |(((uint64_t)custom_threshold_m[
i * 8 + 0]) << 32)
561 |(((uint64_t)custom_threshold_m[
i * 8 + 4]) << 48);
562
564 |(((uint64_t)custom_threshold_m[
i * 8 + 3]) << 16)
565 |(((uint64_t)custom_threshold_m[
i * 8 + 1]) << 32)
566 |(((uint64_t)custom_threshold_m[
i * 8 + 7]) << 48);
567 }
568
571
572 /* if we are not in a constant user quantizer mode and we don't want to use
573 * the quantizers from the B-frames (B-frames often have a higher QP), we
574 * need to save the qp table from the last non B-frame; this is what the
575 * following code block does */
581 }
582
587 }
588 }
589
594 }
595
596 if (qp_table || fspp->
qp) {
599
600 /* get a new frame if in-place is not possible or if the dimensions
601 * are not multiple of 8 */
605
611 }
615 }
616
620 cw, ch, qp_table, qp_stride, 0);
622 cw, ch, qp_table, qp_stride, 0);
624 }
625 }
626
633 }
639 }
640
642 {
647 }
648
650 {
655 },
656 };
657
666 .priv_class = &fspp_class,
668 };