/*
 * VC3/DNxHD encoder
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 * Copyright (c) 2011 MirriAd Ltd
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
 * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/opt.h"

#include "avcodec.h"
#include "internal.h"
#include "mpegvideo.h"
#include "dnxhdenc.h"

// The largest value that will not lead to overflow for 10-bit samples.
#define DNX10BIT_QMAT_SHIFT 18
#define RC_VARIANCE 1 // use variance or ssd for fast rc
#define LAMBDA_FRAC_BITS 10

#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption options[] = {
    { "nitris_compat", "encode with Avid Nitris compatibility",
        offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_INT,
        { .i64 = 0 }, 0, 1, VE },
    { NULL }
};

static const AVClass dnxhd_class = {
    .class_name = "dnxhd",
    .item_name  = av_default_item_name,
    .option     = options,
    .version    = LIBAVUTIL_VERSION_INT,
};

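/* Fetch an 8x4 block of 8-bit pixels into rows 0-3 of an 8x8 coefficient
 * block and mirror rows 3..0 into rows 4..7, producing a vertically
 * symmetric block; the encoder uses this for blocks that only have four
 * valid source rows. */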
static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size)
{
    int i;
    for (i = 0; i < 4; i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels  += line_size;
        block   += 8;
    }
    memcpy(block,      block -  8, sizeof(*block) * 8);
    memcpy(block +  8, block - 16, sizeof(*block) * 8);
    memcpy(block + 16, block - 24, sizeof(*block) * 8);
    memcpy(block + 24, block - 32, sizeof(*block) * 8);
}

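/* 10-bit variant of the above: source samples are 16-bit words, so the byte
 * line size is halved before being used as a uint16_t stride. */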
static void dnxhd_10bit_get_pixels_8x4_sym(int16_t *av_restrict block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size)
{
    int i;
    const uint16_t *pixels16 = (const uint16_t *)pixels;
    line_size >>= 1;

    for (i = 0; i < 4; i++) {
        block[0] = pixels16[0]; block[1] = pixels16[1];
        block[2] = pixels16[2]; block[3] = pixels16[3];
        block[4] = pixels16[4]; block[5] = pixels16[5];
        block[6] = pixels16[6]; block[7] = pixels16[7];
        pixels16 += line_size;
        block    += 8;
    }
    memcpy(block,      block -  8, sizeof(*block) * 8);
    memcpy(block +  8, block - 16, sizeof(*block) * 8);
    memcpy(block + 16, block - 24, sizeof(*block) * 8);
    memcpy(block + 24, block - 32, sizeof(*block) * 8);
}

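/* Forward DCT and quantization for 10-bit content. Returns the scan-order
 * index of the last non-zero coefficient. The DC coefficient is divided by 4
 * with rounding to compensate for the scaling applied by the DCT routine. */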
static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, int16_t *block,
                                    int n, int qscale, int *overflow)
{
    const uint8_t *scantable = ctx->intra_scantable.scantable;
    const int *qmat = n < 4 ? ctx->q_intra_matrix[qscale]
                            : ctx->q_chroma_intra_matrix[qscale];
    int last_non_zero = 0;
    int i;

    ctx->fdsp.fdct(block);

    // Divide by 4 with rounding, to compensate scaling of DCT coefficients
    block[0] = (block[0] + 2) >> 2;

    for (i = 1; i < 64; ++i) {
        int j = scantable[i];
        int sign = block[j] >> 31;
        int level = (block[j] ^ sign) - sign;
        level = level * qmat[j] >> DNX10BIT_QMAT_SHIFT;
        block[j] = (level ^ sign) - sign;
        if (level)
            last_non_zero = i;
    }

    return last_non_zero;
}

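/* Build the run/level VLC lookup tables from the CID table so each AC
 * (level, run) pair can be coded with a single table lookup. */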
static av_cold int dnxhd_init_vlc(DNXHDEncContext *ctx)
{
    int i, j, level, run;
    int max_level = 1 << (ctx->cid_table->bit_depth + 2);

    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->vlc_codes,
                            max_level, 4 * sizeof(*ctx->vlc_codes), fail);
    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,
                            max_level, 4 * sizeof(*ctx->vlc_bits), fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes,
                      63 * 2, fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits,
                      63, fail);

    ctx->vlc_codes += max_level * 2;
    ctx->vlc_bits  += max_level * 2;
    for (level = -max_level; level < max_level; level++) {
        for (run = 0; run < 2; run++) {
            int index = (level << 1) | run;
            int sign, offset = 0, alevel = level;

            MASK_ABS(sign, alevel);
            if (alevel > 64) {
                offset  = (alevel - 1) >> 6;
                alevel -= offset << 6;
            }
            for (j = 0; j < 257; j++) {
                if (ctx->cid_table->ac_level[j] == alevel &&
                    (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) &&
                    (!run    || (ctx->cid_table->ac_run_flag[j]   && run))) {
                    if (alevel) {
                        ctx->vlc_codes[index] =
                            (ctx->cid_table->ac_codes[j] << 1) | (sign & 1);
                        ctx->vlc_bits[index]  = ctx->cid_table->ac_bits[j] + 1;
                    } else {
                        ctx->vlc_codes[index] = ctx->cid_table->ac_codes[j];
                        ctx->vlc_bits[index]  = ctx->cid_table->ac_bits[j];
                    }
                    break;
                }
            }
            if (offset) {
                ctx->vlc_codes[index] =
                    (ctx->vlc_codes[index] << ctx->cid_table->index_bits) | offset;
                ctx->vlc_bits[index] += ctx->cid_table->index_bits;
            }
        }
    }
    for (i = 0; i < 62; i++) {
        int run = ctx->cid_table->run[i];
        ctx->run_codes[run] = ctx->cid_table->run_codes[i];
        ctx->run_bits[run]  = ctx->cid_table->run_bits[i];
    }
    return 0;
fail:
    return AVERROR(ENOMEM);
}

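/* Build the per-qscale quantization matrices from the CID weight tables; the
 * 10-bit path stores fixed-point reciprocals used directly by
 * dnxhd_10bit_dct_quantize() (see the formula below). */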
static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
{
    // init first elem to 1 to avoid div by 0 in convert_matrix
    uint16_t weight_matrix[64] = { 1, }; // convert_matrix needs uint16_t*
    int qscale, i;
    const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
    const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;

    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,
                            (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,
                            (ctx->m.avctx->qmax + 1), 64 * sizeof(int), fail);
    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16,
                            (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                            fail);
    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16,
                            (ctx->m.avctx->qmax + 1), 64 * 2 * sizeof(uint16_t),
                            fail);

    if (ctx->cid_table->bit_depth == 8) {
        for (i = 1; i < 64; i++) {
        }
        for (i = 1; i < 64; i++) {
        }

        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 0; i < 64; i++) {
            }
        }
    } else {
        // 10-bit
        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 1; i < 64; i++) {

                /* The quantization formula from the VC-3 standard is:
                 * quantized = sign(block[i]) * floor(abs(block[i]/s) * p /
                 *             (qscale * weight_table[i]))
                 * where p is 32 for 8-bit samples and 8 for 10-bit ones.
                 * The s factor compensates scaling of DCT coefficients done by
                 * the DCT routines, and therefore is not present in the
                 * standard. It's 8 for 8-bit samples and 4 for 10-bit ones.
                 * We want the values of ctx->qmatrix_l and ctx->qmatrix_c to be:
                 *     ((1 << DNX10BIT_QMAT_SHIFT) * (p / s)) /
                 *     (qscale * weight_table[i])
                 * For 10-bit samples, p / s == 2 */
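                /* Worked example: with DNX10BIT_QMAT_SHIFT == 18, qscale == 1
                 * and a weight of 32, the table entry below is
                 * (1 << 19) / 32 == 16384, so the quantizer computes
                 * (level * 16384) >> 18 == level / 16, which matches
                 * floor((level / 4) * 8 / 32) from the formula above. */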
                ctx->qmatrix_l[qscale * 64 + i] =
                    (1 << (DNX10BIT_QMAT_SHIFT + 1)) /
                    (qscale * luma_weight_table[i]);
                ctx->qmatrix_c[qscale * 64 + i] =
                    (1 << (DNX10BIT_QMAT_SHIFT + 1)) /
                    (qscale * chroma_weight_table[i]);
            }
        }
    }

    return 0;
fail:
    return AVERROR(ENOMEM);
}

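/* Set up the rate-control state: the per-macroblock bit/SSD records for each
 * qscale and the frame bit budget derived from the coding unit size. */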
static av_cold int dnxhd_init_rc(DNXHDEncContext *ctx)
{
    FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_rc, (ctx->m.avctx->qmax + 1),
                            ctx->m.mb_num * sizeof(RCEntry), fail);
    if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
        FF_ALLOCZ_ARRAY_OR_GOTO(ctx->m.avctx, ctx->mb_cmp,
                                ctx->m.mb_num, sizeof(RCCMPEntry), fail);

    ctx->frame_bits = (ctx->cid_table->coding_unit_size -
                       640 - 4 - ctx->min_padding) * 8;
    ctx->qscale = 1;
    ctx->lambda = 2 << LAMBDA_FRAC_BITS; // qscale 2
    return 0;
fail:
    return AVERROR(ENOMEM);
}

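/* One-time encoder initialization: validate the input format, look up the
 * compression ID (CID), hook up the bit-depth specific helpers and allocate
 * the per-slice thread contexts and rate-control buffers. */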
static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int i, index, bit_depth, ret;

    switch (avctx->pix_fmt) {
    case AV_PIX_FMT_YUV422P:
        bit_depth = 8;
        break;
    case AV_PIX_FMT_YUV422P10:
        bit_depth = 10;
        break;
    default:
        av_log(avctx, AV_LOG_ERROR,
               "pixel format is incompatible with DNxHD\n");
        return AVERROR(EINVAL);
    }

    ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
    if (!ctx->cid) {
        av_log(avctx, AV_LOG_ERROR,
               "video parameters incompatible with DNxHD. Valid DNxHD profiles:\n");
        ff_dnxhd_print_profiles(avctx, AV_LOG_ERROR);
        return AVERROR(EINVAL);
    }


    if (ctx->cid_table->bit_depth == 10) {
        ctx->m.dct_quantize     = dnxhd_10bit_dct_quantize;
        ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
        ctx->block_width_l2     = 4;
    } else {
        ctx->get_pixels_8x4_sym = dnxhd_8bit_get_pixels_8x4_sym;
        ctx->block_width_l2     = 3;
    }

    if (ARCH_X86)
        ff_dnxhdenc_init_x86(ctx);

    }

    // XXX tune lbias/cbias

    /* Avid Nitris hardware decoder requires a minimum amount of padding
     * in the coding unit payload */


    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits,
                      ctx->m.mb_num * sizeof(uint16_t), fail);


    }

    if (avctx->qmax <= 1) {
        av_log(avctx, AV_LOG_ERROR, "qmax must be at least 2\n");
        return AVERROR(EINVAL);
    }

    }

    return 0;
fail:  // for FF_ALLOCZ_OR_GOTO
    return AVERROR(ENOMEM);
}

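/* Write the fixed 640-byte coding unit header that precedes the macroblock
 * data; fields not set explicitly are zeroed by the memset. */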
static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    static const uint8_t header_prefix[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };

    memset(buf, 0, 640);

    memcpy(buf, header_prefix, 5);
    buf[6] = 0x80; // crc flag off
    buf[7] = 0xa0; // reserved


    buf[0x5f] = 0x01; // UDL

    buf[0x167] = 0x02; // reserved
    buf[0x16f] = 0x10; // reserved

    ctx->msip = buf + 0x170;
    return 0;
}

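/* Code a DC coefficient as the difference from the previous block's DC: a
 * VLC giving the number of magnitude bits, followed by those bits (negative
 * differences are stored one's-complement style via the diff-- below). */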
static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
{
    int nbits;
    if (diff < 0) {
        nbits = av_log2_16bit(-2 * diff);
        diff--;
    } else {
        nbits = av_log2_16bit(2 * diff);
    }
    put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
             (ctx->cid_table->dc_codes[nbits] << nbits) +
             (diff & ((1 << nbits) - 1)));
}

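/* Write one quantized block: the DC difference first, then run/level VLCs
 * for the AC coefficients in scan order. */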
static av_always_inline void dnxhd_encode_block(DNXHDEncContext *ctx,
                                                int16_t *block,
                                                int last_index, int n)
{
    int last_non_zero = 0;
    int slevel, i, j;

    dnxhd_encode_dc(ctx, block[0] - ctx->m.last_dc[n]);
    ctx->m.last_dc[n] = block[0];

    for (i = 1; i <= last_index; i++) {
        j = ctx->m.intra_scantable.permutated[i];
        slevel = block[j];
        if (slevel) {
            int run_level = i - last_non_zero - 1;
            int rlevel = (slevel << 1) | !!run_level;
            put_bits(&ctx->m.pb, ctx->vlc_bits[rlevel], ctx->vlc_codes[rlevel]);
            if (run_level)
                put_bits(&ctx->m.pb, ctx->run_bits[run_level],
                         ctx->run_codes[run_level]);
            last_non_zero = i;
        }
    }
    put_bits(&ctx->m.pb, ctx->vlc_bits[0], ctx->vlc_codes[0]); // EOB
}

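/* Reference inverse quantization, used when the rate control needs the
 * reconstructed block to measure the SSD distortion. */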
static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx,
                                                int16_t *block, int n,
                                                int qscale, int last_index)
{
    const uint8_t *weight_matrix;
    int level;
    int i;

    weight_matrix = (n & 2) ? ctx->cid_table->chroma_weight
                            : ctx->cid_table->luma_weight;

    for (i = 1; i <= last_index; i++) {
        int j = ctx->m.intra_scantable.permutated[i];
        level = block[j];
        if (level) {
            if (level < 0) {
                level = (1 - 2 * level) * qscale * weight_matrix[i];
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
                level = -level;
            } else {
                level = (2 * level + 1) * qscale * weight_matrix[i];
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
            }
            block[j] = level;
        }
    }
}

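/* Sum of squared differences between the original and the reconstructed
 * block, used as the distortion term in the RD rate control. */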
static av_always_inline int dnxhd_ssd_block(int16_t *qblock, int16_t *block)
{
    int score = 0;
    int i;
    for (i = 0; i < 64; i++)
        score += (block[i] - qblock[i]) * (block[i] - qblock[i]);
    return score;
}

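/* Count the bits needed to code the AC coefficients of a quantized block
 * without actually writing them (same run/level tables as
 * dnxhd_encode_block()). */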
static av_always_inline
int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, int16_t *block, int last_index)
{
    int last_non_zero = 0;
    int bits = 0;
    int i, j, level;
    for (i = 1; i <= last_index; i++) {
        j = ctx->m.intra_scantable.permutated[i];
        level = block[j];
        if (level) {
            int run_level = i - last_non_zero - 1;
            bits += ctx->vlc_bits[(level << 1) | !!run_level] +
                    ctx->run_bits[run_level];
            last_non_zero = i;
        }
    }
    return bits;
}

static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx,
                                              int mb_x, int mb_y)
{
    const int bs = ctx->block_width_l2;
    const int bw = 1 << bs;
    const uint8_t *ptr_y = ctx->thread[0]->src[0] +
                           ((mb_y << 4) * ctx->m.linesize) + (mb_x << bs + 1);



        } else {
        }
    } else {
    }
}

{
    static const uint8_t component[8] = { 0, 0, 1, 2, 0, 0, 1, 2 };
    return component[i];
}

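/* Slice-threaded worker: for one macroblock row, quantize every block at the
 * current qscale and accumulate the DC/AC bit cost (and SSD when doing full
 * RD) into the per-macroblock rate-control records. */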
static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
                                  int jobnr, int threadnr)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    int qscale = ctx->qscale;
    LOCAL_ALIGNED_16(int16_t, block, [64]);
    ctx = ctx->thread[threadnr];


    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int ssd     = 0;
        int ac_bits = 0;
        int dc_bits = 0;
        int i;

        dnxhd_get_blocks(ctx, mb_x, mb_y);

        for (i = 0; i < 8; i++) {
            int16_t *src_block = ctx->blocks[i];
            int overflow, nbits, diff, last_index;

            memcpy(block, src_block, 64 * sizeof(*block));
            last_index = ctx->m.dct_quantize(&ctx->m, block, 4 & (2 * i),
                                             qscale, &overflow);
            ac_bits += dnxhd_calc_ac_bits(ctx, block, last_index);


            if (diff < 0)
                nbits = av_log2_16bit(-2 * diff);
            else
                nbits = av_log2_16bit(2 * diff);



            }
        }
        ctx->mb_rc[qscale][mb].ssd  = ssd;
        ctx->mb_rc[qscale][mb].bits = ac_bits + dc_bits + 12 +
                                      8 * ctx->vlc_bits[0];
    }
    return 0;
}

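/* Slice-threaded worker: re-quantize each macroblock of the row at its
 * assigned qscale and emit the bitstream for that slice. */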
static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg,
                               int jobnr, int threadnr)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    ctx = ctx->thread[threadnr];


    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int qscale = ctx->mb_qscale[mb];
        int i;



        for (i = 0; i < 8; i++) {
            int16_t *block = ctx->blocks[i];
            int overflow, last_index;
            last_index = ctx->m.dct_quantize(&ctx->m, block, 4 & (2 * i),
                                             qscale, &overflow);
            // START_TIMER;

            // STOP_TIMER("encode_block");
        }
    }

    return 0;
}

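/* Compute the byte offset and size of each slice (one macroblock row) from
 * the per-macroblock bit counts, so the row workers can write in parallel. */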
static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx)
{
    int mb_y, mb_x;
    int offset = 0;
    for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) {
        int thread_size;
        ctx->slice_offs[mb_y] = offset;
        ctx->slice_size[mb_y] = 0;
        for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
            unsigned mb = mb_y * ctx->m.mb_width + mb_x;
            ctx->slice_size[mb_y] += ctx->mb_bits[mb];
        }
        ctx->slice_size[mb_y]   = (ctx->slice_size[mb_y] + 31) & ~31;
        ctx->slice_size[mb_y] >>= 3;
        thread_size = ctx->slice_size[mb_y];
        offset += thread_size;
    }
}

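/* Slice-threaded worker that computes a per-macroblock activity measure for
 * the fast rate control: pixel variance for 8-bit input, and a variance-like
 * value from the 10-bit sample sums for 10-bit input. */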
static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg,
                               int jobnr, int threadnr)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x, x, y;
    int partial_last_row = (mb_y == ctx->m.mb_height - 1) &&
                           ((avctx->height >> ctx->interlaced) & 0xF);

    ctx = ctx->thread[threadnr];
    if (ctx->cid_table->bit_depth == 8) {
        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize);
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x, pix += 16) {
            unsigned mb = mb_y * ctx->m.mb_width + mb_x;
            int sum;
            int varc;

            if (!partial_last_row && mb_x * 16 <= avctx->width - 16) {

            } else {
                int bw = FFMIN(avctx->width - 16 * mb_x, 16);
                int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
                sum = varc = 0;
                for (y = 0; y < bh; y++) {
                    for (x = 0; x < bw; x++) {
                        uint8_t val = pix[x + y * ctx->m.linesize];
                        sum  += val;
                        varc += val * val;
                    }
                }
            }
            varc = (varc - (((unsigned) sum * sum) >> 8) + 128) >> 8;

            ctx->mb_cmp[mb].value = varc;
            ctx->mb_cmp[mb].mb    = mb;
        }
    } else { // 10-bit
        int const linesize = ctx->m.linesize >> 1;
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x) {
            uint16_t *pix = (uint16_t *)ctx->thread[0]->src[0] +
                            ((mb_y << 4) * linesize) + (mb_x << 4);
            unsigned mb = mb_y * ctx->m.mb_width + mb_x;
            int sum = 0;
            int sqsum = 0;
            int mean, sqmean;
            int i, j;
            // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
            for (i = 0; i < 16; ++i) {
                for (j = 0; j < 16; ++j) {
                    // Turn 16-bit pixels into 10-bit ones.
                    int const sample = (unsigned) pix[j] >> 6;
                    sum   += sample;
                    sqsum += sample * sample;
                    // 2^10 * 2^10 * 16 * 16 = 2^28, which is less than INT_MAX
                }
                pix += linesize;
            }
            mean   = sum >> 8;   // 16*16 == 2^8
            sqmean = sqsum >> 8;
            ctx->mb_cmp[mb].value = sqmean - mean * mean;
            ctx->mb_cmp[mb].mb    = mb;
        }
    }
    return 0;
}

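/* Rate control for the RD mode: binary-search the Lagrange multiplier lambda
 * until the per-macroblock qscale choices that minimize ssd + lambda * bits
 * fit within the frame bit budget. */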
static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
{
    int lambda, up_step, down_step;
    int last_lower = INT_MAX, last_higher = 0;
    int x, y, q;

    for (q = 1; q < avctx->qmax; q++) {
        ctx->qscale = q;
        avctx->execute2(avctx, dnxhd_calc_bits_thread,
                        NULL, NULL, ctx->m.mb_height);
    }
    up_step = down_step = 2 << LAMBDA_FRAC_BITS;
    lambda  = ctx->lambda;

    for (;;) {
        int bits = 0;
        int end  = 0;
        if (lambda == last_higher) {
            lambda++;
            end = 1; // need to set final qscales/bits
        }
        for (y = 0; y < ctx->m.mb_height; y++) {
            for (x = 0; x < ctx->m.mb_width; x++) {
                unsigned min = UINT_MAX;
                int qscale = 1;
                int mb = y * ctx->m.mb_width + x;
                for (q = 1; q < avctx->qmax; q++) {
                    unsigned score = ctx->mb_rc[q][mb].bits * lambda +
                                     (ctx->mb_rc[q][mb].ssd << LAMBDA_FRAC_BITS);
                    if (score < min) {
                        min    = score;
                        qscale = q;
                    }
                }
                bits += ctx->mb_rc[qscale][mb].bits;
                ctx->mb_qscale[mb] = qscale;
                ctx->mb_bits[mb]   = ctx->mb_rc[qscale][mb].bits;
            }
            bits = (bits + 31) & ~31; // padding
            if (bits > ctx->frame_bits)
                break;
        }
        // av_dlog(ctx->m.avctx,
        //         "lambda %d, up %u, down %u, bits %d, frame %d\n",
        //         lambda, last_higher, last_lower, bits, ctx->frame_bits);
        if (end) {
            if (bits > ctx->frame_bits)
                return AVERROR(EINVAL);
            break;
        }
        if (bits < ctx->frame_bits) {
            last_lower = FFMIN(lambda, last_lower);
            if (last_higher != 0)
                lambda = (lambda + last_higher) >> 1;
            else
                lambda -= down_step;
            down_step = FFMIN((int64_t)down_step * 5, INT_MAX);
            up_step   = 1 << LAMBDA_FRAC_BITS;
            lambda    = FFMAX(1, lambda);
            if (lambda == last_lower)
                break;
        } else {
            last_higher = FFMAX(lambda, last_higher);
            if (last_lower != INT_MAX)
                lambda = (lambda + last_lower) >> 1;
            else if ((int64_t)lambda + up_step > INT_MAX)
                return AVERROR(EINVAL);
            else
                lambda += up_step;
            up_step   = FFMIN((int64_t)up_step * 5, INT_MAX);
            down_step = 1 << LAMBDA_FRAC_BITS;
        }
    }
    //av_dlog(ctx->m.avctx, "out lambda %d\n", lambda);
    ctx->lambda = lambda;
    return 0;
}

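/* Fast rate control helper: binary-search a single frame-wide qscale whose
 * total bit count fits the frame budget. */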
static int dnxhd_find_qscale(DNXHDEncContext *ctx)
{
    int bits = 0;
    int up_step = 1;
    int down_step = 1;
    int last_higher = 0;
    int last_lower = INT_MAX;
    int qscale;
    int x, y;

    qscale = ctx->qscale;
    for (;;) {
        bits = 0;
        ctx->qscale = qscale;
        // XXX avoid recalculating bits
        ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread,
                               NULL, NULL, ctx->m.mb_height);
        for (y = 0; y < ctx->m.mb_height; y++) {
            for (x = 0; x < ctx->m.mb_width; x++)
                bits += ctx->mb_rc[qscale][y * ctx->m.mb_width + x].bits;
            bits = (bits + 31) & ~31; // padding
            if (bits > ctx->frame_bits)
                break;
        }
        // av_dlog(ctx->m.avctx,
        //         "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n",
        //         ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits,
        //         last_higher, last_lower);
        if (bits < ctx->frame_bits) {
            if (qscale == 1)
                return 1;
            if (last_higher == qscale - 1) {
                qscale = last_higher;
                break;
            }
            last_lower = FFMIN(qscale, last_lower);
            if (last_higher != 0)
                qscale = (qscale + last_higher) >> 1;
            else
                qscale -= down_step++;
            if (qscale < 1)
                qscale = 1;
            up_step = 1;
        } else {
            if (last_lower == qscale + 1)
                break;
            last_higher = FFMAX(qscale, last_higher);
            if (last_lower != INT_MAX)
                qscale = (qscale + last_lower) >> 1;
            else
                qscale += up_step++;
            down_step = 1;
            if (qscale >= ctx->m.avctx->qmax)
                return AVERROR(EINVAL);
        }
    }
    //av_dlog(ctx->m.avctx, "out qscale %d\n", qscale);
    ctx->qscale = qscale;
    return 0;
}

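/* Byte-wise radix sort used by the fast rate control to order macroblocks by
 * their comparison value before bumping the cheapest ones to a higher qscale. */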
#define BUCKET_BITS 8
#define RADIX_PASSES 4
#define NBUCKETS (1 << BUCKET_BITS)

static int get_bucket(int value, int shift)
{
    value >>= shift;
    value  &= NBUCKETS - 1;
    return NBUCKETS - 1 - value;
}

static void radix_count(const RCCMPEntry *data, int size,
                        int buckets[RADIX_PASSES][NBUCKETS])
{
    int i, j;
    memset(buckets, 0, sizeof(buckets[0][0]) * RADIX_PASSES * NBUCKETS);
    for (i = 0; i < size; i++) {
        int v = data[i].value;
        for (j = 0; j < RADIX_PASSES; j++) {
            buckets[j][get_bucket(v, 0)]++;
            v >>= BUCKET_BITS;
        }
    }
    for (j = 0; j < RADIX_PASSES; j++) {
        int offset = size;
        for (i = NBUCKETS - 1; i >= 0; i--)
            buckets[j][i] = offset -= buckets[j][i];
    }
}

static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data,
                            int size, int buckets[NBUCKETS], int pass)
{
    int shift = pass * BUCKET_BITS;
    int i;
    for (i = 0; i < size; i++) {
        int v   = get_bucket(data[i].value, shift);
        int pos = buckets[v]++;
        dst[pos] = data[i];
    }
}

static void radix_sort(RCCMPEntry *data, int size)
{
    int buckets[RADIX_PASSES][NBUCKETS];
    RCCMPEntry *tmp = av_malloc_array(size, sizeof(*tmp));
    radix_count(data, size, buckets);
    radix_sort_pass(tmp, data, size, buckets[0], 0);
    radix_sort_pass(data, tmp, size, buckets[1], 1);
    if (buckets[2][NBUCKETS - 1] || buckets[3][NBUCKETS - 1]) {
        radix_sort_pass(tmp, data, size, buckets[2], 2);
        radix_sort_pass(data, tmp, size, buckets[3], 3);
    }
    av_free(tmp);
}

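/* Fast rate control: find a frame-wide qscale that fits, then, if needed,
 * bump individual macroblocks (cheapest first, as ordered by the radix sort)
 * to qscale + 1 until the frame fits the bit budget. */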
static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
{
    int max_bits = 0;
    int ret, x, y;
    if ((ret = dnxhd_find_qscale(ctx)) < 0)
        return ret;
    for (y = 0; y < ctx->m.mb_height; y++) {
        for (x = 0; x < ctx->m.mb_width; x++) {
            int mb = y * ctx->m.mb_width + x;
            int delta_bits;
            ctx->mb_qscale[mb] = ctx->qscale;
            ctx->mb_bits[mb]   = ctx->mb_rc[ctx->qscale][mb].bits;
            max_bits += ctx->mb_rc[ctx->qscale][mb].bits;
            if (!RC_VARIANCE) {
                delta_bits = ctx->mb_rc[ctx->qscale][mb].bits -
                             ctx->mb_rc[ctx->qscale + 1][mb].bits;
                ctx->mb_cmp[mb].mb = mb;
                ctx->mb_cmp[mb].value =
                    delta_bits ? ((ctx->mb_rc[ctx->qscale][mb].ssd -
                                   ctx->mb_rc[ctx->qscale + 1][mb].ssd) * 100) /
                                  delta_bits
                               : INT_MIN; // avoid increasing qscale
            }
        }
        max_bits += 31; // worst padding
    }
    if (!ret) {
        if (RC_VARIANCE)
            avctx->execute2(avctx, dnxhd_mb_var_thread,
                            NULL, NULL, ctx->m.mb_height);
        radix_sort(ctx->mb_cmp, ctx->m.mb_num);
        for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
            int mb = ctx->mb_cmp[x].mb;
            max_bits -= ctx->mb_rc[ctx->qscale][mb].bits -
                        ctx->mb_rc[ctx->qscale + 1][mb].bits;
            ctx->mb_qscale[mb] = ctx->qscale + 1;
            ctx->mb_bits[mb]   = ctx->mb_rc[ctx->qscale + 1][mb].bits;
        }
    }
    return 0;
}

static void dnxhd_load_picture(DNXHDEncContext *ctx, const AVFrame *frame)
{
    int i;

    for (i = 0; i < ctx->m.avctx->thread_count; i++) {
    }

}

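/* Encode one complete frame: write the header, pick per-macroblock quantizers
 * with the RD or fast rate control, encode the slices in parallel, then pad
 * the coding unit; interlaced input loops back to code the second field. */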
static int dnxhd_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
                                const AVFrame *frame, int *got_packet)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int first_field = 1;
    int offset, i, ret;
    uint8_t *buf;

    if ((ret = ff_alloc_packet2(avctx, pkt, ctx->cid_table->frame_size)) < 0)
        return ret;
    buf = pkt->data;

    dnxhd_load_picture(ctx, frame);

encode_coding_unit:
    for (i = 0; i < 3; i++) {
        ctx->src[i] = frame->data[i];
        if (ctx->interlaced && ctx->cur_field)
            ctx->src[i] += frame->linesize[i];
    }

    dnxhd_write_header(avctx, buf);

    if (avctx->mb_decision == FF_MB_DECISION_RD)
        ret = dnxhd_encode_rdo(avctx, ctx);
    else
        ret = dnxhd_encode_fast(avctx, ctx);
    if (ret < 0) {
        av_log(avctx, AV_LOG_ERROR,
               "picture could not fit ratecontrol constraints, increase qmax\n");
        return ret;
    }

    dnxhd_setup_threads_slices(ctx);

    offset = 0;
    for (i = 0; i < ctx->m.mb_height; i++) {
        AV_WB32(ctx->msip + i * 4, offset);
        offset += ctx->slice_size[i];
    }

    avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);

    memset(buf + 640 + offset, 0,
           ctx->cid_table->coding_unit_size - 4 - offset - 640);


    if (ctx->interlaced && first_field) {
        first_field = 0;
        ctx->cur_field ^= 1;
        buf += ctx->cid_table->coding_unit_size;
        goto encode_coding_unit;
    }


    pkt->flags |= AV_PKT_FLAG_KEY;
    *got_packet = 1;
    return 0;
}

static av_cold int dnxhd_encode_end(AVCodecContext *avctx)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int i;






    return 0;
}

static const AVCodecDefault dnxhd_defaults[] = {
    { "qmax", "1024" }, /* Maximum quantization scale factor allowed for VC-3 */
    { NULL },
};

AVCodec ff_dnxhd_encoder = {
    .name           = "dnxhd",
    .long_name      = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_DNXHD,
    .priv_data_size = sizeof(DNXHDEncContext),
    .init           = dnxhd_encode_init,
    .encode2        = dnxhd_encode_picture,
    .close          = dnxhd_encode_end,
    .capabilities   = AV_CODEC_CAP_SLICE_THREADS,
    .pix_fmts       = (const enum AVPixelFormat[]) {
        AV_PIX_FMT_YUV422P,
        AV_PIX_FMT_YUV422P10,
        AV_PIX_FMT_NONE
    },
    .priv_class     = &dnxhd_class,
    .defaults       = dnxhd_defaults,
};