1 /*
2 * ALAC audio encoder
3 * Copyright (c) 2008 Jaikrishnan Menon <realityman@gmx.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
23
31
/* Frame and extradata sizing constants. */
/* NOTE(review): presumably the default number of samples per frame; the use
 * site is not visible in this excerpt -- confirm. */
32 #define DEFAULT_FRAME_SIZE 4096
/* NOTE(review): presumably the size in bytes of the extradata buffer. The
 * visible init code writes 8-bit fields at offsets 18..20 and a 32-bit field
 * at offset 24 of that buffer -- confirm against the allocation. */
33 #define ALAC_EXTRADATA_SIZE 36
/* Equals the number of bits the element-header writer in this file emits when
 * the sample count is included: 3 + 4 + 12 + 1 + 2 + 1 + 32 = 55.
 * NOTE(review): the unit (bits) is inferred from that match -- confirm at the
 * use site, which is not visible here. */
34 #define ALAC_FRAME_HEADER_SIZE 55
/* NOTE(review): presumably the frame footer size in the same unit as
 * ALAC_FRAME_HEADER_SIZE; the use site is not visible here -- confirm. */
35 #define ALAC_FRAME_FOOTER_SIZE 3
36
/* Entropy coder escape marker: 0x1FF is nine consecutive 1 bits.
 * The scalar entropy writer in this file takes its "escape" path (sample
 * value written directly) when the quotient exceeds 8.
 * NOTE(review): the statement that emits this code is not visible in this
 * excerpt -- confirm the bit width used there. */
37 #define ALAC_ESCAPE_CODE 0x1FF
/* NOTE(review): presumably the upper bound on the LPC order; use site not
 * visible here -- confirm. */
38 #define ALAC_MAX_LPC_ORDER 30
/* NOTE(review): presumably the defaults for the context's
 * max_prediction_order / min_prediction_order fields, which the init code
 * requires to satisfy max >= min -- confirm where they are assigned. */
39 #define DEFAULT_MAX_PRED_ORDER 6
40 #define DEFAULT_MIN_PRED_ORDER 4
/* NOTE(review): presumably LPC coefficient precision and quantisation shift
 * bounds handed to the LPC calculation; the call is only partly visible in
 * this excerpt -- confirm. */
41 #define ALAC_MAX_LPC_PRECISION 9
42 #define ALAC_MIN_LPC_SHIFT 0
43 #define ALAC_MAX_LPC_SHIFT 9
44
/* Stereo channel modes.
 * The values coincide with the score[] indices used by the stereo-mode
 * estimator in this file, which returns the index of the lowest score:
 *   score[0] = sum[0] + sum[1]
 *   score[1] = sum[0] + sum[3]
 *   score[2] = sum[1] + sum[3]
 *   score[3] = sum[2] + sum[3]
 * where sum[2] accumulates |(lt + rt) >> 1| and sum[3] accumulates |lt - rt|.
 * NOTE(review): the accumulation of sum[0] / sum[1] is not visible in this
 * excerpt; presumably they hold the left and right residual sums -- confirm. */
45 #define ALAC_CHMODE_LEFT_RIGHT 0
46 #define ALAC_CHMODE_LEFT_SIDE 1
47 #define ALAC_CHMODE_RIGHT_SIDE 2
48 #define ALAC_CHMODE_MID_SIDE 3
49
56
62
67 int verbatim;
/**< current frame verbatim mode flag */
83
84
87 {
90 s->avctx->bits_per_raw_sample;
91
/* COPY_SAMPLES(type): for every channel ch in [0, channels), reinterpret
 * samples[ch] as an array of const `type` and copy s->frame_size elements
 * into s->sample_buf[ch], right-shifting each element by `shift`.
 *
 * The macro is not self-contained: it uses ch, i, channels, samples, shift
 * and s from the enclosing scope.
 * NOTE(review): the declaration of `shift` is only partly visible here; it
 * involves s->avctx->bits_per_raw_sample -- confirm the full expression.
 * Wrapped in do { ... } while (0) so it expands to a single statement. */
92 #define COPY_SAMPLES(type) do { \
93 for (ch = 0; ch < channels; ch++) { \
94 int32_t *bptr = s->sample_buf[ch]; \
95 const type *sptr = (const type *)samples[ch]; \
96 for (i = 0; i < s->frame_size; i++) \
97 bptr[i] = sptr[i] >> shift; \
98 } \
99 } while (0)
100
103 else
105 }
106
108 int k, int write_sample_size)
109 {
111
112 k =
FFMIN(k,
s->rc.k_modifier);
113 divisor = (1<<k) - 1;
114 q = x / divisor;
116
117 if (q > 8) {
118 // write escape code and sample value directly
120 put_bits(&
s->pbctx, write_sample_size, x);
121 } else {
122 if (q)
125
126 if (k != 1) {
129 else
131 }
132 }
133 }
134
137 int instance)
138 {
139 int encode_fs = 0;
140
142 encode_fs = 1;
143
144 put_bits(&
s->pbctx, 3, element);
// element type
145 put_bits(&
s->pbctx, 4, instance);
// element instance
146 put_bits(&
s->pbctx, 12, 0);
// unused header bits
147 put_bits(&
s->pbctx, 1, encode_fs);
// Sample count is in the header
148 put_bits(&
s->pbctx, 2,
s->extra_bits >> 3);
// Extra bytes (for 24-bit)
149 put_bits(&
s->pbctx, 1,
s->verbatim);
// Audio block is verbatim
150 if (encode_fs)
151 put_bits32(&
s->pbctx,
s->frame_size);
// No. of samples in the frame
152 }
153
155 {
158 int opt_order;
159
160 if (
s->compression_level == 1) {
161 s->lpc[ch].lpc_order = 6;
162 s->lpc[ch].lpc_quant = 6;
163 s->lpc[ch].lpc_coeff[0] = 160;
164 s->lpc[ch].lpc_coeff[1] = -190;
165 s->lpc[ch].lpc_coeff[2] = 170;
166 s->lpc[ch].lpc_coeff[3] = -130;
167 s->lpc[ch].lpc_coeff[4] = 80;
168 s->lpc[ch].lpc_coeff[5] = -25;
169 } else {
172 s->min_prediction_order,
173 s->max_prediction_order,
178
179 s->lpc[ch].lpc_order = opt_order;
180 s->lpc[ch].lpc_quant =
shift[opt_order-1];
181 memcpy(
s->lpc[ch].lpc_coeff, coefs[opt_order-1], opt_order*
sizeof(
int));
182 }
183 }
184
186 {
189 uint64_t sum[4];
190 uint64_t score[4];
191
192 /* calculate sum of 2nd order residual for each channel */
193 sum[0] = sum[1] = sum[2] = sum[3] = 0;
194 for (
i = 2;
i < n;
i++) {
195 lt = left_ch[
i] - 2 * left_ch[
i - 1] + left_ch[
i - 2];
196 rt = right_ch[
i] - 2 * right_ch[
i - 1] + right_ch[
i - 2];
197 sum[2] +=
FFABS((lt + rt) >> 1);
198 sum[3] +=
FFABS(lt - rt);
201 }
202
203 /* calculate score for each mode */
204 score[0] = sum[0] + sum[1];
205 score[1] = sum[0] + sum[3];
206 score[2] = sum[1] + sum[3];
207 score[3] = sum[2] + sum[3];
208
209 /* return mode with lowest score */
210 best = 0;
211 for (
i = 1;
i < 4;
i++) {
212 if (score[
i] < score[best])
214 }
215 return best;
216 }
217
219 {
221 int i,
mode, n =
s->frame_size;
223
225
228 s->interlacing_leftweight = 0;
229 s->interlacing_shift = 0;
230 break;
232 for (
i = 0;
i < n;
i++)
234 s->interlacing_leftweight = 1;
235 s->interlacing_shift = 0;
236 break;
238 for (
i = 0;
i < n;
i++) {
242 }
243 s->interlacing_leftweight = 1;
244 s->interlacing_shift = 31;
245 break;
246 default:
247 for (
i = 0;
i < n;
i++) {
250 right[
i] =
tmp - right[
i];
251 }
252 s->interlacing_leftweight = 1;
253 s->interlacing_shift = 1;
254 break;
255 }
256 }
257
259 {
262 int32_t *residual =
s->predictor_buf[ch];
263
265 residual[0] =
s->sample_buf[ch][0];
266
267 for (
i = 1;
i <
s->frame_size;
i++) {
268 residual[
i] =
s->sample_buf[ch][
i ] -
269 s->sample_buf[ch][
i - 1];
270 }
271
272 return;
273 }
274
275 // generalised linear predictor
276
279
280 // generate warm-up samples
284
285 // perform lpc on remaining samples
287 int sum = 1 << (lpc.
lpc_quant - 1), res_val, j;
288
292 }
293
297 s->write_sample_size);
298 res_val = residual[
i];
299
300 if (res_val) {
302 int neg = (res_val < 0);
303
304 while (
index >= 0 && (neg ? (res_val < 0) : (res_val > 0))) {
307
308 if (neg)
309 sign *= -1;
310
315 }
316 }
318 }
319 }
320 }
321
323 {
324 unsigned int history =
s->rc.initial_history;
325 int sign_modifier = 0,
i, k;
327
328 for (
i = 0;
i <
s->frame_size;) {
329 int x;
330
331 k =
av_log2((history >> 9) + 3);
332
333 x = -2 * (*samples) -1;
334 x ^= x >> 31;
335
338
340
341 history += x *
s->rc.history_mult -
342 ((history *
s->rc.history_mult) >> 9);
343
344 sign_modifier = 0;
345 if (x > 0xFFFF)
346 history = 0xFFFF;
347
349 unsigned int block_size = 0;
350
351 k = 7 -
av_log2(history) + ((history + 16) >> 6);
352
356 block_size++;
357 }
359 sign_modifier = (block_size <= 0xFFFF);
360 history = 0;
361 }
362
363 }
364 }
365
368 const uint8_t *samples0, const uint8_t *samples1)
369 {
370 const uint8_t *
samples[2] = { samples0, samples1 };
372 int prediction_type = 0;
374
376
379 /* samples are channel-interleaved in verbatim mode */
381 int shift = 32 -
s->avctx->bits_per_raw_sample;
384 for (
i = 0;
i <
s->frame_size;
i++)
387 samples_s32[j][
i] >>
shift);
388 } else {
389 const int16_t *samples_s16[2] = { (const int16_t *)samples0,
390 (const int16_t *)samples1 };
391 for (
i = 0;
i <
s->frame_size;
i++)
395 }
396 } else {
397 s->write_sample_size =
s->avctx->bits_per_raw_sample -
s->extra_bits +
399
402
403 // extract extra bits if needed
405 uint32_t
mask = (1 <<
s->extra_bits) - 1;
407 int32_t *extra =
s->predictor_buf[j];
409 for (
i = 0;
i <
s->frame_size;
i++) {
411 smp[
i] >>=
s->extra_bits;
412 }
413 }
414 }
415
418 else
419 s->interlacing_shift =
s->interlacing_leftweight = 0;
421 put_bits(pb, 8,
s->interlacing_leftweight);
422
425
428
431 // predictor coeff. table
432 for (j = 0; j <
s->lpc[
i].lpc_order; j++)
434 }
435
436 // write extra bits if needed
438 for (
i = 0;
i <
s->frame_size;
i++) {
440 put_bits(pb,
s->extra_bits,
s->predictor_buf[j][
i]);
441 }
442 }
443 }
444
445 // apply lpc and entropy coding to audio samples
448
449 // TODO: determine when this will actually help. for now it's not used.
450 if (prediction_type == 15) {
451 // 2nd pass 1st order filter
453 for (j =
s->frame_size - 1; j > 0; j--)
454 residual[j] -= residual[j - 1];
455 }
457 }
458 }
459 }
460
463 {
467 int ch, element, sce, cpe;
468
470
471 ch = element = sce = cpe = 0;
472 while (ch < s->avctx->channels) {
473 if (ch_elements[element] ==
TYPE_CPE) {
476 cpe++;
477 ch += 2;
478 } else {
480 sce++;
481 ch++;
482 }
483 element++;
484 }
485
488
490 }
491
493 {
496 }
497
499 {
502 return 0;
503 }
504
506 {
509 uint8_t *alac_extradata;
510
512
517 } else {
520 }
521
522 // Set default compression level
524 s->compression_level = 2;
525 else
527
528 // Initialize default Rice parameters
529 s->rc.history_mult = 40;
530 s->rc.initial_history = 10;
531 s->rc.k_modifier = 14;
532 s->rc.rice_modifier = 4;
533
537
542
549 AV_WB32(alac_extradata+24,
s->max_coded_frame_size);
553
554 // Set relevant extradata fields
555 if (
s->compression_level > 0) {
556 AV_WB8(alac_extradata+18,
s->rc.history_mult);
557 AV_WB8(alac_extradata+19,
s->rc.initial_history);
558 AV_WB8(alac_extradata+20,
s->rc.k_modifier);
559 }
560
561 if (
s->max_prediction_order <
s->min_prediction_order) {
563 "invalid prediction orders: min=%d max=%d\n",
564 s->min_prediction_order,
s->max_prediction_order);
566 }
567
569
571 s->max_prediction_order,
574 }
575
576 return 0;
577 }
578
581 {
583 int out_bytes, max_frame_size,
ret;
584
585 s->frame_size =
frame->nb_samples;
586
590 else
591 max_frame_size =
s->max_coded_frame_size;
592
595
596 /* use verbatim mode for compression_level 0 */
597 if (
s->compression_level) {
600 } else {
603 }
604
606
607 if (out_bytes > max_frame_size) {
608 /* frame too large. use verbatim mode */
612 }
613
614 avpkt->
size = out_bytes;
615 *got_packet_ptr = 1;
616 return 0;
617 }
618
/* OFFSET(x): byte offset of member x inside AlacEncodeContext. */
619 #define OFFSET(x) offsetof(AlacEncodeContext, x)
/* AE: combined option flags (audio parameter | encoding parameter).
 * The replacement list is parenthesized so the macro stays a single operand
 * when used inside a larger expression (e.g. `AE & mask`), where the bare
 * `a | b` form would bind differently. */
620 #define AE (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM)
624
626 };
627
633 };
634
651 };
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:
=========================
1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames
   scaled_mv = (mv * (256 * (current_reference + 1) / (mv.reference + 1)) + 128) >> 8
2. the median of the scaled left