1 /*
2 * ALAC audio encoder
3 * Copyright (c) 2008 Jaikrishnan Menon <realityman@gmx.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
23
31
/* NOTE(review): presumably the default number of samples per frame; the
 * statement that consumes this value is not visible here -- confirm. */
32 #define DEFAULT_FRAME_SIZE 4096
/* NOTE(review): presumably the size in bytes of the alac_extradata buffer;
 * the visible writes touch offsets 18..20 and 24..27 of it -- confirm. */
33 #define ALAC_EXTRADATA_SIZE 36
/* Size in bits: equals the total emitted by the element-header writer below
 * when the sample count is included (3 + 4 + 12 + 1 + 2 + 1 + 32 = 55). */
34 #define ALAC_FRAME_HEADER_SIZE 55
/* NOTE(review): presumably also in bits (3 is the width of the element-type
 * field); the code that writes the footer is not visible here -- confirm. */
35 #define ALAC_FRAME_FOOTER_SIZE 3
36
/* 0x1FF is nine consecutive 1-bits. NOTE(review): the scalar encoder's
 * "q > 8" branch is commented as writing an escape code, but the statement
 * that uses this macro is not visible here -- confirm. */
37 #define ALAC_ESCAPE_CODE 0x1FF
/* LPC limits and default prediction-order range. NOTE(review): the code that
 * applies these bounds is not visible here; names suggest order, coefficient
 * precision (bits) and quantization shift limits -- confirm. */
38 #define ALAC_MAX_LPC_ORDER 30
39 #define DEFAULT_MAX_PRED_ORDER 6
40 #define DEFAULT_MIN_PRED_ORDER 4
41 #define ALAC_MAX_LPC_PRECISION 9
42 #define ALAC_MIN_LPC_SHIFT 0
43 #define ALAC_MAX_LPC_SHIFT 9
44
/* Stereo channel modes. The numbering lines up with the score[] indices in
 * the stereo-mode estimator below, whose sum[2] accumulates (lt + rt) >> 1
 * and sum[3] accumulates lt - rt. NOTE(review): the sum[0]/sum[1]
 * accumulation lines are missing from this view; presumably left and right. */
45 #define ALAC_CHMODE_LEFT_RIGHT 0
46 #define ALAC_CHMODE_LEFT_SIDE 1
47 #define ALAC_CHMODE_RIGHT_SIDE 2
48 #define ALAC_CHMODE_MID_SIDE 3
49
56
62
67 int verbatim;
/**< current frame verbatim mode flag */
83
84
87 {
90 s->avctx->bits_per_raw_sample;
91
92 #define COPY_SAMPLES(type) do { \
93 for (ch = 0; ch < channels; ch++) { \
94 int32_t *bptr = s->sample_buf[ch]; \
95 const type *sptr = (const type *)samples[ch]; \
96 for (i = 0; i < s->frame_size; i++) \
97 bptr[i] = sptr[i] >> shift; \
98 } \
99 } while (0)
100
103 else
105 }
106
108 int k, int write_sample_size)
109 {
111
112 k =
FFMIN(k,
s->rc.k_modifier);
113 divisor = (1<<k) - 1;
114 q = x / divisor;
116
117 if (q > 8) {
118 // write escape code and sample value directly
120 put_bits(&
s->pbctx, write_sample_size, x);
121 } else {
122 if (q)
125
126 if (k != 1) {
129 else
131 }
132 }
133 }
134
137 int instance)
138 {
139 int encode_fs = 0;
140
142 encode_fs = 1;
143
144 put_bits(&
s->pbctx, 3, element);
// element type
145 put_bits(&
s->pbctx, 4, instance);
// element instance
146 put_bits(&
s->pbctx, 12, 0);
// unused header bits
147 put_bits(&
s->pbctx, 1, encode_fs);
// Sample count is in the header
148 put_bits(&
s->pbctx, 2,
s->extra_bits >> 3);
// Extra bytes (for 24-bit)
149 put_bits(&
s->pbctx, 1,
s->verbatim);
// Audio block is verbatim
150 if (encode_fs)
151 put_bits32(&
s->pbctx,
s->frame_size);
// No. of samples in the frame
152 }
153
155 {
158 int opt_order;
159
160 if (
s->compression_level == 1) {
161 s->lpc[ch].lpc_order = 6;
162 s->lpc[ch].lpc_quant = 6;
163 s->lpc[ch].lpc_coeff[0] = 160;
164 s->lpc[ch].lpc_coeff[1] = -190;
165 s->lpc[ch].lpc_coeff[2] = 170;
166 s->lpc[ch].lpc_coeff[3] = -130;
167 s->lpc[ch].lpc_coeff[4] = 80;
168 s->lpc[ch].lpc_coeff[5] = -25;
169 } else {
172 s->min_prediction_order,
173 s->max_prediction_order,
178
179 s->lpc[ch].lpc_order = opt_order;
180 s->lpc[ch].lpc_quant =
shift[opt_order-1];
181 memcpy(
s->lpc[ch].lpc_coeff, coefs[opt_order-1], opt_order*
sizeof(
int));
182 }
183 }
184
186 {
189 uint64_t sum[4];
190 uint64_t score[4];
191
192 /* calculate sum of 2nd order residual for each channel */
193 sum[0] = sum[1] = sum[2] = sum[3] = 0;
194 for (
i = 2;
i < n;
i++) {
195 lt = left_ch[
i] - 2 * left_ch[
i - 1] + left_ch[
i - 2];
196 rt = right_ch[
i] - 2 * right_ch[
i - 1] + right_ch[
i - 2];
197 sum[2] +=
FFABS((lt + rt) >> 1);
198 sum[3] +=
FFABS(lt - rt);
201 }
202
203 /* calculate score for each mode */
204 score[0] = sum[0] + sum[1];
205 score[1] = sum[0] + sum[3];
206 score[2] = sum[1] + sum[3];
207 score[3] = sum[2] + sum[3];
208
209 /* return mode with lowest score */
210 best = 0;
211 for (
i = 1;
i < 4;
i++) {
212 if (score[
i] < score[best])
214 }
215 return best;
216 }
217
219 {
221 int i,
mode, n =
s->frame_size;
223
225
228 s->interlacing_leftweight = 0;
229 s->interlacing_shift = 0;
230 break;
232 for (
i = 0;
i < n;
i++)
234 s->interlacing_leftweight = 1;
235 s->interlacing_shift = 0;
236 break;
238 for (
i = 0;
i < n;
i++) {
242 }
243 s->interlacing_leftweight = 1;
244 s->interlacing_shift = 31;
245 break;
246 default:
247 for (
i = 0;
i < n;
i++) {
250 right[
i] =
tmp - right[
i];
251 }
252 s->interlacing_leftweight = 1;
253 s->interlacing_shift = 1;
254 break;
255 }
256 }
257
259 {
262 int32_t *residual =
s->predictor_buf[ch];
263
265 residual[0] =
s->sample_buf[ch][0];
266
267 for (
i = 1;
i <
s->frame_size;
i++) {
268 residual[
i] =
s->sample_buf[ch][
i ] -
269 s->sample_buf[ch][
i - 1];
270 }
271
272 return;
273 }
274
275 // generalised linear predictor
276
279
280 // generate warm-up samples
284
285 // perform lpc on remaining samples
287 int sum = 1 << (lpc.
lpc_quant - 1), res_val, j;
288
292 }
293
297 s->write_sample_size);
298 res_val = residual[
i];
299
300 if (res_val) {
302 int neg = (res_val < 0);
303
304 while (
index >= 0 && (neg ? (res_val < 0) : (res_val > 0))) {
307
308 if (neg)
309 sign *= -1;
310
315 }
316 }
318 }
319 }
320 }
321
323 {
324 unsigned int history =
s->rc.initial_history;
325 int sign_modifier = 0,
i, k;
327
328 for (
i = 0;
i <
s->frame_size;) {
329 int x;
330
331 k =
av_log2((history >> 9) + 3);
332
333 x = -2 * (*samples) -1;
334 x ^= x >> 31;
335
338
340
341 history += x *
s->rc.history_mult -
342 ((history *
s->rc.history_mult) >> 9);
343
344 sign_modifier = 0;
345 if (x > 0xFFFF)
346 history = 0xFFFF;
347
349 unsigned int block_size = 0;
350
351 k = 7 -
av_log2(history) + ((history + 16) >> 6);
352
356 block_size++;
357 }
359 sign_modifier = (block_size <= 0xFFFF);
360 history = 0;
361 }
362
363 }
364 }
365
368 const uint8_t *samples0, const uint8_t *samples1)
369 {
370 const uint8_t *
samples[2] = { samples0, samples1 };
372 int prediction_type = 0;
374
376
379 /* samples are channel-interleaved in verbatim mode */
381 int shift = 32 -
s->avctx->bits_per_raw_sample;
384 for (
i = 0;
i <
s->frame_size;
i++)
387 samples_s32[j][
i] >>
shift);
388 } else {
389 const int16_t *samples_s16[2] = { (const int16_t *)samples0,
390 (const int16_t *)samples1 };
391 for (
i = 0;
i <
s->frame_size;
i++)
395 }
396 } else {
397 s->write_sample_size =
s->avctx->bits_per_raw_sample -
s->extra_bits +
399
402
403 // extract extra bits if needed
405 uint32_t
mask = (1 <<
s->extra_bits) - 1;
407 int32_t *extra =
s->predictor_buf[j];
409 for (
i = 0;
i <
s->frame_size;
i++) {
411 smp[
i] >>=
s->extra_bits;
412 }
413 }
414 }
415
418 else
419 s->interlacing_shift =
s->interlacing_leftweight = 0;
421 put_bits(pb, 8,
s->interlacing_leftweight);
422
425
428
431 // predictor coeff. table
432 for (j = 0; j <
s->lpc[
i].lpc_order; j++)
434 }
435
436 // write extra bits if needed
438 for (
i = 0;
i <
s->frame_size;
i++) {
440 put_bits(pb,
s->extra_bits,
s->predictor_buf[j][
i]);
441 }
442 }
443 }
444
445 // apply lpc and entropy coding to audio samples
448
449 // TODO: determine when this will actually help. for now it's not used.
450 if (prediction_type == 15) {
451 // 2nd pass 1st order filter
453 for (j =
s->frame_size - 1; j > 0; j--)
454 residual[j] -= residual[j - 1];
455 }
457 }
458 }
459 }
460
463 {
465 int channels =
s->avctx->ch_layout.nb_channels;
468 int ch, element, sce, cpe;
469
471
472 ch = element = sce = cpe = 0;
474 if (ch_elements[element] ==
TYPE_CPE) {
477 cpe++;
478 ch += 2;
479 } else {
481 sce++;
482 ch++;
483 }
484 element++;
485 }
486
489
491 }
492
494 {
497 }
498
500 {
503 return 0;
504 }
505
507 {
510 uint8_t *alac_extradata;
511
513
518 } else {
521 }
522
523 // Set default compression level
525 s->compression_level = 2;
526 else
528
529 // Initialize default Rice parameters
530 s->rc.history_mult = 40;
531 s->rc.initial_history = 10;
532 s->rc.k_modifier = 14;
533 s->rc.rice_modifier = 4;
534
538
543
550 AV_WB32(alac_extradata+24,
s->max_coded_frame_size);
554
555 // Set relevant extradata fields
556 if (
s->compression_level > 0) {
557 AV_WB8(alac_extradata+18,
s->rc.history_mult);
558 AV_WB8(alac_extradata+19,
s->rc.initial_history);
559 AV_WB8(alac_extradata+20,
s->rc.k_modifier);
560 }
561
562 if (
s->max_prediction_order <
s->min_prediction_order) {
564 "invalid prediction orders: min=%d max=%d\n",
565 s->min_prediction_order,
s->max_prediction_order);
567 }
568
570
572 s->max_prediction_order,
575 }
576
577 return 0;
578 }
579
582 {
584 int out_bytes, max_frame_size,
ret;
585
587
591 else
592 max_frame_size =
s->max_coded_frame_size;
593
596
597 /* use verbatim mode for compression_level 0 */
598 if (
s->compression_level) {
601 } else {
604 }
605
607
608 if (out_bytes > max_frame_size) {
609 /* frame too large. use verbatim mode */
613 }
614
615 avpkt->
size = out_bytes;
616 *got_packet_ptr = 1;
617 return 0;
618 }
619
/* Byte offset of member x within AlacEncodeContext. */
620 #define OFFSET(x) offsetof(AlacEncodeContext, x)
/* Combined option flags: audio parameter | encoding parameter.
 * NOTE(review): the expansion is not parenthesized, so AE is only safe where
 * it forms a complete expression on its own (e.g. an initializer field);
 * next to a higher-precedence operator such as & it would bind unexpectedly.
 * The table that uses it is not visible here -- confirm before changing. */
621 #define AE AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
625
627 };
628
634 };
635
652 };
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left