1 /*
2 * AAC encoder
3 * Copyright (C) 2008 Konstantin Shishkov
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * AAC encoder
25 */
26
27 /***********************************
28 * TODOs:
29 * add sane pulse detection
30 ***********************************/
32
44
50
52
54 {
60
62
65
72
73 put_bits(pb, 1, 0);
/* Stereo mixdown */
74 put_bits(pb, 1, 0);
/* Mono mixdown */
75 put_bits(pb, 1, 0);
/* Something else */
76
77 for (
i = 0;
i < 4;
i++) {
78 for (j = 0; j < pce->
num_ele[
i]; j++) {
82 }
83 }
84
88 }
89
90 /**
91 * Make AAC audio config object.
92 * @see 1.6.2.1 "Syntax - AudioSpecificConfig"
93 */
95 {
98 int channels = (!
s->needs_pce)*(
s->channels - (
s->channels == 8 ? 1 : 0));
99 const int max_size = 32;
100
104
106 put_bits(&pb, 5,
s->profile+1);
//profile
107 put_bits(&pb, 4,
s->samplerate_index);
//sample rate index
109 //GASpecificConfig
110 put_bits(&pb, 1, 0);
//frame length - 1024 samples
111 put_bits(&pb, 1, 0);
//does not depend on core coder
112 put_bits(&pb, 1, 0);
//is not extension
115
116 //Explicitly Mark SBR absent
117 put_bits(&pb, 11, 0x2b7);
//sync extension
122
123 return 0;
124 }
125
127 {
128 ++
s->quantize_band_cost_cache_generation;
129 if (
s->quantize_band_cost_cache_generation == 0) {
130 memset(
s->quantize_band_cost_cache, 0,
sizeof(
s->quantize_band_cost_cache));
131 s->quantize_band_cost_cache_generation = 1;
132 }
133 }
134
135 #define WINDOW_FUNC(type) \
136 static void apply_ ##type ##_window(AVFloatDSPContext *fdsp, \
137 SingleChannelElement *sce, \
138 const float *audio)
139
141 {
144 float *
out = sce->ret_buf;
145
146 fdsp->vector_fmul (
out, audio, lwindow, 1024);
147 fdsp->vector_fmul_reverse(
out + 1024, audio + 1024, pwindow, 1024);
148 }
149
151 {
154 float *
out = sce->ret_buf;
155
156 fdsp->vector_fmul(
out, audio, lwindow, 1024);
157 memcpy(
out + 1024, audio + 1024,
sizeof(
out[0]) * 448);
158 fdsp->vector_fmul_reverse(
out + 1024 + 448, audio + 1024 + 448, swindow, 128);
159 memset(
out + 1024 + 576, 0,
sizeof(
out[0]) * 448);
160 }
161
163 {
166 float *
out = sce->ret_buf;
167
168 memset(
out, 0,
sizeof(
out[0]) * 448);
169 fdsp->vector_fmul(
out + 448, audio + 448, swindow, 128);
170 memcpy(
out + 576, audio + 576,
sizeof(
out[0]) * 448);
171 fdsp->vector_fmul_reverse(
out + 1024, audio + 1024, lwindow, 1024);
172 }
173
175 {
178 const float *in = audio + 448;
179 float *
out = sce->ret_buf;
181
182 for (
w = 0;
w < 8;
w++) {
183 fdsp->vector_fmul (
out, in,
w ? pwindow : swindow, 128);
185 in += 128;
186 fdsp->vector_fmul_reverse(
out, in, swindow, 128);
188 }
189 }
190
193 const float *audio) = {
198 };
199
201 float *audio)
202 {
205
207
210 else
211 for (
i = 0;
i < 1024;
i += 128)
213 memcpy(audio, audio + 1024, sizeof(audio[0]) * 1024);
215 }
216
217 /**
218 * Encode ics_info element.
219 * @see Table 4.6 (syntax of ics_info)
220 */
222 {
224
225 put_bits(&
s->pb, 1, 0);
// ics_reserved bit
231 } else {
233 for (
w = 1;
w < 8;
w++)
235 }
236 }
237
238 /**
239 * Encode MS data.
240 * @see 4.6.8.1 "Joint Coding - M/S Stereo"
241 */
243 {
245
251 }
252
253 /**
254 * Produce integer coefficients from scalefactors provided by the model.
255 */
257 {
259 int maxsfb, cmaxsfb;
260
261 for (ch = 0; ch < chans; ch++) {
263 maxsfb = 0;
267 for (cmaxsfb = ics->
num_swb; cmaxsfb > 0 && cpe->
ch[ch].
zeroes[
w*16+cmaxsfb-1]; cmaxsfb--)
268 ;
269 maxsfb =
FFMAX(maxsfb, cmaxsfb);
270 }
271 }
273
274 //adjust zero bands for window groups
281 break;
282 }
283 }
285 }
286 }
287 }
288
292 int msc = 0;
298 msc++;
299 if (msc == 0 || ics0->
max_sfb == 0)
301 else
303 }
304 }
305
307 {
311 return;
314 int start = (
w+w2) * 128;
320 continue;
321 }
323 p *= -1;
328 }
330 }
331 }
332 }
333 }
334
336 {
340 return;
343 int start = (
w+w2) * 128;
345 /* ms_mask can be used for other purposes in PNS and I/S,
346 * so must not apply M/S if any band uses either, even if
347 * ms_mask is set.
348 */
353 continue;
354 }
360 }
362 }
363 }
364 }
365 }
366
367 /**
368 * Encode scalefactor band coding type.
369 */
371 {
373
374 if (
s->coder->set_special_band_scalefactors)
375 s->coder->set_special_band_scalefactors(
s, sce);
376
379 }
380
381 /**
382 * Encode scalefactors.
383 */
386 {
388 int off_is = 0, noise_flag = 1;
390
397 if (noise_flag-- > 0) {
399 continue;
400 }
405 } else {
408 }
412 }
413 }
414 }
415 }
416
417 /**
418 * Encode pulse data.
419 */
421 {
423
426 return;
427
433 }
434 }
435
436 /**
437 * Encode spectral coefficients processed by psychoacoustic model.
438 */
440 {
442
444 start = 0;
448 continue;
449 }
451 s->coder->quantize_and_encode_band(
s, &
s->pb,
452 &sce->
coeffs[start + w2*128],
458 }
460 }
461 }
462 }
463
464 /**
465 * Downscale spectral coefficients for near-clipping windows to avoid artifacts
466 */
468 {
470
473 start = 0;
475 float *swb_coeffs = &sce->
coeffs[start +
w*128];
479 }
480 }
481 }
482 }
483
484 /**
485 * Encode one channel of audio data.
486 */
489 int common_window)
490 {
492 if (!common_window) {
494 if (
s->coder->encode_main_pred)
495 s->coder->encode_main_pred(
s, sce);
496 if (
s->coder->encode_ltp_info)
497 s->coder->encode_ltp_info(
s, sce, 0);
498 }
503 if (
s->coder->encode_tns_info)
504 s->coder->encode_tns_info(
s, sce);
507 return 0;
508 }
509
510 /**
511 * Write some auxiliary information about the created AAC file.
512 */
514 {
515 int i, namelen, padbits;
516
517 namelen = strlen(
name) + 2;
520 if (namelen >= 15)
522 put_bits(&
s->pb, 4, 0);
//extension type - filler
525 for (
i = 0;
i < namelen - 2;
i++)
528 }
529
530 /*
531 * Copy input samples.
532 * Channels are reordered from libavcodec's default order to AAC order.
533 */
535 {
536 int ch;
537 int end = 2048 + (
frame ?
frame->nb_samples : 0);
538 const uint8_t *channel_map =
s->reorder_map;
539
540 /* copy and remap input samples */
541 for (ch = 0; ch <
s->channels; ch++) {
542 /* copy last 1024 samples of previous frame to the start of the current frame */
543 memcpy(&
s->planar_samples[ch][1024], &
s->planar_samples[ch][2048], 1024 *
sizeof(
s->planar_samples[0][0]));
544
545 /* copy new samples and zero any remaining samples */
547 memcpy(&
s->planar_samples[ch][2048],
548 frame->extended_data[channel_map[ch]],
549 frame->nb_samples *
sizeof(
s->planar_samples[0][0]));
550 }
551 memset(&
s->planar_samples[ch][end], 0,
552 (3072 - end) *
sizeof(
s->planar_samples[0][0]));
553 }
554 }
555
558 {
560 float **
samples =
s->planar_samples, *samples2, *la, *overlap;
564 int i, its, ch,
w, chans,
tag, start_ch,
ret, frame_bits;
565 int target_bits, rate_bits, too_many_bits, too_few_bits;
566 int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
567 int chan_el_counter[4];
569
570 /* add current frame to queue */
574 } else {
575 if (!
s->afq.remaining_samples || (!
s->afq.frame_alloc && !
s->afq.frame_count))
576 return 0;
577 }
578
582
584 return 0;
585
586 start_ch = 0;
587 for (
i = 0;
i <
s->chan_map[0];
i++) {
589 tag =
s->chan_map[
i+1];
592 for (ch = 0; ch < chans; ch++) {
593 int k;
594 float clip_avoidance_factor;
597 s->cur_channel = start_ch + ch;
598 overlap = &
samples[
s->cur_channel][0];
599 samples2 = overlap + 1024;
600 la = samples2 + (448+64);
609
610 /* Only the lowest 12 coefficients are used in a LFE channel.
611 * The expression below results in only the bottom 8 coefficients
612 * being used for 11.025kHz to 16kHz sample rates.
613 */
614 ics->
num_swb =
s->samplerate_index >= 8 ? 1 : 3;
615 } else {
616 wi[ch] =
s->psy.model->window(&
s->psy, samples2, la,
s->cur_channel,
618 }
633
636
637 /* Calculate input sample maximums and evaluate clipping risk */
638 clip_avoidance_factor = 0.0f;
640 const float *wbuf = overlap +
w * 128;
643 int j;
644 /* mdct input is 2 * output */
645 for (j = 0; j < wlen; j++)
648 }
652 clip_avoidance_factor =
FFMAX(clip_avoidance_factor, wi[ch].clipping[
w]);
653 } else {
655 }
656 }
659 } else {
661 }
662
664
665 if (
s->options.ltp &&
s->coder->update_ltp) {
666 s->coder->update_ltp(
s, sce);
669 }
670
671 for (k = 0; k < 1024; k++) {
672 if (!(
fabs(cpe->
ch[ch].
coeffs[k]) < 1E16)) {
// Ensure headroom for energy calculation
675 }
676 }
678 }
679 start_ch += chans;
680 }
683 frame_bits = its = 0;
684 do {
686
689 start_ch = 0;
690 target_bits = 0;
691 memset(chan_el_counter, 0, sizeof(chan_el_counter));
692 for (
i = 0;
i <
s->chan_map[0];
i++) {
694 const float *coeffs[2];
695 tag =
s->chan_map[
i+1];
703 for (ch = 0; ch < chans; ch++) {
711 for (
w = 0;
w < 128;
w++)
714 }
715 s->psy.bitres.alloc = -1;
716 s->psy.bitres.bits =
s->last_frame_pb_count /
s->channels;
717 s->psy.model->analyze(&
s->psy, start_ch, coeffs, wi);
718 if (
s->psy.bitres.alloc > 0) {
719 /* Lambda unused here on purpose, we need to take psy's unscaled allocation */
720 target_bits +=
s->psy.bitres.alloc
722 s->psy.bitres.alloc /= chans;
723 }
725 for (ch = 0; ch < chans; ch++) {
726 s->cur_channel = start_ch + ch;
727 if (
s->options.pns &&
s->coder->mark_pns)
728 s->coder->mark_pns(
s, avctx, &cpe->
ch[ch]);
729 s->coder->search_for_quantizers(avctx,
s, &cpe->
ch[ch],
s->lambda);
730 }
731 if (chans > 1
732 && wi[0].window_type[0] == wi[1].window_type[0]
733 && wi[0].window_shape == wi[1].window_shape) {
734
737 if (wi[0].grouping[
w] != wi[1].grouping[
w]) {
739 break;
740 }
741 }
742 }
743 for (ch = 0; ch < chans; ch++) { /* TNS and PNS */
745 s->cur_channel = start_ch + ch;
746 if (
s->options.tns &&
s->coder->search_for_tns)
747 s->coder->search_for_tns(
s, sce);
748 if (
s->options.tns &&
s->coder->apply_tns_filt)
749 s->coder->apply_tns_filt(
s, sce);
751 tns_mode = 1;
752 if (
s->options.pns &&
s->coder->search_for_pns)
753 s->coder->search_for_pns(
s, avctx, sce);
754 }
755 s->cur_channel = start_ch;
756 if (
s->options.intensity_stereo) {
/* Intensity Stereo */
757 if (
s->coder->search_for_is)
758 s->coder->search_for_is(
s, avctx, cpe);
761 }
762 if (
s->options.pred) {
/* Prediction */
763 for (ch = 0; ch < chans; ch++) {
765 s->cur_channel = start_ch + ch;
766 if (
s->options.pred &&
s->coder->search_for_pred)
767 s->coder->search_for_pred(
s, sce);
769 }
770 if (
s->coder->adjust_common_pred)
771 s->coder->adjust_common_pred(
s, cpe);
772 for (ch = 0; ch < chans; ch++) {
774 s->cur_channel = start_ch + ch;
775 if (
s->options.pred &&
s->coder->apply_main_pred)
776 s->coder->apply_main_pred(
s, sce);
777 }
778 s->cur_channel = start_ch;
779 }
780 if (
s->options.mid_side) {
/* Mid/Side stereo */
781 if (
s->options.mid_side == -1 &&
s->coder->search_for_ms)
782 s->coder->search_for_ms(
s, cpe);
786 }
788 if (
s->options.ltp) {
/* LTP */
789 for (ch = 0; ch < chans; ch++) {
791 s->cur_channel = start_ch + ch;
792 if (
s->coder->search_for_ltp)
795 }
796 s->cur_channel = start_ch;
797 if (
s->coder->adjust_common_ltp)
798 s->coder->adjust_common_ltp(
s, cpe);
799 }
800 if (chans == 2) {
804 if (
s->coder->encode_main_pred)
805 s->coder->encode_main_pred(
s, &cpe->
ch[0]);
806 if (
s->coder->encode_ltp_info)
807 s->coder->encode_ltp_info(
s, &cpe->
ch[0], 1);
810 }
811 }
812 for (ch = 0; ch < chans; ch++) {
813 s->cur_channel = start_ch + ch;
815 }
816 start_ch += chans;
817 }
818
820 /* When using a constant Q-scale, don't mess with lambda */
821 break;
822 }
823
824 /* rate control stuff
825 * allow between the nominal bitrate, and what psy's bit reservoir says to target
826 * but drift towards the nominal bitrate always
827 */
830 rate_bits =
FFMIN(rate_bits, 6144 *
s->channels - 3);
831 too_many_bits =
FFMAX(target_bits, rate_bits);
832 too_many_bits =
FFMIN(too_many_bits, 6144 *
s->channels - 3);
833 too_few_bits =
FFMIN(
FFMAX(rate_bits - rate_bits/4, target_bits), too_many_bits);
834
835 /* When using ABR, be strict (but only for increasing) */
836 too_few_bits = too_few_bits - too_few_bits/8;
837 too_many_bits = too_many_bits + too_many_bits/2;
838
839 if ( its == 0 /* for steady-state Q-scale tracking */
840 || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
841 || frame_bits >= 6144 *
s->channels - 3 )
842 {
843 float ratio = ((float)rate_bits) / frame_bits;
844
845 if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
846 /*
847 * This path is for steady-state Q-scale tracking
848 * When frame bits fall within the stable range, we still need to adjust
849 * lambda to maintain it like so in a stable fashion (large jumps in lambda
850 * create artifacts and should be avoided), but slowly
851 */
852 ratio = sqrtf(sqrtf(ratio));
854 } else {
855 /* Not so fast though */
856 ratio = sqrtf(ratio);
857 }
858 s->lambda =
av_clipf(
s->lambda * ratio, FLT_EPSILON, 65536.f);
859
860 /* Keep iterating if we must reduce and lambda is in the sky */
861 if (ratio > 0.9
f && ratio < 1.1
f) {
862 break;
863 } else {
864 if (is_mode || ms_mode || tns_mode || pred_mode) {
865 for (
i = 0;
i <
s->chan_map[0];
i++) {
866 // Must restore coeffs
869 for (ch = 0; ch < chans; ch++)
871 }
872 }
873 its++;
874 }
875 } else {
876 break;
877 }
878 } while (1);
879
880 if (
s->options.ltp &&
s->coder->ltp_insert_new_frame)
881 s->coder->ltp_insert_new_frame(
s);
882
885
888
889 s->lambda_sum +=
s->lambda;
891
894
895 *got_packet_ptr = 1;
896 return 0;
897 }
898
900 {
902
904
915 return 0;
916 }
917
919 {
921
925
926 // window init
928
933
934 return 0;
935 }
936
938 {
939 int ch;
943
944 for(ch = 0; ch <
s->channels; ch++)
945 s->planar_samples[ch] =
s->buffer.samples + 3 * 1024 * ch;
946
947 return 0;
948 }
949
951 {
954 const uint8_t *
sizes[2];
956 int lengths[2];
957
958 /* Constants */
959 s->last_frame_pb_count = 0;
963
964 /* Channel map and unspecified bitrate guessing */
966
970 s->needs_pce =
s->options.pce;
971 break;
972 }
973 }
974
976 char buf[64];
979 break;
982 av_log(avctx,
AV_LOG_INFO,
"Using a PCE to encode channel layout \"%s\"\n", buf);
984 s->reorder_map =
s->pce.reorder_map;
985 s->chan_map =
s->pce.config_map;
986 } else {
989 }
990
992 for (
i = 1;
i <=
s->chan_map[0];
i++) {
994 s->chan_map[
i] ==
TYPE_LFE ? 16000 :
/* LFE */
995 69000 ; /* SCE */
996 }
997 }
998
999 /* Samplerate */
1000 for (
i = 0;
i < 16;
i++)
1002 break;
1003 s->samplerate_index =
i;
1007 "Unsupported sample rate %d\n", avctx->
sample_rate);
1008
1009 /* Bitrate limiting */
1011 "Too many bits %f > %d per frame requested, clamping to max\n",
1013 6144 *
s->channels);
1016
1017 /* Profile and option setting */
1022 break;
1026 "Main prediction unavailable in the \"mpeg2_aac_low\" profile\n");
1028 "LTP prediction unavailable in the \"mpeg2_aac_low\" profile\n");
1030 "PNS unavailable in the \"mpeg2_aac_low\" profile, turning off\n");
1035 "Main prediction unavailable in the \"aac_ltp\" profile\n");
1037 s->options.pred = 1;
1039 "LTP prediction unavailable in the \"aac_main\" profile\n");
1040 }
else if (
s->options.ltp) {
1043 "Chainging profile to \"aac_ltp\"\n");
1045 "Main prediction unavailable in the \"aac_ltp\" profile\n");
1046 }
else if (
s->options.pred) {
1049 "Chainging profile to \"aac_main\"\n");
1051 "LTP prediction unavailable in the \"aac_main\" profile\n");
1052 }
1054
1055 /* Coder limitations */
1059 "The ANMR coder is considered experimental, add -strict -2 to enable!\n");
1060 s->options.intensity_stereo = 0;
1062 }
1064 "The LPT profile requires experimental compliance, add -strict -2 to enable!\n");
1065
1066 /* M/S introduces horrible artifacts with multichannel files, this is temporary */
1067 if (
s->channels > 3)
1068 s->options.mid_side = 0;
1069
1072
1075
1078
1083 for (
i = 0;
i <
s->chan_map[0];
i++)
1086 s->chan_map[0], grouping)) < 0)
1090 s->random_state = 0x1f2e3d4c;
1091
1094
1095 if (ARCH_X86)
1097
1098 if (HAVE_MIPSDSP)
1100
1103
1104 return 0;
1105 }
1106
1107 #define AACENC_FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM
1109 {
"aac_coder",
"Coding algorithm", offsetof(
AACEncContext,
options.coder),
AV_OPT_TYPE_INT, {.i64 =
AAC_CODER_TWOLOOP}, 0,
AAC_CODER_NB-1,
AACENC_FLAGS,
"coder"},
1122 };
1123
1129 };
1130
1132 { "b", "0" },
1134 };
1135
1152 };