FFmpeg: libavfilter/af_arnndn.c Source File

FFmpeg

[フレーム]

libavfilter

af_arnndn.c

Go to the documentation of this file.

1 /*

7 * Copyright (c) Jean-Marc Valin

9 *

10 * Redistribution and use in source and binary forms, with or without

11 * modification, are permitted provided that the following conditions

12 * are met:

13 *

14 * - Redistributions of source code must retain the above copyright

15 * notice, this list of conditions and the following disclaimer.

16 *

17 * - Redistributions in binary form must reproduce the above copyright

18 * notice, this list of conditions and the following disclaimer in the

19 * documentation and/or other materials provided with the distribution.

20 *

21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR

25 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

26 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

27 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

32 */

34 #include "libavutil/avassert.h"

35 #include "libavutil/file_open.h"

36 #include "libavutil/float_dsp.h"

37 #include "libavutil/mem.h"

38 #include "libavutil/mem_internal.h"

39 #include "libavutil/opt.h"

40 #include "libavutil/tx.h"

41 #include "avfilter.h"

42 #include "audio.h"

43 #include "filters.h"

44 #include "formats.h"

/* Frame / transform geometry. */
#define FRAME_SIZE_SHIFT 2
#define FRAME_SIZE       (120 << FRAME_SIZE_SHIFT)  /* 480 */
#define WINDOW_SIZE      (2 * FRAME_SIZE)           /* 960 */
#define FREQ_SIZE        (FRAME_SIZE + 1)           /* 481 */

/* Pitch analysis buffer geometry. */
#define PITCH_MIN_PERIOD 60
#define PITCH_MAX_PERIOD 768
#define PITCH_FRAME_SIZE 960
#define PITCH_BUF_SIZE   (PITCH_MAX_PERIOD + PITCH_FRAME_SIZE)  /* 1728 */

/* NOTE: evaluates its argument twice; only pass side-effect free expressions. */
#define SQUARE(x) ((x) * (x))

/* Feature vector layout. */
#define NB_BANDS      22
#define CEPS_MEM      8
#define NB_DELTA_CEPS 6
#define NB_FEATURES   (NB_BANDS + 3 * NB_DELTA_CEPS + 2)  /* 42 */

/* Network parameters. */
#define WEIGHTS_SCALE (1.f / 256)
#define MAX_NEURONS   128

/* Activation identifiers stored in DenseLayer/GRULayer::activation. */
#define ACTIVATION_TANH    0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_RELU    2

#define Q15ONE 1.0f

/**
 * One dense layer as filled in by rnnoise_model_from_file().
 *
 * The arrays are allocated by the loader and released by
 * rnnoise_model_free().
 */
struct DenseLayer {
    const float *bias;           /* nb_neurons values */
    const float *input_weights;  /* nb_inputs * nb_neurons values */
    int nb_inputs;
    int nb_neurons;
    int activation;              /* one of ACTIVATION_* */
};
typedef struct DenseLayer DenseLayer;

/**
 * One GRU layer as filled in by rnnoise_model_from_file().
 *
 * Both weight arrays are stored with the first dimension padded to a
 * multiple of 4 (see INPUT_ARRAY3 in the loader).
 */
struct GRULayer {
    const float *bias;               /* 3 * nb_neurons values */
    const float *input_weights;      /* padded nb_inputs x nb_neurons x 3 */
    const float *recurrent_weights;  /* padded nb_neurons x nb_neurons x 3 */
    int nb_inputs;
    int nb_neurons;
    int activation;                  /* one of ACTIVATION_* */
};
typedef struct GRULayer GRULayer;

92 typedef struct RNNModel {

93 int input_dense_size;

94 const DenseLayer *input_dense;

96 int vad_gru_size;

97 const GRULayer *vad_gru;

99 int noise_gru_size;

100 const GRULayer *noise_gru;

101

102 int denoise_gru_size;

103 const GRULayer *denoise_gru;

104

105 int denoise_output_size;

106 const DenseLayer *denoise_output;

107

108 int vad_output_size;

109 const DenseLayer *vad_output;

110 } RNNModel;

111

112 typedef struct RNNState {

113 float *vad_gru_state;

114 float *noise_gru_state;

115 float *denoise_gru_state;

116 RNNModel *model;

117 } RNNState;

118

119 typedef struct DenoiseState {

120 float analysis_mem[FRAME_SIZE];

121 float cepstral_mem[CEPS_MEM][NB_BANDS];

122 int memid;

123 DECLARE_ALIGNED(32, float, synthesis_mem)[FRAME_SIZE];

124 float pitch_buf[PITCH_BUF_SIZE];

125 float pitch_enh_buf[PITCH_BUF_SIZE];

126 float last_gain;

127 int last_period;

128 float mem_hp_x[2];

129 float lastg[NB_BANDS];

130 float history[FRAME_SIZE];

131 RNNState rnn[2];

132 AVTXContext *tx, *txi;

133 av_tx_fn tx_fn, txi_fn;

134 } DenoiseState;

135

136 typedef struct AudioRNNContext {

137 const AVClass *class;

138

139 char *model_name;

140 float mix;

141

142 int channels;

143 DenoiseState *st;

144

145 DECLARE_ALIGNED(32, float, window)[WINDOW_SIZE];

146 DECLARE_ALIGNED(32, float, dct_table)[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)];

147

148 RNNModel *model[2];

149

150 AVFloatDSPContext *fdsp;

151 } AudioRNNContext;

152

/* Activation codes as they appear in a model file (mapped to ACTIVATION_* by the loader). */
#define F_ACTIVATION_TANH    0
#define F_ACTIVATION_SIGMOID 1
#define F_ACTIVATION_RELU    2

156

157 static void rnnoise_model_free(RNNModel *model)

158 {

159 #define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)

160 #define FREE_DENSE(name) do { \

161 if (model->name) { \

162 av_free((void *) model->name->input_weights); \

163 av_free((void *) model->name->bias); \

164 av_free((void *) model->name); \

165 } \

166 } while (0)

167 #define FREE_GRU(name) do { \

168 if (model->name) { \

169 av_free((void *) model->name->input_weights); \

170 av_free((void *) model->name->recurrent_weights); \

171 av_free((void *) model->name->bias); \

172 av_free((void *) model->name); \

173 } \

174 } while (0)

175

176 if (!model)

177 return;

178 FREE_DENSE(input_dense);

179 FREE_GRU(vad_gru);

180 FREE_GRU(noise_gru);

181 FREE_GRU(denoise_gru);

182 FREE_DENSE(denoise_output);

183 FREE_DENSE(vad_output);

184 av_free(model);

185 }

186

187 static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)

188 {

189 RNNModel *ret = NULL;

190 DenseLayer *input_dense;

191 GRULayer *vad_gru;

192 GRULayer *noise_gru;

193 GRULayer *denoise_gru;

194 DenseLayer *denoise_output;

195 DenseLayer *vad_output;

196 int in;

197

198 if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)

199 return AVERROR_INVALIDDATA;

200

201 ret = av_calloc(1, sizeof(RNNModel));

202 if (!ret)

203 return AVERROR(ENOMEM);

204

205 #define ALLOC_LAYER(type, name) \

206 name = av_calloc(1, sizeof(type)); \

207 if (!name) { \

208 rnnoise_model_free(ret); \

209 return AVERROR(ENOMEM); \

210 } \

211 ret->name = name

212

213 ALLOC_LAYER(DenseLayer, input_dense);

214 ALLOC_LAYER(GRULayer, vad_gru);

215 ALLOC_LAYER(GRULayer, noise_gru);

216 ALLOC_LAYER(GRULayer, denoise_gru);

217 ALLOC_LAYER(DenseLayer, denoise_output);

218 ALLOC_LAYER(DenseLayer, vad_output);

219

220 #define INPUT_VAL(name) do { \

221 if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \

222 rnnoise_model_free(ret); \

223 return AVERROR(EINVAL); \

224 } \

225 name = in; \

226 } while (0)

227

228 #define INPUT_ACTIVATION(name) do { \

229 int activation; \

230 INPUT_VAL(activation); \

231 switch (activation) { \

232 case F_ACTIVATION_SIGMOID: \

233 name = ACTIVATION_SIGMOID; \

234 break; \

235 case F_ACTIVATION_RELU: \

236 name = ACTIVATION_RELU; \

237 break; \

238 default: \

239 name = ACTIVATION_TANH; \

240 } \

241 } while (0)

242

243 #define INPUT_ARRAY(name, len) do { \

244 float *values = av_calloc((len), sizeof(float)); \

245 if (!values) { \

246 rnnoise_model_free(ret); \

247 return AVERROR(ENOMEM); \

248 } \

249 name = values; \

250 for (int i = 0; i < (len); i++) { \

251 if (fscanf(f, "%d", &in) != 1) { \

252 rnnoise_model_free(ret); \

253 return AVERROR(EINVAL); \

254 } \

255 values[i] = in; \

256 } \

257 } while (0)

258

259 #define INPUT_ARRAY3(name, len0, len1, len2) do { \

260 float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \

261 if (!values) { \

262 rnnoise_model_free(ret); \

263 return AVERROR(ENOMEM); \

264 } \

265 name = values; \

266 for (int k = 0; k < (len0); k++) { \

267 for (int i = 0; i < (len2); i++) { \

268 for (int j = 0; j < (len1); j++) { \

269 if (fscanf(f, "%d", &in) != 1) { \

270 rnnoise_model_free(ret); \

271 return AVERROR(EINVAL); \

272 } \

273 values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \

274 } \

275 } \

276 } \

277 } while (0)

278

279 #define NEW_LINE() do { \

280 int c; \

281 while ((c = fgetc(f)) != EOF) { \

282 if (c == '\n') \

283 break; \

284 } \

285 } while (0)

286

287 #define INPUT_DENSE(name) do { \

288 INPUT_VAL(name->nb_inputs); \

289 INPUT_VAL(name->nb_neurons); \

290 ret->name ## _size = name->nb_neurons; \

291 INPUT_ACTIVATION(name->activation); \

292 NEW_LINE(); \

293 INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \

294 NEW_LINE(); \

295 INPUT_ARRAY(name->bias, name->nb_neurons); \

296 NEW_LINE(); \

297 } while (0)

298

299 #define INPUT_GRU(name) do { \

300 INPUT_VAL(name->nb_inputs); \

301 INPUT_VAL(name->nb_neurons); \

302 ret->name ## _size = name->nb_neurons; \

303 INPUT_ACTIVATION(name->activation); \

304 NEW_LINE(); \

305 INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \

306 NEW_LINE(); \

307 INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \

308 NEW_LINE(); \

309 INPUT_ARRAY(name->bias, name->nb_neurons * 3); \

310 NEW_LINE(); \

311 } while (0)

312

313 INPUT_DENSE(input_dense);

314 INPUT_GRU(vad_gru);

315 INPUT_GRU(noise_gru);

316 INPUT_GRU(denoise_gru);

317 INPUT_DENSE(denoise_output);

318 INPUT_DENSE(vad_output);

319

320 if (vad_output->nb_neurons != 1) {

321 rnnoise_model_free(ret);

322 return AVERROR(EINVAL);

323 }

324

325 *rnn = ret;

326

327 return 0;

328 }

329

330 static int query_formats(const AVFilterContext *ctx,

331 AVFilterFormatsConfig **cfg_in,

332 AVFilterFormatsConfig **cfg_out)

333 {

334 static const enum AVSampleFormat sample_fmts[] = {

335 AV_SAMPLE_FMT_FLTP,

336 AV_SAMPLE_FMT_NONE

337 };

338 int ret, sample_rates[] = { 48000, -1 };

339

340 ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, sample_fmts);

341 if (ret < 0)

342 return ret;

343

344 return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);

345 }

346

347 static int config_input(AVFilterLink *inlink)

348 {

349 AVFilterContext *ctx = inlink->dst;

350 AudioRNNContext *s = ctx->priv;

351 int ret = 0;

352

353 s->channels = inlink->ch_layout.nb_channels;

354

355 if (!s->st)

356 s->st = av_calloc(s->channels, sizeof(DenoiseState));

357 if (!s->st)

358 return AVERROR(ENOMEM);

359

360 for (int i = 0; i < s->channels; i++) {

361 DenoiseState *st = &s->st[i];

362

363 st->rnn[0].model = s->model[0];

364 st->rnn[0].vad_gru_state = av_calloc(FFALIGN(s->model[0]->vad_gru_size, 16), sizeof(float));

365 st->rnn[0].noise_gru_state = av_calloc(FFALIGN(s->model[0]->noise_gru_size, 16), sizeof(float));

366 st->rnn[0].denoise_gru_state = av_calloc(FFALIGN(s->model[0]->denoise_gru_size, 16), sizeof(float));

367 if (!st->rnn[0].vad_gru_state ||

368 !st->rnn[0].noise_gru_state ||

369 !st->rnn[0].denoise_gru_state)

370 return AVERROR(ENOMEM);

371 }

372

373 for (int i = 0; i < s->channels; i++) {

374 DenoiseState *st = &s->st[i];

375 float scale = 1.f;

376

377 if (!st->tx)

378 ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, &scale, 0);

379 if (ret < 0)

380 return ret;

381

382 if (!st->txi)

383 ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, &scale, 0);

384 if (ret < 0)

385 return ret;

386 }

387

388 return ret;

389 }

390

/**
 * Run N samples through a two-state recursive filter.
 *
 * Per sample: y = x + mem[0], then
 *   mem[0] = mem[1] + (b[0]*x - a[0]*y)
 *   mem[1] =          (b[1]*x - a[1]*y)
 *
 * @param y   output, N samples (each x[i] is read before y[i] is written)
 * @param mem filter state, updated in place
 * @param x   input, N samples
 * @param b   two feed-forward coefficients
 * @param a   two feedback coefficients
 * @param N   number of samples
 */
static void biquad(float *y, float mem[2], const float *x,
                   const float *b, const float *a, int N)
{
    for (int n = 0; n < N; n++) {
        const float in  = x[n];
        const float res = x[n] + mem[0];

        mem[0] = mem[1] + (b[0]*in - a[0]*res);
        mem[1] = (b[1]*in - a[1]*res);
        y[n]   = res;
    }
}

404

/*
 * Element-count wrappers around memmove/memset/memcpy. n is a number of
 * elements of *dst. The "0*((dst)-(src))" term adds nothing to the size;
 * it only makes the compiler reject dst/src pointers of incompatible types.
 */
#define RNN_MOVE(dst, src, n)  (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#define RNN_CLEAR(dst, n)      (memset((dst), 0, (n)*sizeof(*(dst))))
#define RNN_COPY(dst, src, n)  (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))

408

409 static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)

410 {

411 AVComplexFloat x[WINDOW_SIZE];

412 AVComplexFloat y[WINDOW_SIZE];

413

414 for (int i = 0; i < WINDOW_SIZE; i++) {

415 x[i].re = in[i];

416 x[i].im = 0;

417 }

418

419 st->tx_fn(st->tx, y, x, sizeof(AVComplexFloat));

420

421 RNN_COPY(out, y, FREQ_SIZE);

422 }

423

424 static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)

425 {

426 AVComplexFloat x[WINDOW_SIZE];

427 AVComplexFloat y[WINDOW_SIZE];

428

429 RNN_COPY(x, in, FREQ_SIZE);

430

431 for (int i = FREQ_SIZE; i < WINDOW_SIZE; i++) {

432 x[i].re = x[WINDOW_SIZE - i].re;

433 x[i].im = -x[WINDOW_SIZE - i].im;

434 }

435

436 st->txi_fn(st->txi, y, x, sizeof(AVComplexFloat));

437

438 for (int i = 0; i < WINDOW_SIZE; i++)

439 out[i] = y[i].re / WINDOW_SIZE;

440 }

441

/*
 * Band edges; users shift each entry left by FRAME_SIZE_SHIFT to get a bin
 * index. The labels below give the frequency of each edge.
 */
static const uint8_t eband5ms[] = {
/*  0  200  400  600  800   1k  1.2  1.4  1.6   2k  2.4 */
    0,   1,   2,   3,   4,   5,   6,   7,   8,  10,  12,
/* 2.8 3.2   4k  4.8  5.6  6.8   8k  9.6  12k 15.6  20k */
   14,  16,  20,  24,  28,  34,  40,  48,  60,  78, 100
};

446

447 static void compute_band_energy(float *bandE, const AVComplexFloat *X)

448 {

449 float sum[NB_BANDS] = {0};

450

451 for (int i = 0; i < NB_BANDS - 1; i++) {

452 int band_size;

453

454 band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;

455 for (int j = 0; j < band_size; j++) {

456 float tmp, frac = (float)j / band_size;

457

458 tmp = SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].re);

459 tmp += SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].im);

460 sum[i] += (1.f - frac) * tmp;

461 sum[i + 1] += frac * tmp;

462 }

463 }

464

465 sum[0] *= 2;

466 sum[NB_BANDS - 1] *= 2;

467

468 for (int i = 0; i < NB_BANDS; i++)

469 bandE[i] = sum[i];

470 }

471

472 static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)

473 {

474 float sum[NB_BANDS] = { 0 };

475

476 for (int i = 0; i < NB_BANDS - 1; i++) {

477 int band_size;

478

479 band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;

480 for (int j = 0; j < band_size; j++) {

481 float tmp, frac = (float)j / band_size;

482

483 tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re;

484 tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im;

485 sum[i] += (1 - frac) * tmp;

486 sum[i + 1] += frac * tmp;

487 }

488 }

489

490 sum[0] *= 2;

491 sum[NB_BANDS-1] *= 2;

492

493 for (int i = 0; i < NB_BANDS; i++)

494 bandE[i] = sum[i];

495 }

496

497 static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)

498 {

499 LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);

500

501 RNN_COPY(x, st->analysis_mem, FRAME_SIZE);

502 RNN_COPY(x + FRAME_SIZE, in, FRAME_SIZE);

503 RNN_COPY(st->analysis_mem, in, FRAME_SIZE);

504 s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);

505 forward_transform(st, X, x);

506 compute_band_energy(Ex, X);

507 }

508

509 static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)

510 {

511 LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);

512 const float *src = st->history;

513 const float mix = s->mix;

514 const float imix = 1.f - FFMAX(mix, 0.f);

515

516 inverse_transform(st, x, y);

517 s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);

518 s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);

519 RNN_COPY(out, x, FRAME_SIZE);

520 RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);

521

522 for (int n = 0; n < FRAME_SIZE; n++)

523 out[n] = out[n] * mix + src[n] * imix;

524 }

525

/**
 * Accumulate four neighbouring cross-correlation lags at once:
 *   sum[k] += x[0]*y[k] + x[1]*y[1+k] + ... + x[len-1]*y[len-1+k],  k = 0..3
 *
 * A three-sample window of y is carried between iterations so each y
 * sample is loaded only once. y[0..2] are always read, and y must be
 * readable up to index len + 2 when len > 0.
 *
 * @param x   len samples
 * @param y   at least max(3, len + 3) samples
 * @param sum four accumulators, updated in place (not cleared here)
 * @param len number of products per lag
 */
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
{
    float w0 = y[0];
    float w1 = y[1];
    float w2 = y[2];

    for (int j = 0; j < len; j++) {
        const float xj = x[j];
        const float w3 = y[j + 3];

        sum[0] += xj * w0;
        sum[1] += xj * w1;
        sum[2] += xj * w2;
        sum[3] += xj * w3;

        /* Slide the window by one sample. */
        w0 = w1;
        w1 = w2;
        w2 = w3;
    }
}

594

/**
 * Dot product of two float vectors, accumulated in index order.
 *
 * @param x first vector, N samples
 * @param y second vector, N samples
 * @param N number of samples
 * @return sum of x[i]*y[i]; 0 when N <= 0
 */
static inline float celt_inner_prod(const float *x,
                                    const float *y, int N)
{
    float acc = 0.f;
    int n = 0;

    while (n < N) {
        acc += x[n] * y[n];
        n++;
    }

    return acc;
}

605

/**
 * Cross-correlate x against y for lags 0 .. max_pitch-1:
 *   xcorr[lag] = sum over i < len of x[i] * y[i + lag]
 *
 * Lags are processed four at a time with xcorr_kernel(); left-over lags
 * (max_pitch not a multiple of 4) use celt_inner_prod().
 *
 * @param x         len samples
 * @param y         samples to correlate against (read at shifted offsets)
 * @param xcorr     receives max_pitch values
 * @param len       number of products per lag
 * @param max_pitch number of lags
 */
static void celt_pitch_xcorr(const float *x, const float *y,
                             float *xcorr, int len, int max_pitch)
{
    int lag = 0;

    while (lag < max_pitch - 3) {
        float quad[4] = { 0, 0, 0, 0 };

        xcorr_kernel(x, y + lag, quad, len);

        for (int k = 0; k < 4; k++)
            xcorr[lag + k] = quad[k];
        lag += 4;
    }

    while (lag < max_pitch) {
        xcorr[lag] = celt_inner_prod(x, y + lag, len);
        lag++;
    }
}

626

627 static int celt_autocorr(const float *x, /* in: [0...n-1] samples x */

628 float *ac, /* out: [0...lag-1] ac values */

629 const float *window,

630 int overlap,

631 int lag,

632 int n)

633 {

634 int fastN = n - lag;

635 int shift;

636 const float *xptr;

637 float xx[PITCH_BUF_SIZE>>1];

638

639 if (overlap == 0) {

640 xptr = x;

641 } else {

642 for (int i = 0; i < n; i++)

643 xx[i] = x[i];

644 for (int i = 0; i < overlap; i++) {

645 xx[i] = x[i] * window[i];

646 xx[n-i-1] = x[n-i-1] * window[i];

647 }

648 xptr = xx;

649 }

650

651 shift = 0;

652 celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);

653

654 for (int k = 0; k <= lag; k++) {

655 float d = 0.f;

656

657 for (int i = k + fastN; i < n; i++)

658 d += xptr[i] * xptr[i-k];

659 ac[k] += d;

660 }

661

662 return shift;

663 }

664

/**
 * Derive p LPC coefficients from autocorrelation values with the
 * Levinson-Durbin recursion.
 *
 * The output is all zeros when ac[0] == 0. The recursion stops early once
 * the prediction error drops below 0.001 * ac[0]; coefficients not reached
 * by then stay zero.
 *
 * @param lpc receives p coefficients
 * @param ac  p + 1 autocorrelation values
 * @param p   prediction order
 */
static void celt_lpc(float *lpc,      /* out: [0...p-1] LPC coefficients      */
                     const float *ac, /* in:  [0...p] autocorrelation values  */
                     int p)
{
    float err = ac[0];

    memset(lpc, 0, (p)*sizeof(*(lpc)));
    if (ac[0] == 0)
        return;

    for (int i = 0; i < p; i++) {
        float acc = 0;
        float refl;

        /* Reflection coefficient of this iteration. */
        for (int j = 0; j < i; j++)
            acc += (lpc[j] * ac[i - j]);
        acc += ac[i + 1];
        refl = -acc/err;

        /* Update the coefficients symmetrically from both ends. */
        lpc[i] = refl;
        for (int j = 0; j < (i + 1) >> 1; j++) {
            const float lo = lpc[j];
            const float hi = lpc[i-1-j];

            lpc[j]     = lo + (refl*hi);
            lpc[i-1-j] = hi + (refl*lo);
        }

        err = err - (refl * refl *err);
        /* Bail out once we get 30 dB gain */
        if (err < .001f * ac[0])
            break;
    }
}

697

/**
 * Five-tap FIR filter on past inputs:
 *   y[i] = x[i] + num[0]*x[i-1] + num[1]*x[i-2] + ... + num[4]*x[i-5]
 *
 * x and y may point to the same buffer: x[i] is read before y[i] is written.
 *
 * @param x   N input samples
 * @param num five tap coefficients
 * @param y   receives N output samples
 * @param N   number of samples
 * @param mem the five inputs preceding x[0], most recent first; updated to
 *            the last five inputs on return
 */
static void celt_fir5(const float *x,
                      const float *num,
                      float *y,
                      int N,
                      float *mem)
{
    float tap[5];
    float past[5];

    for (int k = 0; k < 5; k++) {
        tap[k]  = num[k];
        past[k] = mem[k];
    }

    for (int i = 0; i < N; i++) {
        const float cur = x[i];
        float sum = cur;

        for (int k = 0; k < 5; k++)
            sum += (tap[k]*past[k]);

        /* Age the history by one sample. */
        for (int k = 4; k > 0; k--)
            past[k] = past[k - 1];
        past[0] = cur;

        y[i] = sum;
    }

    for (int k = 0; k < 5; k++)
        mem[k] = past[k];
}

740

741 static void pitch_downsample(float *x[], float *x_lp,

742 int len, int C)

743 {

744 float ac[5];

745 float tmp=Q15ONE;

746 float lpc[4], mem[5]={0,0,0,0,0};

747 float lpc2[5];

748 float c1 = .8f;

749

750 for (int i = 1; i < len >> 1; i++)

751 x_lp[i] = .5f * (.5f * (x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]);

752 x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);

753 if (C==2) {

754 for (int i = 1; i < len >> 1; i++)

755 x_lp[i] += (.5f * (.5f * (x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]));

756 x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);

757 }

758

759 celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1);

760

761 /* Noise floor -40 dB */

762 ac[0] *= 1.0001f;

763 /* Lag windowing */

764 for (int i = 1; i <= 4; i++) {

765 /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/

766 ac[i] -= ac[i]*(.008f*i)*(.008f*i);

767 }

768

769 celt_lpc(lpc, ac, 4);

770 for (int i = 0; i < 4; i++) {

771 tmp = .9f * tmp;

772 lpc[i] = (lpc[i] * tmp);

773 }

774 /* Add a zero */

775 lpc2[0] = lpc[0] + .8f;

776 lpc2[1] = lpc[1] + (c1 * lpc[0]);

777 lpc2[2] = lpc[2] + (c1 * lpc[1]);

778 lpc2[3] = lpc[3] + (c1 * lpc[2]);

779 lpc2[4] = (c1 * lpc[3]);

780 celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);

781 }

782

/**
 * Compute two dot products that share the same left operand.
 *
 * @param x   shared vector, N samples
 * @param y01 first right-hand vector, N samples
 * @param y02 second right-hand vector, N samples
 * @param N   number of samples
 * @param xy1 receives sum of x[i]*y01[i]
 * @param xy2 receives sum of x[i]*y02[i]
 */
static inline void dual_inner_prod(const float *x, const float *y01, const float *y02,
                                   int N, float *xy1, float *xy2)
{
    float acc1 = 0;
    float acc2 = 0;
    int n = 0;

    while (n < N) {
        acc1 += (x[n] * y01[n]);
        acc2 += (x[n] * y02[n]);
        n++;
    }

    *xy1 = acc1;
    *xy2 = acc2;
}

796

797 static float compute_pitch_gain(float xy, float xx, float yy)

798 {

799 return xy / sqrtf(1.f + xx * yy);

800 }

801

/* Multiplier used by remove_doubling() to pick the second lag checked for sub-multiple k (index = k). */
static const uint8_t second_check[16] = {
    0, 0, 3, 2, 3, 2, 5, 2,
    3, 2, 3, 2, 5, 2, 3, 2
};

803 static float remove_doubling(float *x, int maxperiod, int minperiod, int N,

804 int *T0_, int prev_period, float prev_gain)

805 {

806 int k, i, T, T0;

807 float g, g0;

808 float pg;

809 float xy,xx,yy,xy2;

810 float xcorr[3];

811 float best_xy, best_yy;

812 int offset;

813 int minperiod0;

814 float yy_lookup[PITCH_MAX_PERIOD+1];

815

816 minperiod0 = minperiod;

817 maxperiod /= 2;

818 minperiod /= 2;

819 *T0_ /= 2;

820 prev_period /= 2;

821 N /= 2;

822 x += maxperiod;

823 if (*T0_>=maxperiod)

824 *T0_=maxperiod-1;

825

826 T = T0 = *T0_;

827 dual_inner_prod(x, x, x-T0, N, &xx, &xy);

828 yy_lookup[0] = xx;

829 yy=xx;

830 for (i = 1; i <= maxperiod; i++) {

831 yy = yy+(x[-i] * x[-i])-(x[N-i] * x[N-i]);

832 yy_lookup[i] = FFMAX(0, yy);

833 }

834 yy = yy_lookup[T0];

835 best_xy = xy;

836 best_yy = yy;

837 g = g0 = compute_pitch_gain(xy, xx, yy);

838 /* Look for any pitch at T/k */

839 for (k = 2; k <= 15; k++) {

840 int T1, T1b;

841 float g1;

842 float cont=0;

843 float thresh;

844 T1 = (2*T0+k)/(2*k);

845 if (T1 < minperiod)

846 break;

847 /* Look for another strong correlation at T1b */

848 if (k==2)

849 {

850 if (T1+T0>maxperiod)

851 T1b = T0;

852 else

853 T1b = T0+T1;

854 } else

855 {

856 T1b = (2*second_check[k]*T0+k)/(2*k);

857 }

858 dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);

859 xy = .5f * (xy + xy2);

860 yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);

861 g1 = compute_pitch_gain(xy, xx, yy);

862 if (FFABS(T1-prev_period)<=1)

863 cont = prev_gain;

864 else if (FFABS(T1-prev_period)<=2 && 5 * k * k < T0)

865 cont = prev_gain * .5f;

866 else

867 cont = 0;

868 thresh = FFMAX(.3f, (.7f * g0) - cont);

869 /* Bias against very high pitch (very short period) to avoid false-positives

870 due to short-term correlation */

871 if (T1<3*minperiod)

872 thresh = FFMAX(.4f, (.85f * g0) - cont);

873 else if (T1<2*minperiod)

874 thresh = FFMAX(.5f, (.9f * g0) - cont);

875 if (g1 > thresh)

876 {

877 best_xy = xy;

878 best_yy = yy;

879 T = T1;

880 g = g1;

881 }

882 }

883 best_xy = FFMAX(0, best_xy);

884 if (best_yy <= best_xy)

885 pg = Q15ONE;

886 else

887 pg = best_xy/(best_yy + 1);

888

889 for (k = 0; k < 3; k++)

890 xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);

891 if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))

892 offset = 1;

893 else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))

894 offset = -1;

895 else

896 offset = 0;

897 if (pg > g)

898 pg = g;

899 *T0_ = 2*T+offset;

900

901 if (*T0_<minperiod0)

902 *T0_=minperiod0;

903 return pg;

904 }

905

/**
 * Find the two lags with the largest xcorr^2 / Syy, where Syy is the
 * running energy of the len samples of y starting at the lag (kept >= 1).
 * Only lags with positive correlation are considered.
 *
 * @param xcorr      max_pitch correlation values
 * @param y          at least len + max_pitch samples
 * @param len        length of the energy window
 * @param max_pitch  number of lags
 * @param best_pitch receives the best lag in [0] and the runner-up in [1];
 *                   they are 0 and 1 when no lag qualifies
 */
static void find_best_pitch(float *xcorr, float *y, int len,
                            int max_pitch, int *best_pitch)
{
    float best_num[2] = { -1, -1 };
    float best_den[2] = { 0, 0 };
    float Syy = 1.f;

    best_pitch[0] = 0;
    best_pitch[1] = 1;

    for (int j = 0; j < len; j++)
        Syy += y[j] * y[j];

    for (int lag = 0; lag < max_pitch; lag++) {
        if (xcorr[lag] > 0) {
            float scaled = xcorr[lag];
            float num;

            /* Considering the range of the correlation, this scaling should
               avoid both underflows and overflows (inf) when squaring it */
            scaled *= 1e-12f;
            num = scaled * scaled;

            /* Ratios are compared by cross-multiplication, without dividing. */
            if ((num * best_den[1]) > (best_num[1] * Syy)) {
                int slot = 1;

                if ((num * best_den[0]) > (best_num[0] * Syy)) {
                    /* New best: the old best becomes the runner-up. */
                    best_num[1]   = best_num[0];
                    best_den[1]   = best_den[0];
                    best_pitch[1] = best_pitch[0];
                    slot = 0;
                }
                best_num[slot]   = num;
                best_den[slot]   = Syy;
                best_pitch[slot] = lag;
            }
        }

        /* Slide the energy window by one sample. */
        Syy += y[lag+len]*y[lag+len] - y[lag] * y[lag];
        if (Syy < 1)
            Syy = 1;
    }
}

952

953 static void pitch_search(const float *x_lp, float *y,

954 int len, int max_pitch, int *pitch)

955 {

956 int lag;

957 int best_pitch[2]={0,0};

958 int offset;

959

960 float x_lp4[WINDOW_SIZE];

961 float y_lp4[WINDOW_SIZE];

962 float xcorr[WINDOW_SIZE];

963

964 lag = len+max_pitch;

965

966 /* Downsample by 2 again */

967 for (int j = 0; j < len >> 2; j++)

968 x_lp4[j] = x_lp[2*j];

969 for (int j = 0; j < lag >> 2; j++)

970 y_lp4[j] = y[2*j];

971

972 /* Coarse search with 4x decimation */

973

974 celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);

975

976 find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch);

977

978 /* Finer search with 2x decimation */

979 for (int i = 0; i < max_pitch >> 1; i++) {

980 float sum;

981 xcorr[i] = 0;

982 if (FFABS(i-2*best_pitch[0])>2 && FFABS(i-2*best_pitch[1])>2)

983 continue;

984 sum = celt_inner_prod(x_lp, y+i, len>>1);

985 xcorr[i] = FFMAX(-1, sum);

986 }

987

988 find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch);

989

990 /* Refine by pseudo-interpolation */

991 if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {

992 float a, b, c;

993

994 a = xcorr[best_pitch[0] - 1];

995 b = xcorr[best_pitch[0]];

996 c = xcorr[best_pitch[0] + 1];

997 if (c - a > .7f * (b - a))

998 offset = 1;

999 else if (a - c > .7f * (b-c))

1000 offset = -1;

1001 else

1002 offset = 0;

1003 } else {

1004 offset = 0;

1005 }

1006

1007 *pitch = 2 * best_pitch[0] - offset;

1008 }

1009

1010 static void dct(AudioRNNContext *s, float *out, const float *in)

1011 {

1012 for (int i = 0; i < NB_BANDS; i++) {

1013 float sum;

1014

1015 sum = s->fdsp->scalarproduct_float(in, s->dct_table[i], FFALIGN(NB_BANDS, 4));

1016 out[i] = sum * sqrtf(2.f / 22);

1017 }

1018 }

1019

1020 static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P,

1021 float *Ex, float *Ep, float *Exp, float *features, const float *in)

1022 {

1023 float E = 0;

1024 float *ceps_0, *ceps_1, *ceps_2;

1025 float spec_variability = 0;

1026 LOCAL_ALIGNED_32(float, Ly, [NB_BANDS]);

1027 LOCAL_ALIGNED_32(float, p, [WINDOW_SIZE]);

1028 float pitch_buf[PITCH_BUF_SIZE>>1];

1029 int pitch_index;

1030 float gain;

1031 float *(pre[1]);

1032 float tmp[NB_BANDS];

1033 float follow, logMax;

1034

1035 frame_analysis(s, st, X, Ex, in);

1036 RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);

1037 RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);

1038 pre[0] = &st->pitch_buf[0];

1039 pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);

1040 pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,

1041 PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);

1042 pitch_index = PITCH_MAX_PERIOD-pitch_index;

1043

1044 gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,

1045 PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);

1046 st->last_period = pitch_index;

1047 st->last_gain = gain;

1048

1049 for (int i = 0; i < WINDOW_SIZE; i++)

1050 p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];

1051

1052 s->fdsp->vector_fmul(p, p, s->window, WINDOW_SIZE);

1053 forward_transform(st, P, p);

1054 compute_band_energy(Ep, P);

1055 compute_band_corr(Exp, X, P);

1056

1057 for (int i = 0; i < NB_BANDS; i++)

1058 Exp[i] = Exp[i] / sqrtf(.001f+Ex[i]*Ep[i]);

1059

1060 dct(s, tmp, Exp);

1061

1062 for (int i = 0; i < NB_DELTA_CEPS; i++)

1063 features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];

1064

1065 features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;

1066 features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;

1067 features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);

1068 logMax = -2;

1069 follow = -2;

1070

1071 for (int i = 0; i < NB_BANDS; i++) {

1072 Ly[i] = log10f(1e-2f + Ex[i]);

1073 Ly[i] = FFMAX(logMax-7, FFMAX(follow-1.5, Ly[i]));

1074 logMax = FFMAX(logMax, Ly[i]);

1075 follow = FFMAX(follow-1.5, Ly[i]);

1076 E += Ex[i];

1077 }

1078

1079 if (E < 0.04f) {

1080 /* If there's no audio, avoid messing up the state. */

1081 RNN_CLEAR(features, NB_FEATURES);

1082 return 1;

1083 }

1084

1085 dct(s, features, Ly);

1086 features[0] -= 12;

1087 features[1] -= 4;

1088 ceps_0 = st->cepstral_mem[st->memid];

1089 ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];

1090 ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];

1091

1092 for (int i = 0; i < NB_BANDS; i++)

1093 ceps_0[i] = features[i];

1094

1095 st->memid++;

1096 for (int i = 0; i < NB_DELTA_CEPS; i++) {

1097 features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];

1098 features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];

1099 features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i];

1100 }

1101 /* Spectral variability features. */

1102 if (st->memid == CEPS_MEM)

1103 st->memid = 0;

1104

1105 for (int i = 0; i < CEPS_MEM; i++) {

1106 float mindist = 1e15f;

1107 for (int j = 0; j < CEPS_MEM; j++) {

1108 float dist = 0.f;

1109 for (int k = 0; k < NB_BANDS; k++) {

1110 float tmp;

1111

1112 tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];

1113 dist += tmp*tmp;

1114 }

1115

1116 if (j != i)

1117 mindist = FFMIN(mindist, dist);

1118 }

1119

1120 spec_variability += mindist;

1121 }

1122

1123 features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;

1124

1125 return 0;

1126 }

1127

1128 static void interp_band_gain(float *g, const float *bandE)

1129 {

1130 memset(g, 0, sizeof(*g) * FREQ_SIZE);

1131

1132 for (int i = 0; i < NB_BANDS - 1; i++) {

1133 const int band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;

1134

1135 for (int j = 0; j < band_size; j++) {

1136 float frac = (float)j / band_size;

1137

1138 g[(eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1.f - frac) * bandE[i] + frac * bandE[i + 1];

1139 }

1140 }

1141 }

1142

1143 static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep,

1144 const float *Exp, const float *g)

1145 {

1146 float newE[NB_BANDS];

1147 float r[NB_BANDS];

1148 float norm[NB_BANDS];

1149 float rf[FREQ_SIZE] = {0};

1150 float normf[FREQ_SIZE]={0};

1151

1152 for (int i = 0; i < NB_BANDS; i++) {

1153 if (Exp[i]>g[i]) r[i] = 1;

1154 else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));

1155 r[i] = sqrtf(av_clipf(r[i], 0, 1));

1156 r[i] *= sqrtf(Ex[i]/(1e-8+Ep[i]));

1157 }

1158 interp_band_gain(rf, r);

1159 for (int i = 0; i < FREQ_SIZE; i++) {

1160 X[i].re += rf[i]*P[i].re;

1161 X[i].im += rf[i]*P[i].im;

1162 }

1163 compute_band_energy(newE, X);

1164 for (int i = 0; i < NB_BANDS; i++) {

1165 norm[i] = sqrtf(Ex[i] / (1e-8+newE[i]));

1166 }

1167 interp_band_gain(normf, norm);

1168 for (int i = 0; i < FREQ_SIZE; i++) {

1169 X[i].re *= normf[i];

1170 X[i].im *= normf[i];

1171 }

1172 }

1173

/*
 * Lookup table used by tansig_approx(): entry k is the function value at
 * x = 0.04 * k, for k = 0..200 (x from 0 to 8). The entries agree with
 * tanh(x) to six decimals (for example entry 25 is 0.761594 = tanh(1)).
 */
static const float tansig_table[201] = {
    0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
    0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
    0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
    0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
    0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
    0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
    0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
    0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
    0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
    0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
    0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
    0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
    0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
    0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
    0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
    0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
    0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
    0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
    0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
    0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
    0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
    0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
    0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
    0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
    0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
    0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
    0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
    0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
    0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
    0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
    0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
    0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
    0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
    0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
    0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
    0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f,
};

1217

1218 static inline float tansig_approx(float x)

1219 {

1220 float y, dy;

1221 float sign=1;

1222 int i;

1223

1224 /* Tests are reversed to catch NaNs */

1225 if (!(x<8))

1226 return 1;

1227 if (!(x>-8))

1228 return -1;

1229 /* Another check in case of -ffast-math */

1230

1231 if (isnan(x))

1232 return 0;

1233

1234 if (x < 0) {

1235 x=-x;

1236 sign=-1;

1237 }

1238 i = (int)floor(.5f+25*x);

1239 x -= .04f*i;

1240 y = tansig_table[i];

1241 dy = 1-y*y;

1242 y = y + x*dy*(1 - y*x);

1243 return sign*y;

1244 }

1245

/**
 * Sigmoid approximation built on tansig_approx():
 * 0.5 + 0.5 * tansig_approx(x / 2).
 *
 * @param x input value
 * @return approximation in [0, 1]
 */
static inline float sigmoid_approx(float x)
{
    const float t = tansig_approx(.5f*x);

    return .5f + .5f*t;
}

1250

1251 static void compute_dense(const DenseLayer *layer, float *output, const float *input)

1252 {

1253 const int N = layer->nb_neurons, M = layer->nb_inputs, stride = N;

1254

1255 for (int i = 0; i < N; i++) {

1256 /* Compute update gate. */

1257 float sum = layer->bias[i];

1258

1259 for (int j = 0; j < M; j++)

1260 sum += layer->input_weights[j * stride + i] * input[j];

1261

1262 output[i] = WEIGHTS_SCALE * sum;

1263 }

1264

1265 if (layer->activation == ACTIVATION_SIGMOID) {

1266 for (int i = 0; i < N; i++)

1267 output[i] = sigmoid_approx(output[i]);

1268 } else if (layer->activation == ACTIVATION_TANH) {

1269 for (int i = 0; i < N; i++)

1270 output[i] = tansig_approx(output[i]);

1271 } else if (layer->activation == ACTIVATION_RELU) {

1272 for (int i = 0; i < N; i++)

1273 output[i] = FFMAX(0, output[i]);

1274 } else {

1275 av_assert0(0);

1276 }

1277 }

1278

1279 static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)

1280 {

1281 LOCAL_ALIGNED_32(float, z, [MAX_NEURONS]);

1282 LOCAL_ALIGNED_32(float, r, [MAX_NEURONS]);

1283 LOCAL_ALIGNED_32(float, h, [MAX_NEURONS]);

1284 const int M = gru->nb_inputs;

1285 const int N = gru->nb_neurons;

1286 const int AN = FFALIGN(N, 4);

1287 const int AM = FFALIGN(M, 4);

1288 const int stride = 3 * AN, istride = 3 * AM;

1289

1290 for (int i = 0; i < N; i++) {

1291 /* Compute update gate. */

1292 float sum = gru->bias[i];

1293

1294 sum += s->fdsp->scalarproduct_float(gru->input_weights + i * istride, input, AM);

1295 sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + i * stride, state, AN);

1296 z[i] = sigmoid_approx(WEIGHTS_SCALE * sum);

1297 }

1298

1299 for (int i = 0; i < N; i++) {

1300 /* Compute reset gate. */

1301 float sum = gru->bias[N + i];

1302

1303 sum += s->fdsp->scalarproduct_float(gru->input_weights + AM + i * istride, input, AM);

1304 sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + AN + i * stride, state, AN);

1305 r[i] = sigmoid_approx(WEIGHTS_SCALE * sum);

1306 }

1307

1308 for (int i = 0; i < N; i++) {

1309 /* Compute output. */

1310 float sum = gru->bias[2 * N + i];

1311

1312 sum += s->fdsp->scalarproduct_float(gru->input_weights + 2 * AM + i * istride, input, AM);

1313 for (int j = 0; j < N; j++)

1314 sum += gru->recurrent_weights[2 * AN + i * stride + j] * state[j] * r[j];

1315

1316 if (gru->activation == ACTIVATION_SIGMOID)

1317 sum = sigmoid_approx(WEIGHTS_SCALE * sum);

1318 else if (gru->activation == ACTIVATION_TANH)

1319 sum = tansig_approx(WEIGHTS_SCALE * sum);

1320 else if (gru->activation == ACTIVATION_RELU)

1321 sum = FFMAX(0, WEIGHTS_SCALE * sum);

1322 else

1323 av_assert0(0);

1324 h[i] = z[i] * state[i] + (1.f - z[i]) * sum;

1325 }

1326

1327 RNN_COPY(state, h, N);

1328 }

1329

1330 #define INPUT_SIZE 42

1331

1332 static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)

1333 {

1334 LOCAL_ALIGNED_32(float, dense_out, [MAX_NEURONS]);

1335 LOCAL_ALIGNED_32(float, noise_input, [MAX_NEURONS * 3]);

1336 LOCAL_ALIGNED_32(float, denoise_input, [MAX_NEURONS * 3]);

1337

1338 compute_dense(rnn->model->input_dense, dense_out, input);

1339 compute_gru(s, rnn->model->vad_gru, rnn->vad_gru_state, dense_out);

1340 compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);

1341

1342 memcpy(noise_input, dense_out, rnn->model->input_dense_size * sizeof(float));

1343 memcpy(noise_input + rnn->model->input_dense_size,

1344 rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));

1345 memcpy(noise_input + rnn->model->input_dense_size + rnn->model->vad_gru_size,

1346 input, INPUT_SIZE * sizeof(float));

1347

1348 compute_gru(s, rnn->model->noise_gru, rnn->noise_gru_state, noise_input);

1349

1350 memcpy(denoise_input, rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));

1351 memcpy(denoise_input + rnn->model->vad_gru_size,

1352 rnn->noise_gru_state, rnn->model->noise_gru_size * sizeof(float));

1353 memcpy(denoise_input + rnn->model->vad_gru_size + rnn->model->noise_gru_size,

1354 input, INPUT_SIZE * sizeof(float));

1355

1356 compute_gru(s, rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);

1357 compute_dense(rnn->model->denoise_output, gains, rnn->denoise_gru_state);

1358 }

1359

1360 static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,

1361 int disabled)

1362 {

1363 AVComplexFloat X[FREQ_SIZE];

1364 AVComplexFloat P[WINDOW_SIZE];

1365 float x[FRAME_SIZE];

1366 float Ex[NB_BANDS], Ep[NB_BANDS];

1367 LOCAL_ALIGNED_32(float, Exp, [NB_BANDS]);

1368 float features[NB_FEATURES];

1369 float g[NB_BANDS];

1370 float gf[FREQ_SIZE];

1371 float vad_prob = 0;

1372 float *history = st->history;

1373 static const float a_hp[2] = {-1.99599, 0.99600};

1374 static const float b_hp[2] = {-2, 1};

1375 int silence;

1376

1377 biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);

1378 silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);

1379

1380 if (!silence && !disabled) {

1381 compute_rnn(s, &st->rnn[0], g, &vad_prob, features);

1382 pitch_filter(X, P, Ex, Ep, Exp, g);

1383 for (int i = 0; i < NB_BANDS; i++) {

1384 float alpha = .6f;

1385

1386 g[i] = FFMAX(g[i], alpha * st->lastg[i]);

1387 st->lastg[i] = g[i];

1388 }

1389

1390 interp_band_gain(gf, g);

1391

1392 for (int i = 0; i < FREQ_SIZE; i++) {

1393 X[i].re *= gf[i];

1394 X[i].im *= gf[i];

1395 }

1396 }

1397

1398 frame_synthesis(s, st, out, X);

1399 memcpy(history, in, FRAME_SIZE * sizeof(*history));

1400

1401 return vad_prob;

1402 }

1403

/* Job payload passed to rnnoise_channels() through ff_filter_execute(). */
typedef struct ThreadData {
    AVFrame *in, *out; /* frame the jobs read from / frame they write to */
} ThreadData;

1407

1408 static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)

1409 {

1410 AudioRNNContext *s = ctx->priv;

1411 ThreadData *td = arg;

1412 AVFrame *in = td->in;

1413 AVFrame *out = td->out;

1414 const int start = (out->ch_layout.nb_channels * jobnr) / nb_jobs;

1415 const int end = (out->ch_layout.nb_channels * (jobnr+1)) / nb_jobs;

1416

1417 for (int ch = start; ch < end; ch++) {

1418 rnnoise_channel(s, &s->st[ch],

1419 (float *)out->extended_data[ch],

1420 (const float *)in->extended_data[ch],

1421 ctx->is_disabled);

1422 }

1423

1424 return 0;

1425 }

1426

1427 static int filter_frame(AVFilterLink *inlink, AVFrame *in)

1428 {

1429 AVFilterContext *ctx = inlink->dst;

1430 AVFilterLink *outlink = ctx->outputs[0];

1431 AVFrame *out = NULL;

1432 ThreadData td;

1433

1434 out = ff_get_audio_buffer(outlink, FRAME_SIZE);

1435 if (!out) {

1436 av_frame_free(&in);

1437 return AVERROR(ENOMEM);

1438 }

1439 av_frame_copy_props(out, in);

1440

1441 td.in = in; td.out = out;

1442 ff_filter_execute(ctx, rnnoise_channels, &td, NULL,

1443 FFMIN(outlink->ch_layout.nb_channels, ff_filter_get_nb_threads(ctx)));

1444

1445 av_frame_free(&in);

1446 return ff_filter_frame(outlink, out);

1447 }

1448

1449 static int activate(AVFilterContext *ctx)

1450 {

1451 AVFilterLink *inlink = ctx->inputs[0];

1452 AVFilterLink *outlink = ctx->outputs[0];

1453 AVFrame *in = NULL;

1454 int ret;

1455

1456 FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

1457

1458 ret = ff_inlink_consume_samples(inlink, FRAME_SIZE, FRAME_SIZE, &in);

1459 if (ret < 0)

1460 return ret;

1461

1462 if (ret > 0)

1463 return filter_frame(inlink, in);

1464

1465 FF_FILTER_FORWARD_STATUS(inlink, outlink);

1466 FF_FILTER_FORWARD_WANTED(outlink, inlink);

1467

1468 return FFERROR_NOT_READY;

1469 }

1470

1471 static int open_model(AVFilterContext *ctx, RNNModel **model)

1472 {

1473 AudioRNNContext *s = ctx->priv;

1474 int ret;

1475 FILE *f;

1476

1477 if (!s->model_name)

1478 return AVERROR(EINVAL);

1479 f = avpriv_fopen_utf8(s->model_name, "r");

1480 if (!f) {

1481 av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);

1482 return AVERROR(EINVAL);

1483 }

1484

1485 ret = rnnoise_model_from_file(f, model);

1486 fclose(f);

1487 if (!*model || ret < 0)

1488 return ret;

1489

1490 return 0;

1491 }

1492

1493 static av_cold int init(AVFilterContext *ctx)

1494 {

1495 AudioRNNContext *s = ctx->priv;

1496 int ret;

1497

1498 s->fdsp = avpriv_float_dsp_alloc(0);

1499 if (!s->fdsp)

1500 return AVERROR(ENOMEM);

1501

1502 ret = open_model(ctx, &s->model[0]);

1503 if (ret < 0)

1504 return ret;

1505

1506 for (int i = 0; i < FRAME_SIZE; i++) {

1507 s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));

1508 s->window[WINDOW_SIZE - 1 - i] = s->window[i];

1509 }

1510

1511 for (int i = 0; i < NB_BANDS; i++) {

1512 for (int j = 0; j < NB_BANDS; j++) {

1513 s->dct_table[j][i] = cosf((i + .5f) * j * M_PI / NB_BANDS);

1514 if (j == 0)

1515 s->dct_table[j][i] *= sqrtf(.5);

1516 }

1517 }

1518

1519 return 0;

1520 }

1521

1522 static void free_model(AVFilterContext *ctx, int n)

1523 {

1524 AudioRNNContext *s = ctx->priv;

1525

1526 rnnoise_model_free(s->model[n]);

1527 s->model[n] = NULL;

1528

1529 for (int ch = 0; ch < s->channels && s->st; ch++) {

1530 av_freep(&s->st[ch].rnn[n].vad_gru_state);

1531 av_freep(&s->st[ch].rnn[n].noise_gru_state);

1532 av_freep(&s->st[ch].rnn[n].denoise_gru_state);

1533 }

1534 }

1535

1536 static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,

1537 char *res, int res_len, int flags)

1538 {

1539 AudioRNNContext *s = ctx->priv;

1540 int ret;

1541

1542 ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);

1543 if (ret < 0)

1544 return ret;

1545

1546 ret = open_model(ctx, &s->model[1]);

1547 if (ret < 0)

1548 return ret;

1549

1550 FFSWAP(RNNModel *, s->model[0], s->model[1]);

1551 for (int ch = 0; ch < s->channels; ch++)

1552 FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);

1553

1554 ret = config_input(ctx->inputs[0]);

1555 if (ret < 0) {

1556 for (int ch = 0; ch < s->channels; ch++)

1557 FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);

1558 FFSWAP(RNNModel *, s->model[0], s->model[1]);

1559 return ret;

1560 }

1561

1562 free_model(ctx, 1);

1563 return 0;

1564 }

1565

1566 static av_cold void uninit(AVFilterContext *ctx)

1567 {

1568 AudioRNNContext *s = ctx->priv;

1569

1570 av_freep(&s->fdsp);

1571 free_model(ctx, 0);

1572 for (int ch = 0; ch < s->channels && s->st; ch++) {

1573 av_tx_uninit(&s->st[ch].tx);

1574 av_tx_uninit(&s->st[ch].txi);

1575 }

1576 av_freep(&s->st);

1577 }

1578

/* The single audio input pad; config_input is its config_props callback. */
static const AVFilterPad inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
    },
};

1586

/* Byte offset of an option's storage inside AudioRNNContext. */
#define OFFSET(x) offsetof(AudioRNNContext, x)
/* Flags shared by all options below: audio, filtering, changeable at runtime. */
#define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM

/*
 * User options. "model" and "m" both store into model_name; "mix" is a
 * float in [-1, 1] with default 1.0.
 */
static const AVOption arnndn_options[] = {
    { "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "m",     "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "mix",   "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
    { NULL }
};

AVFILTER_DEFINE_CLASS(arnndn);

1598

/*
 * Filter definition for "arnndn": wires the callbacks defined in this file
 * (activate, init, uninit, process_command, query_formats) to the filter,
 * and declares internal timeline support and slice threading.
 */
const FFFilter ff_af_arnndn = {
    .p.name        = "arnndn",
    .p.description = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
    .p.priv_class  = &arnndn_class,
    .p.flags       = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
                     AVFILTER_FLAG_SLICE_THREADS,
    .priv_size     = sizeof(AudioRNNContext),
    .activate      = activate,
    .init          = init,
    .uninit        = uninit,
    FILTER_INPUTS(inputs),
    FILTER_OUTPUTS(ff_audio_default_filterpad),
    FILTER_QUERY_FUNC2(query_formats),
    .process_command = process_command,
};

error

static void error(const char *err)

Definition: target_bsf_fuzzer.c:32

flags

const SwsFlags flags[]

Definition: swscale.c:61

compute_dense

static void compute_dense(const DenseLayer *layer, float *output, const float *input)

Definition: af_arnndn.c:1251

ff_get_audio_buffer

AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)

Request an audio samples buffer with a specific set of permissions.

Definition: audio.c:98

AV_SAMPLE_FMT_FLTP

@ AV_SAMPLE_FMT_FLTP

float, planar

Definition: samplefmt.h:66

PITCH_MAX_PERIOD

#define PITCH_MAX_PERIOD

Definition: af_arnndn.c:52

pitch_downsample

static void pitch_downsample(float *x[], float *x_lp, int len, int C)

Definition: af_arnndn.c:741

WEIGHTS_SCALE

#define WEIGHTS_SCALE

Definition: af_arnndn.c:65

mix

static int mix(int c0, int c1)

Definition: 4xm.c:717

DenoiseState::synthesis_mem

float synthesis_mem[FRAME_SIZE]

Definition: af_arnndn.c:123

const char * r

Definition: vf_curves.c:127

AVERROR

Filter the word "frame" indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions

opt.h

activate

static int activate(AVFilterContext *ctx)

Definition: af_arnndn.c:1449

mem_internal.h

GRULayer::activation

int activation

Definition: af_arnndn.c:89

out

FILE * out

Definition: movenc.c:55

dual_inner_prod

static void dual_inner_prod(const float *x, const float *y01, const float *y02, int N, float *xy1, float *xy2)

Definition: af_arnndn.c:783

ff_filter_frame

int ff_filter_frame(AVFilterLink *link, AVFrame *frame)

Send a frame of data to the next filter.

Definition: avfilter.c:1067

sample_fmts

static enum AVSampleFormat sample_fmts[]

Definition: adpcmenc.c:948

FFERROR_NOT_READY

return FFERROR_NOT_READY

Definition: filter_design.txt:204

FREE_GRU

#define FREE_GRU(name)

AVTXContext

Definition: tx_priv.h:235

output

filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output

Definition: filter_design.txt:226

inlink

The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink

Definition: filter_design.txt:212

PITCH_MIN_PERIOD

#define PITCH_MIN_PERIOD

Definition: af_arnndn.c:51

av_frame_free

void av_frame_free(AVFrame **frame)

Free the frame and any dynamically allocated objects in it, e.g.

Definition: frame.c:64

GRULayer::nb_neurons

int nb_neurons

Definition: af_arnndn.c:88

RNNState::noise_gru_state

float * noise_gru_state

Definition: af_arnndn.c:114

uninit

static av_cold void uninit(AVFilterContext *ctx)

Definition: af_arnndn.c:1566

FILTER_INPUTS

#define FILTER_INPUTS(array)

Definition: filters.h:263

inverse_transform

static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)

Definition: af_arnndn.c:424

sample_rates

static const int sample_rates[]

Definition: dcaenc.h:34

AVFrame

This structure describes decoded (raw) audio or video data.

Definition: frame.h:427

DenoiseState::lastg

float lastg[NB_BANDS]

Definition: af_arnndn.c:129

AVOption

AVOption.

Definition: opt.h:429

OFFSET

#define OFFSET(x)

Definition: af_arnndn.c:1587

#define b

Definition: input.c:42

arnndn_options

static const AVOption arnndn_options[]

Definition: af_arnndn.c:1590

frame_synthesis

static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)

Definition: af_arnndn.c:509

NB_DELTA_CEPS

#define NB_DELTA_CEPS

Definition: af_arnndn.c:61

RNNModel::input_dense_size

int input_dense_size

Definition: af_arnndn.c:93

AVComplexFloat

Definition: tx.h:27

FFMAX

#define FFMAX(a, b)

Definition: macros.h:47

AVFilter::name

const char * name

Filter name.

Definition: avfilter.h:220

static const uint64_t c1

Definition: murmur3.c:52

ThreadData::out

AVFrame * out

Definition: af_adeclick.c:526

AVChannelLayout::nb_channels

int nb_channels

Number of channels in this layout.

Definition: channel_layout.h:329

ThreadData::in

AVFrame * in

Definition: af_adecorrelate.c:155

tansig_table

static const float tansig_table[201]

Definition: af_arnndn.c:1174

AVFilterLink

A link between two filters.

Definition: avfilter.h:395

find_best_pitch

static void find_best_pitch(float *xcorr, float *y, int len, int max_pitch, int *best_pitch)

Definition: af_arnndn.c:906

FF_FILTER_FORWARD_STATUS_BACK

#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)

Forward the status on an output link to an input link.

Definition: filters.h:638

process_command

static int process_command(AVFilterContext *ctx, const char *cmd, const char *args, char *res, int res_len, int flags)

Definition: af_arnndn.c:1536

av_tx_init

av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags)

Initialize a transform context with the given configuration (i)MDCTs with an odd length are currently...

Definition: tx.c:903

DenoiseState::memid

int memid

Definition: af_arnndn.c:122

RNN_CLEAR

#define RNN_CLEAR(dst, n)

Definition: af_arnndn.c:406

GRULayer::nb_inputs

int nb_inputs

Definition: af_arnndn.c:87

compute_band_energy

static void compute_band_energy(float *bandE, const AVComplexFloat *X)

Definition: af_arnndn.c:447

formats.h

compute_rnn

static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)

Definition: af_arnndn.c:1332

DenoiseState::txi

AVTXContext * txi

Definition: af_arnndn.c:132

free_model

static void free_model(AVFilterContext *ctx, int n)

Definition: af_arnndn.c:1522

RNNState::denoise_gru_state

float * denoise_gru_state

Definition: af_arnndn.c:115

rnnoise_channels

static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)

Definition: af_arnndn.c:1408

ACTIVATION_RELU

#define ACTIVATION_RELU

Definition: af_arnndn.c:71

AVComplexFloat::im

float im

Definition: tx.h:28

DenoiseState::mem_hp_x

float mem_hp_x[2]

Definition: af_arnndn.c:128

window

static SDL_Window * window

Definition: ffplay.c:361

cosf

#define cosf(x)

Definition: libm.h:80

log10f

#define log10f(x)

Definition: libm.h:416

AudioRNNContext::model

RNNModel * model[2]

Definition: af_arnndn.c:148

rnnoise_model_free

static void rnnoise_model_free(RNNModel *model)

Definition: af_arnndn.c:157

AudioRNNContext::st

DenoiseState * st

Definition: af_arnndn.c:143

DenoiseState::cepstral_mem

float cepstral_mem[CEPS_MEM][NB_BANDS]

Definition: af_arnndn.c:121

SQUARE

#define SQUARE(x)

Definition: af_arnndn.c:56

#define AF

Definition: af_arnndn.c:1588

DenseLayer::bias

const float * bias

Definition: af_arnndn.c:76

AVFilterPad

A filter pad used for either input or output.

Definition: filters.h:39

FREQ_SIZE

#define FREQ_SIZE

Definition: af_arnndn.c:49

#define T(x)

Definition: vpx_arith.h:29

compute_band_corr

static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)

Definition: af_arnndn.c:472

DenoiseState::history

float history[FRAME_SIZE]

Definition: af_arnndn.c:130

s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C

Definition: writing_filters.txt:58

avassert.h

AV_LOG_ERROR

#define AV_LOG_ERROR

Something went wrong and cannot losslessly be recovered.

Definition: log.h:210

av_cold

#define av_cold

Definition: attributes.h:106

av_tx_fn

void(* av_tx_fn)(AVTXContext *s, void *out, void *in, ptrdiff_t stride)

Function pointer to a function to perform the transform.

Definition: tx.h:151

FFFilter

Definition: filters.h:266

float

Definition: af_crystalizer.c:122

MAX_NEURONS

#define MAX_NEURONS

Definition: af_arnndn.c:67

#define s(width, name)

Definition: cbs_vp9.c:198

frame_analysis

static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)

Definition: af_arnndn.c:497

DenseLayer::nb_inputs

int nb_inputs

Definition: af_arnndn.c:78

CEPS_MEM

#define CEPS_MEM

Definition: af_arnndn.c:60

floor

static __device__ float floor(float a)

Definition: cuda_runtime.h:173

inputs

static const AVFilterPad inputs[]

Definition: af_arnndn.c:1579

const char * g

Definition: vf_curves.c:128

celt_inner_prod

static float celt_inner_prod(const float *x, const float *y, int N)

Definition: af_arnndn.c:595

AVMEDIA_TYPE_AUDIO

@ AVMEDIA_TYPE_AUDIO

Definition: avutil.h:201

av_assert0

#define av_assert0(cond)

assert() equivalent, that is always enabled.

Definition: avassert.h:41

filters.h

AV_TX_FLOAT_FFT

@ AV_TX_FLOAT_FFT

Standard complex to complex FFT with sample data type of AVComplexFloat, AVComplexDouble or AVComplex...

Definition: tx.h:47

ff_set_common_samplerates_from_list2

int ff_set_common_samplerates_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *samplerates)

Definition: formats.c:989

#define P

ctx

AVFormatContext * ctx

Definition: movenc.c:49

RNNModel::vad_gru_size

int vad_gru_size

Definition: af_arnndn.c:96

#define xi(width, name, var, range_min, range_max, subs,...)

Definition: cbs_h2645.c:418

rnnoise_model_from_file

static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)

Definition: af_arnndn.c:187

config_input

static int config_input(AVFilterLink *inlink)

Definition: af_arnndn.c:347

FRAME_SIZE_SHIFT

#define FRAME_SIZE_SHIFT

Definition: af_arnndn.c:46

ACTIVATION_TANH

#define ACTIVATION_TANH

Definition: af_arnndn.c:69

FILTER_OUTPUTS

#define FILTER_OUTPUTS(array)

Definition: filters.h:264

file_open.h

#define E

Definition: avdct.c:34

tmp

static uint8_t tmp[40]

Definition: aes_ctr.c:52

arg

const char * arg

Definition: jacosubdec.c:67

FFABS

#define FFABS(a)

Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...

Definition: common.h:74

if(ret)

Definition: filter_design.txt:179

#define M(a, b)

Definition: vp3dsp.c:48

RNNModel::vad_gru

const GRULayer * vad_gru

Definition: af_arnndn.c:97

AVClass

Describe the class of an AVClass context structure.

Definition: log.h:76

ff_inlink_consume_samples

int ff_inlink_consume_samples(AVFilterLink *link, unsigned min, unsigned max, AVFrame **rframe)

Take samples from the link's FIFO and update the link's stats.

Definition: avfilter.c:1537

NULL

#define NULL

Definition: coverity.c:32

LOCAL_ALIGNED_32

#define LOCAL_ALIGNED_32(t, v,...)

Definition: mem_internal.h:132

av_frame_copy_props

int av_frame_copy_props(AVFrame *dst, const AVFrame *src)

Copy only "metadata" fields from src to dst.

Definition: frame.c:599

sigmoid_approx

static float sigmoid_approx(float x)

Definition: af_arnndn.c:1246

RNNModel::denoise_gru_size

int denoise_gru_size

Definition: af_arnndn.c:102

RNNModel::vad_output

const DenseLayer * vad_output

Definition: af_arnndn.c:109

isnan

#define isnan(x)

Definition: libm.h:342

GRULayer::recurrent_weights

const float * recurrent_weights

Definition: af_arnndn.c:86

FREE_DENSE

#define FREE_DENSE(name)

PITCH_BUF_SIZE

#define PITCH_BUF_SIZE

Definition: af_arnndn.c:54

ff_audio_default_filterpad

const AVFilterPad ff_audio_default_filterpad[1]

An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.

Definition: audio.c:34

sqrtf

static __device__ float sqrtf(float a)

Definition: cuda_runtime.h:184

PITCH_FRAME_SIZE

#define PITCH_FRAME_SIZE

Definition: af_arnndn.c:53

av_clipf

Definition: af_crystalizer.c:122

RNNModel::input_dense

const DenseLayer * input_dense

Definition: af_arnndn.c:94

Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c

Definition: undefined.txt:32

AVFilterFormatsConfig

Lists of formats / etc.

Definition: avfilter.h:121

DenseLayer::input_weights

const float * input_weights

Definition: af_arnndn.c:77

float_dsp.h

biquad

static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N)

Definition: af_arnndn.c:391

DenoiseState::pitch_buf

float pitch_buf[PITCH_BUF_SIZE]

Definition: af_arnndn.c:124

Definition: af_crystalizer.c:122

INPUT_SIZE

#define INPUT_SIZE

Definition: af_arnndn.c:1330

NULL_IF_CONFIG_SMALL

#define NULL_IF_CONFIG_SMALL(x)

Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.

Definition: internal.h:94

NB_BANDS

#define NB_BANDS

Definition: af_arnndn.c:58

DECLARE_ALIGNED

#define DECLARE_ALIGNED(n, t, v)

Definition: mem_internal.h:104

shift

static int shift(int a, int b)

Definition: bonk.c:261

DenseLayer::nb_neurons

int nb_neurons

Definition: af_arnndn.c:79

AV_SAMPLE_FMT_NONE

@ AV_SAMPLE_FMT_NONE

Definition: samplefmt.h:56

celt_autocorr

static int celt_autocorr(const float *x, float *ac, const float *window, int overlap, int lag, int n)

Definition: af_arnndn.c:627

WINDOW_SIZE

#define WINDOW_SIZE

Definition: af_arnndn.c:48

AVComplexFloat::re

float re

Definition: tx.h:28

AudioRNNContext::mix

float mix

Definition: af_arnndn.c:140

AVFloatDSPContext

Definition: float_dsp.h:24

RNNModel::noise_gru_size

int noise_gru_size

Definition: af_arnndn.c:99

celt_lpc

static void celt_lpc(float *lpc, const float *ac, int p)

Definition: af_arnndn.c:665

ff_filter_process_command

int ff_filter_process_command(AVFilterContext *ctx, const char *cmd, const char *arg, char *res, int res_len, int flags)

Generic processing of user supplied commands that are set in the same way as the filter options.

Definition: avfilter.c:905

DenoiseState::rnn

RNNState rnn[2]

Definition: af_arnndn.c:131

The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a

Definition: undefined.txt:41

RNN_MOVE

#define RNN_MOVE(dst, src, n)

Definition: af_arnndn.c:405

offset

it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset

Definition: writing_filters.txt:86

FF_FILTER_FORWARD_WANTED

FF_FILTER_FORWARD_WANTED(outlink, inlink)

#define N

Definition: af_mcompand.c:54

RNNModel::denoise_gru

const GRULayer * denoise_gru

Definition: af_arnndn.c:103

input

and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input

Definition: filter_design.txt:172

DenoiseState::last_gain

float last_gain

Definition: af_arnndn.c:126

M_PI

#define M_PI

Definition: mathematics.h:67

av_tx_uninit

av_cold void av_tx_uninit(AVTXContext **ctx)

Frees a context and sets *ctx to NULL, does nothing when *ctx == NULL.

Definition: tx.c:295

AudioRNNContext::channels

int channels

Definition: af_arnndn.c:142

DenoiseState::tx

AVTXContext * tx

Definition: af_arnndn.c:132

ACTIVATION_SIGMOID

#define ACTIVATION_SIGMOID

Definition: af_arnndn.c:70

AudioRNNContext::model_name

char * model_name

Definition: af_arnndn.c:139

AV_OPT_TYPE_FLOAT

@ AV_OPT_TYPE_FLOAT

Underlying C type is float.

Definition: opt.h:271

#define i(width, name, range_min, range_max)

Definition: cbs_h2645.c:256

DenoiseState

Definition: af_arnndn.c:119

RNN_COPY

#define RNN_COPY(dst, src, n)

Definition: af_arnndn.c:407

AVFrame::extended_data

uint8_t ** extended_data

pointers to the data planes/channels.

Definition: frame.h:488

ff_filter_get_nb_threads

int ff_filter_get_nb_threads(AVFilterContext *ctx)

Get number of threads for current filter instance.

Definition: avfilter.c:845

AVSampleFormat

Audio sample formats.

Definition: samplefmt.h:55

ThreadData

Used for passing data between threads.

Definition: dsddec.c:71

interp_band_gain

static void interp_band_gain(float *g, const float *bandE)

Definition: af_arnndn.c:1128

FILTER_QUERY_FUNC2

#define FILTER_QUERY_FUNC2(func)

Definition: filters.h:240

FFMIN

#define FFMIN(a, b)

Definition: macros.h:49

dct

static void dct(AudioRNNContext *s, float *out, const float *in)

Definition: af_arnndn.c:1010

AudioRNNContext

Definition: af_arnndn.c:136

FRAME_SIZE

#define FRAME_SIZE

Definition: af_arnndn.c:47

len

int len

Definition: vorbis_enc_data.h:426

AudioRNNContext::dct_table

float dct_table[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)]

Definition: af_arnndn.c:146

AVFilterPad::name

const char * name

Pad name.

Definition: filters.h:45

avpriv_fopen_utf8

FILE * avpriv_fopen_utf8(const char *path, const char *mode)

Open a file using a UTF-8 filename.

Definition: file_open.c:161

av_calloc

void * av_calloc(size_t nmemb, size_t size)

Definition: mem.c:264

stride

#define stride

Definition: h264pred_template.c:536

open_model

static int open_model(AVFilterContext *ctx, RNNModel **model)

Definition: af_arnndn.c:1471

state

static struct @531 state

ret

Definition: filter_design.txt:187

RNNModel

Definition: af_arnndn.c:92

@ X

Definition: vf_addroi.c:27

FFSWAP

#define FFSWAP(type, a, b)

Definition: macros.h:52

compute_frame_features

static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P, float *Ex, float *Ep, float *Exp, float *features, const float *in)

Definition: af_arnndn.c:1020

DenseLayer

Definition: af_arnndn.c:75

GRULayer::input_weights

const float * input_weights

Definition: af_arnndn.c:85

query_formats

static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)

Definition: af_arnndn.c:330

AudioRNNContext::window

float window[WINDOW_SIZE]

Definition: af_arnndn.c:145

second_check

static const uint8_t second_check[16]

Definition: af_arnndn.c:802

remove_doubling

static float remove_doubling(float *x, int maxperiod, int minperiod, int N, int *T0_, int prev_period, float prev_gain)

Definition: af_arnndn.c:803

RNNModel::denoise_output_size

int denoise_output_size

Definition: af_arnndn.c:105

ff_set_common_formats_from_list2

int ff_set_common_formats_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *fmts)

Definition: formats.c:1085

compute_pitch_gain

static float compute_pitch_gain(float xy, float xx, float yy)

Definition: af_arnndn.c:797

AVFILTER_DEFINE_CLASS

AVFILTER_DEFINE_CLASS(arnndn)

ff_filter_execute

int ff_filter_execute(AVFilterContext *ctx, avfilter_action_func *func, void *arg, int *ret, int nb_jobs)

Definition: avfilter.c:1693

xcorr_kernel

static void xcorr_kernel(const float *x, const float *y, float sum[4], int len)

Definition: af_arnndn.c:526

RNNModel::vad_output_size

int vad_output_size

Definition: af_arnndn.c:108

pitch_search

static void pitch_search(const float *x_lp, float *y, int len, int max_pitch, int *pitch)

Definition: af_arnndn.c:953

pitch_filter

static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep, const float *Exp, const float *g)

Definition: af_arnndn.c:1143

avfilter.h

ff_af_arnndn

const FFFilter ff_af_arnndn

Definition: af_arnndn.c:1599

celt_pitch_xcorr

static void celt_pitch_xcorr(const float *x, const float *y, float *xcorr, int len, int max_pitch)

Definition: af_arnndn.c:606

RNNState::vad_gru_state

float * vad_gru_state

Definition: af_arnndn.c:113

Windows::Graphics::DirectX::Direct3D11::p

IDirect3DDxgiInterfaceAccess _COM_Outptr_ void ** p

Definition: vsrc_gfxcapture_winrt.hpp:53

INPUT_GRU

#define INPUT_GRU(name)

rnnoise_channel

static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in, int disabled)

Definition: af_arnndn.c:1360

celt_fir5

static void celt_fir5(const float *x, const float *num, float *y, int N, float *mem)

Definition: af_arnndn.c:698

filter_frame

static int filter_frame(AVFilterLink *inlink, AVFrame *in)

Definition: af_arnndn.c:1427

AVFilterContext

An instance of a filter.

Definition: avfilter.h:274

DenoiseState::pitch_enh_buf

float pitch_enh_buf[PITCH_BUF_SIZE]

Definition: af_arnndn.c:125

AVFILTER_FLAG_SLICE_THREADS

#define AVFILTER_FLAG_SLICE_THREADS

The filter supports multithreading by splitting frames into multiple parts and processing them concur...

Definition: avfilter.h:167

tansig_approx

static float tansig_approx(float x)

Definition: af_arnndn.c:1218

AudioRNNContext::fdsp

AVFloatDSPContext * fdsp

Definition: af_arnndn.c:150

Q15ONE

#define Q15ONE

Definition: af_arnndn.c:73

FFFilter::p

AVFilter p

The public AVFilter.

Definition: filters.h:270

DenoiseState::last_period

int last_period

Definition: af_arnndn.c:127

mem.h

audio.h

DenoiseState::tx_fn

av_tx_fn tx_fn

Definition: af_arnndn.c:133

forward_transform

static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)

Definition: af_arnndn.c:409

AVFilterLink::ch_layout

AVChannelLayout ch_layout

channel layout of current buffer (see libavutil/channel_layout.h)

Definition: avfilter.h:422

av_free

#define av_free(p)

Definition: tableprint_vlc.h:34

scale

static void scale(int *out, const int *in, const int w, const int h, const int shift)

Definition: intra.c:273

FF_FILTER_FORWARD_STATUS

FF_FILTER_FORWARD_STATUS(inlink, outlink)

FFALIGN

#define FFALIGN(x, a)

Definition: macros.h:78

alpha

static const int16_t alpha[]

Definition: ilbcdata.h:55

av_freep

#define av_freep(p)

Definition: tableprint_vlc.h:35

avpriv_float_dsp_alloc

av_cold AVFloatDSPContext * avpriv_float_dsp_alloc(int bit_exact)

Allocate a float DSP context.

Definition: float_dsp.c:135

DenoiseState::txi_fn

av_tx_fn txi_fn

Definition: af_arnndn.c:133

AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL

#define AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL

Same as AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, except that the filter will have its filter_frame() c...

Definition: avfilter.h:205

DenseLayer::activation

int activation

Definition: af_arnndn.c:80

RNNModel::denoise_output

const DenseLayer * denoise_output

Definition: af_arnndn.c:106

av_log

#define av_log(a,...)

Definition: tableprint_vlc.h:27

AVERROR_INVALIDDATA

#define AVERROR_INVALIDDATA

Invalid data found when processing input.

Definition: error.h:61

Definition: vp9dsp_template.c:2070

RNNState

Definition: af_arnndn.c:112

ALLOC_LAYER

#define ALLOC_LAYER(type, name)

AV_OPT_TYPE_STRING

@ AV_OPT_TYPE_STRING

Underlying C type is a uint8_t* that is either NULL or points to a C string allocated with the av_mal...

Definition: opt.h:276

GRULayer

Definition: af_arnndn.c:83

compute_gru

static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)

Definition: af_arnndn.c:1279

eband5ms

static const uint8_t eband5ms[]

Definition: af_arnndn.c:442

GRULayer::bias

const float * bias

Definition: af_arnndn.c:84

INPUT_DENSE

#define INPUT_DENSE(name)

RNNModel::noise_gru

const GRULayer * noise_gru

Definition: af_arnndn.c:100

NB_FEATURES

#define NB_FEATURES

Definition: af_arnndn.c:63

src

#define src

Definition: vp8dsp.c:248

init

static av_cold int init(AVFilterContext *ctx)

Definition: af_arnndn.c:1493

tx.h

RNNState::model

RNNModel * model

Definition: af_arnndn.c:116

DenoiseState::analysis_mem

float analysis_mem[FRAME_SIZE]

Definition: af_arnndn.c:120

Generated on Sat Oct 18 2025 19:22:54 for FFmpeg by doxygen 1.8.17