FFmpeg: libavfilter/af_whisper.c Source File

FFmpeg

[フレーム]

libavfilter

af_whisper.c

Go to the documentation of this file.

1 /*

3 *

4 * This file is part of FFmpeg.

5 *

6 * FFmpeg is free software; you can redistribute it and/or

7 * modify it under the terms of the GNU Lesser General Public License

8 * as published by the Free Software Foundation; either

9 * version 2.1 of the License, or (at your option) any later version.

10 *

11 * FFmpeg is distributed in the hope that it will be useful,

12 * but WITHOUT ANY WARRANTY; without even the implied warranty of

13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 * GNU Lesser General Public License for more details.

15 *

16 * You should have received a copy of the GNU Lesser General Public License

17 * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,

18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

19 */

21 #include <stdio.h>

22 #include <stdint.h>

23 #include <stdlib.h>

25 #include <whisper.h>

27 #include "libavutil/avutil.h"

28 #include "libavutil/opt.h"

29 #include "libavutil/channel_layout.h"

30 #include "libavutil/samplefmt.h"

31 #include "libavfilter/avfilter.h"

32 #include "libavfilter/audio.h"

33 #include "libavutil/mem.h"

34 #include "libavutil/avstring.h"

35 #include "libavutil/internal.h"

36 #include "libavformat/avio.h"

37 #include "libavutil/thread.h"

39 #include "formats.h"

41 typedef struct WhisperContext {

42 const AVClass *class;

43 char *model_path;

44 char *language;

45 bool use_gpu;

46 int gpu_device;

47 char *vad_model_path;

48 float vad_threshold;

49 int64_t vad_min_speech_duration;

50 int64_t vad_min_silence_duration;

52 int64_t queue;

53 char *destination;

54 char *format;

56 struct whisper_context *ctx_wsp;

57 struct whisper_vad_context *ctx_vad;

58 struct whisper_vad_params vad_params;

60 float *audio_buffer;

61 int audio_buffer_queue_size;

62 int audio_buffer_fill_size;

63 int audio_buffer_vad_size;

64 int64_t audio_buffer_start_ms;

66 int eof;

67 int64_t next_pts;

69 AVIOContext *avio_context;

70 int index;

71 } WhisperContext;

73 static void cb_log(enum ggml_log_level level, const char *text, void *user_data)

74 {

75 AVFilterContext *ctx = user_data;

76 int av_log_level = AV_LOG_DEBUG;

77 switch (level) {

78 case GGML_LOG_LEVEL_ERROR:

79 av_log_level = AV_LOG_ERROR;

80 break;

81 case GGML_LOG_LEVEL_WARN:

82 av_log_level = AV_LOG_WARNING;

83 break;

84 }

85 av_log(ctx, av_log_level, "%s", text);

86 }

88 static int init(AVFilterContext *ctx)

89 {

90 WhisperContext *wctx = ctx->priv;

92 static AVOnce init_static_once = AV_ONCE_INIT;

93 ff_thread_once(&init_static_once, ggml_backend_load_all);

95 whisper_log_set(cb_log, ctx);

97 // Init whisper context

98 if (!wctx->model_path) {

99 av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");

100 return AVERROR(EINVAL);

101 }

102

103 struct whisper_context_params params = whisper_context_default_params();

104 params.use_gpu = wctx->use_gpu;

105 params.gpu_device = wctx->gpu_device;

106

107 wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);

108 if (wctx->ctx_wsp == NULL) {

109 av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);

110 return AVERROR(EIO);

111 }

112

113 // Init buffer

114 wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);

115 wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));

116 if (!wctx->audio_buffer)

117 return AVERROR(ENOMEM);

118

119 // Init VAD model context

120 if (wctx->vad_model_path) {

121 struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();

122 ctx_params.n_threads = ff_filter_get_nb_threads(ctx);

123 // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)

124 ctx_params.gpu_device = wctx->gpu_device;

125 wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);

126

127 wctx->vad_params = whisper_vad_default_params();

128 wctx->vad_params.threshold = wctx->vad_threshold;

129 wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);

130 wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);

131 wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);

132 wctx->vad_params.speech_pad_ms = 0;

133 wctx->vad_params.samples_overlap = 0;

134 }

135

136 wctx->next_pts = AV_NOPTS_VALUE;

137

138 if (wctx->destination && strcmp("", wctx->destination)) {

139 const char *dst = wctx->destination;

140 if (!strcmp("-", dst))

141 dst = "pipe:1";

142 int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);

143

144 if (ret < 0) {

145 av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));

146 return ret;

147 }

148

149 wctx->avio_context->direct = AVIO_FLAG_DIRECT;

150 }

151

152 av_log(ctx, AV_LOG_INFO,

153 "Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",

154 wctx->model_path, wctx->language, wctx->queue / 1000);

155

156 return 0;

157 }

158

159 static void uninit(AVFilterContext *ctx)

160 {

161 WhisperContext *wctx = ctx->priv;

162

163 if (wctx->audio_buffer_fill_size > 0) {

164 av_log(ctx, AV_LOG_WARNING,

165 "Remaining audio buffer %d samples (%d seconds) after stopping\n",

166 wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);

167 }

168

169 if (wctx->ctx_vad) {

170 whisper_vad_free(wctx->ctx_vad);

171 wctx->ctx_vad = NULL;

172 }

173

174 if (wctx->ctx_wsp) {

175 whisper_free(wctx->ctx_wsp);

176 wctx->ctx_wsp = NULL;

177 }

178

179 av_freep(&wctx->audio_buffer);

180

181 if (wctx->avio_context)

182 avio_closep(&wctx->avio_context);

183 }

184

185 static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)

186 {

187 WhisperContext *wctx = ctx->priv;

188 samples = FFMAX(0, FFMIN(samples, wctx->audio_buffer_fill_size));

189

190 if (!wctx->ctx_wsp || samples == 0)

191 return;

192

193 const int64_t timestamp_ms = wctx->audio_buffer_start_ms;

194 const float duration = (float) samples / WHISPER_SAMPLE_RATE;

195

196 av_log(ctx, AV_LOG_INFO,

197 "run transcription at %" PRId64 " ms, %d/%d samples (%.2f seconds)...\n",

198 timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);

199

200 struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

201 params.language = wctx->language;

202 params.n_threads = ff_filter_get_nb_threads(ctx);

203 params.print_special = 0;

204 params.print_progress = 0;

205 params.print_realtime = 0;

206 params.print_timestamps = 0;

207

208 if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {

209 av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");

210 return;

211 }

212

213 const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);

214 char *segments_text = NULL;

215

216 for (int i = 0; i < n_segments; ++i) {

217 const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);

218 if (av_isspace(text[0]))

219 text++;

220 char *text_cleaned = av_strireplace(text, "[BLANK_AUDIO]", "");

221

222 if (av_strnlen(text_cleaned, 1) == 0) {

223 av_freep(&text_cleaned);

224 continue;

225 }

226

227 const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);

228 const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;

229 const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;

230

231 av_log(ctx, AV_LOG_DEBUG, " [%" PRId64 "-%" PRId64 "%s]: \"%s\"\n",

232 timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);

233

234 if (segments_text) {

235 char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);

236 av_freep(&segments_text);

237 segments_text = new_text;

238 } else

239 segments_text = av_strdup(text_cleaned);

240

241 if (wctx->avio_context) {

242 const int64_t start_t = timestamp_ms + t0_ms;

243 const int64_t end_t = timestamp_ms + t1_ms;

244 char *buf = NULL;

245

246 if (!av_strcasecmp(wctx->format, "srt")) {

247 buf =

248 av_asprintf

249 ("%d\n%02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 " --> %02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 "\n%s\n\n",

250 wctx->index, start_t / 3600000,

251 (start_t / 60000) % 60, (start_t / 1000) % 60,

252 start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,

253 (end_t / 1000) % 60, end_t % 1000, text_cleaned);

254

255 wctx->index++;

256 } else if (!av_strcasecmp(wctx->format, "json")) {

257 buf = av_asprintf("{\"start\":%" PRId64 ",\"end\":%" PRId64 ",\"text\":\"%s\"}\n", start_t, end_t, text_cleaned);

258 } else

259 buf = av_strdup(text_cleaned);

260

261 if (buf) {

262 avio_write(wctx->avio_context, buf, strlen(buf));

263 av_freep(&buf);

264 }

265 }

266

267 av_freep(&text_cleaned);

268 }

269

270 AVDictionary **metadata = &frame->metadata;

271 if (metadata && segments_text) {

272 av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);

273 char *duration_text = av_asprintf("%f", duration);

274 av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);

275 }

276 av_freep(&segments_text);

277

278 if (wctx->audio_buffer_fill_size > samples) {

279 memcpy(wctx->audio_buffer, wctx->audio_buffer + samples,

280 (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));

281 wctx->audio_buffer_start_ms += duration * 1000;

282 }

283 wctx->audio_buffer_fill_size -= samples;

284 wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;

285 }

286

287 static int filter_frame(AVFilterLink *inlink, AVFrame *frame)

288 {

289 AVFilterContext *ctx = inlink->dst;

290 WhisperContext *wctx = ctx->priv;

291 AVFilterLink *outlink = ctx->outputs[0];

292

293 const int samples = frame->nb_samples;

294 const float *input_data = (const float *) frame->data[0];

295

296 if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) {

297 run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

298 }

299

300 if (!wctx->audio_buffer_fill_size)

301 wctx->audio_buffer_start_ms = av_rescale_q(frame->pts,

302 (AVRational) {1000, 1},

303 (AVRational) {inlink->time_base.den, inlink->time_base.num});

304 memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));

305 wctx->audio_buffer_fill_size += samples;

306

307 if (wctx->ctx_vad

308 && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=

309 av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {

310 struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,

311 wctx->vad_params,

312 wctx->audio_buffer,

313 wctx->audio_buffer_fill_size);

314 wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;

315

316 if (!segments) {

317 av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");

318 } else {

319 int n_segments = whisper_vad_segments_n_segments(segments);

320

321 if (n_segments > 0) {

322 const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;

323 const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;

324 int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);

325

326 if (end_pos <= wctx->audio_buffer_fill_size -

327 av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {

328 av_log(ctx, AV_LOG_INFO,

329 "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",

330 n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);

331 run_transcription(ctx, frame, end_pos);

332 }

333 }

334

335 whisper_vad_free_segments(segments);

336 }

337 } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)

338 run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

339

340 wctx->next_pts = frame->pts + av_rescale_q(samples, (AVRational) {

341 1, inlink->sample_rate}

342 , inlink->time_base);

343 return ff_filter_frame(outlink, frame);

344 }

345

346 static int push_last_frame(AVFilterLink *outlink)

347 {

348 AVFilterContext *ctx = outlink->src;

349 WhisperContext *wctx = ctx->priv;

350 AVFrame *frame;

351 int n_out = 1;

352

353 if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)

354 return 0;

355 frame = ff_get_audio_buffer(outlink, n_out);

356 if (!frame)

357 return AVERROR(ENOMEM);

358

359 av_samples_set_silence(frame->extended_data, 0, n_out, frame->ch_layout.nb_channels, frame->format);

360

361 frame->pts = wctx->next_pts;

362 if (wctx->next_pts != AV_NOPTS_VALUE)

363 wctx->next_pts += av_rescale_q(n_out, (AVRational) {

364 1, outlink->sample_rate}

365 , outlink->time_base);

366

367 run_transcription(ctx, frame, wctx->audio_buffer_fill_size);

368

369 return ff_filter_frame(outlink, frame);

370 }

371

372 static int activate(AVFilterContext *ctx)

373 {

374 AVFilterLink *inlink = ctx->inputs[0];

375 AVFilterLink *outlink = ctx->outputs[0];

376 WhisperContext *wctx = ctx->priv;

377 int64_t pts;

378 int status;

379

380 FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

381

382 if (!wctx->eof && ff_inlink_queued_frames(inlink)) {

383 AVFrame *frame = NULL;

384 int ret;

385

386 ret = ff_inlink_consume_frame(inlink, &frame);

387 if (ret < 0)

388 return ret;

389 if (ret > 0)

390 return filter_frame(inlink, frame);

391 }

392

393 if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))

394 wctx->eof = status == AVERROR_EOF;

395

396 if (wctx->eof) {

397 push_last_frame(outlink);

398

399 ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);

400 return 0;

401 }

402

403 FF_FILTER_FORWARD_WANTED(outlink, inlink);

404

405 return FFERROR_NOT_READY;

406 }

407

408 static int query_formats(const AVFilterContext *ctx,

409 AVFilterFormatsConfig **cfg_in,

410 AVFilterFormatsConfig **cfg_out)

411 {

412 static const enum AVSampleFormat sample_fmts[] = { AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_NONE };

413 AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };

414 int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };

415 int ret;

416

417 ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, sample_fmts);

418 if (ret < 0)

419 return ret;

420

421 ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts);

422 if (ret < 0)

423 return ret;

424

425 return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);

426 }

427

/* Byte offset of a WhisperContext member, for the option table. */
#define OFFSET(x) offsetof(WhisperContext, x)
/* Parenthesized so the expansion stays one operand in any expression. */
#define FLAGS (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
/* Upper bound of the duration options: 3600000000 AV_TIME_BASE units. */
#define HOURS 3600000000

431

432 static const AVOption whisper_options[] = {

433 { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },

434 { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },

435 { "queue", "Audio queue size", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },

436 { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },

437 { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },

438 { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },

439 { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },

440 { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },

441 { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },

442 { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },

443 { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },

444 { NULL }

445 };

446

447 static const AVClass whisper_class = {

448 .class_name = "whisper",

449 .item_name = av_default_item_name,

450 .option = whisper_options,

451 .version = LIBAVUTIL_VERSION_INT,

452 };

453

454 const FFFilter ff_af_whisper = {

455 .p.name = "whisper",

456 .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),

457 .p.priv_class = &whisper_class,

458 .p.flags = AVFILTER_FLAG_METADATA_ONLY,

459 .init = init,

460 .uninit = uninit,

461 .activate = activate,

462 .priv_size = sizeof(WhisperContext),

463 FILTER_INPUTS(ff_audio_default_filterpad),

464 FILTER_OUTPUTS(ff_audio_default_filterpad),

465 FILTER_QUERY_FUNC2(query_formats),

466 };

ff_get_audio_buffer

AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)

Request an audio samples buffer with a specific set of permissions.

Definition: audio.c:98

AV_LOG_WARNING

#define AV_LOG_WARNING

Something somehow does not look correct.

Definition: log.h:216

level

uint8_t level

Definition: svq3.c:208

AVERROR

Filter the word "frame" indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions

opt.h

whisper_options

static const AVOption whisper_options[]

Definition: af_whisper.c:432

ff_filter_frame

int ff_filter_frame(AVFilterLink *link, AVFrame *frame)

Send a frame of data to the next filter.

Definition: avfilter.c:1067

sample_fmts

static enum AVSampleFormat sample_fmts[]

Definition: adpcmenc.c:948

thread.h

AVERROR_EOF

#define AVERROR_EOF

End of file.

Definition: error.h:57

FFERROR_NOT_READY

return FFERROR_NOT_READY

Definition: filter_design.txt:204

int64_t

long long int64_t

Definition: coverity.c:34

inlink

The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink

Definition: filter_design.txt:212

av_asprintf

char * av_asprintf(const char *fmt,...)

Definition: avstring.c:115

av_strcasecmp

int av_strcasecmp(const char *a, const char *b)

Locale-independent case-insensitive compare.

Definition: avstring.c:208

av_isspace

static av_const int av_isspace(int c)

Locale-independent conversion of ASCII isspace.

Definition: avstring.h:218

FILTER_INPUTS

#define FILTER_INPUTS(array)

Definition: filters.h:263

sample_rates

static const int sample_rates[]

Definition: dcaenc.h:34

AVFrame

This structure describes decoded (raw) audio or video data.

Definition: frame.h:427

WhisperContext::audio_buffer_vad_size

int audio_buffer_vad_size

Definition: af_whisper.c:63

av_samples_set_silence

int av_samples_set_silence(uint8_t *const *audio_data, int offset, int nb_samples, int nb_channels, enum AVSampleFormat sample_fmt)

Fill an audio buffer with silence.

Definition: samplefmt.c:246

AVOption

AVOption.

Definition: opt.h:429

WhisperContext::language

char * language

Definition: af_whisper.c:44

avio_open

int avio_open(AVIOContext **s, const char *filename, int flags)

Create and initialize a AVIOContext for accessing the resource indicated by url.

Definition: avio.c:498

AV_OPT_TYPE_DURATION

@ AV_OPT_TYPE_DURATION

Underlying C type is int64_t.

Definition: opt.h:319

WhisperContext

Definition: af_whisper.c:41

ff_set_common_channel_layouts_from_list2

int ff_set_common_channel_layouts_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const AVChannelLayout *fmts)

Definition: formats.c:1013

WhisperContext::audio_buffer_queue_size

int audio_buffer_queue_size

Definition: af_whisper.c:61

WhisperContext::use_gpu

bool use_gpu

Definition: af_whisper.c:45

AVDictionary

Definition: dict.c:32

FFMAX

#define FFMAX(a, b)

Definition: macros.h:47

AVFilter::name

const char * name

Filter name.

Definition: avfilter.h:220

AVFilterLink

A link between two filters.

Definition: avfilter.h:395

FF_FILTER_FORWARD_STATUS_BACK

#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)

Forward the status on an output link to an input link.

Definition: filters.h:638

WhisperContext::avio_context

AVIOContext * avio_context

Definition: af_whisper.c:69

formats.h

ff_inlink_consume_frame

int ff_inlink_consume_frame(AVFilterLink *link, AVFrame **rframe)

Take a frame from the link's FIFO and update the link's stats.

Definition: avfilter.c:1517

whisper_class

static const AVClass whisper_class

Definition: af_whisper.c:447

ff_af_whisper

const FFFilter ff_af_whisper

Definition: af_whisper.c:454

samplefmt.h

run_transcription

static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)

Definition: af_whisper.c:185

WhisperContext::vad_threshold

float vad_threshold

Definition: af_whisper.c:48

pts

static int64_t pts

Definition: transcode_aac.c:644

AV_DICT_DONT_STRDUP_VAL

#define AV_DICT_DONT_STRDUP_VAL

Take ownership of a value that's been allocated with av_malloc() or another memory allocation functio...

Definition: dict.h:79

ff_thread_once

static int ff_thread_once(char *control, void(*routine)(void))

Definition: thread.h:205

init

static int init(AVFilterContext *ctx)

Definition: af_whisper.c:88

AV_LOG_ERROR

#define AV_LOG_ERROR

Something went wrong and cannot losslessly be recovered.

Definition: log.h:210

WhisperContext::gpu_device

int gpu_device

Definition: af_whisper.c:46

FFFilter

Definition: filters.h:266

duration

int64_t duration

Definition: movenc.c:65

WhisperContext::audio_buffer_start_ms

int64_t audio_buffer_start_ms

Definition: af_whisper.c:64

float

Definition: af_crystalizer.c:122

ff_outlink_set_status

static void ff_outlink_set_status(AVFilterLink *link, int status, int64_t pts)

Set the status field of a link from the source filter.

Definition: filters.h:628

av_log_level

static int av_log_level

Definition: log.c:58

AVIO_FLAG_WRITE

#define AVIO_FLAG_WRITE

write-only

Definition: avio.h:618

ff_set_common_samplerates_from_list2

int ff_set_common_samplerates_from_list2(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out, const int *samplerates)

Definition: formats.c:1037

AV_LOG_DEBUG

#define AV_LOG_DEBUG

Stuff which is only useful for libav* developers.

Definition: log.h:231

ctx

AVFormatContext * ctx

Definition: movenc.c:49

av_rescale_q

int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq)

Rescale a 64-bit integer by 2 rational numbers.

Definition: mathematics.c:142

FILTER_OUTPUTS

#define FILTER_OUTPUTS(array)

Definition: filters.h:264

if(ret)

Definition: filter_design.txt:179

LIBAVUTIL_VERSION_INT

#define LIBAVUTIL_VERSION_INT

Definition: version.h:85

AV_ONCE_INIT

#define AV_ONCE_INIT

Definition: thread.h:203

AVClass

Describe the class of an AVClass context structure.

Definition: log.h:76

filter_frame

static int filter_frame(AVFilterLink *inlink, AVFrame *frame)

Definition: af_whisper.c:287

metadata

Stream codec metadata

Definition: ogg-flac-chained-meta.txt:2

NULL

#define NULL

Definition: coverity.c:32

format

New swscale design to change SwsGraph is what coordinates multiple passes These can include cascaded scaling error diffusion and so on Or we could have separate passes for the vertical and horizontal scaling In between each SwsPass lies a fully allocated image buffer Graph passes may have different levels of e g we can have a single threaded error diffusion pass following a multi threaded scaling pass SwsGraph is internally recreated whenever the image format

Definition: swscale-v2.txt:14

AVRational

Rational number (pair of numerator and denominator).

Definition: rational.h:58

av_strireplace

char * av_strireplace(const char *str, const char *from, const char *to)

Locale-independent strings replace.

Definition: avstring.c:230

av_strnlen

static size_t av_strnlen(const char *s, size_t len)

Get the count of continuous non zero chars starting from the beginning.

Definition: avstring.h:141

av_default_item_name

const char * av_default_item_name(void *ptr)

Return the context name.

Definition: log.c:241

WhisperContext::audio_buffer_fill_size

int audio_buffer_fill_size

Definition: af_whisper.c:62

WhisperContext::vad_model_path

char * vad_model_path

Definition: af_whisper.c:47

ff_audio_default_filterpad

const AVFilterPad ff_audio_default_filterpad[1]

An AVFilterPad array whose only entry has name "default" and is of type AVMEDIA_TYPE_AUDIO.

Definition: audio.c:34

HOURS

#define HOURS

Definition: af_whisper.c:430

ff_inlink_acknowledge_status

int ff_inlink_acknowledge_status(AVFilterLink *link, int *rstatus, int64_t *rpts)

Test and acknowledge the change of status on the link.

Definition: avfilter.c:1464

AVOnce

#define AVOnce

Definition: thread.h:202

AVFilterFormatsConfig

Lists of formats / etc.

Definition: avfilter.h:121

WhisperContext::model_path

char * model_path

Definition: af_whisper.c:43

ff_inlink_queued_frames

size_t ff_inlink_queued_frames(AVFilterLink *link)

Get the number of frames available on the link.

Definition: avfilter.c:1480

WhisperContext::format

char * format

Definition: af_whisper.c:54

AVIOContext

Bytestream IO Context.

Definition: avio.h:160

activate

static int activate(AVFilterContext *ctx)

Definition: af_whisper.c:372

NULL_IF_CONFIG_SMALL

#define NULL_IF_CONFIG_SMALL(x)

Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.

Definition: internal.h:94

AVChannelLayout

An AVChannelLayout holds information about the channel layout of audio data.

Definition: channel_layout.h:319

WhisperContext::index

int index

Definition: af_whisper.c:70

dst

uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst

Definition: dsp.h:87

av_err2str

#define av_err2str(errnum)

Convenience macro, the return value should be used only directly in function arguments but never stan...

Definition: error.h:122

AV_SAMPLE_FMT_NONE

@ AV_SAMPLE_FMT_NONE

Definition: samplefmt.h:56

avio.h

WhisperContext::eof

int eof

Definition: af_whisper.c:66

AV_NOPTS_VALUE

#define AV_NOPTS_VALUE

Undefined timestamp value.

Definition: avutil.h:247

user_data

static int FUNC() user_data(CodedBitstreamContext *ctx, RWContext *rw, MPEG2RawUserData *current)

Definition: cbs_mpeg2_syntax_template.c:59

AVFilterLink::src

AVFilterContext * src

source filter

Definition: avfilter.h:396

avio_write

void avio_write(AVIOContext *s, const unsigned char *buf, int size)

Definition: aviobuf.c:206

FF_FILTER_FORWARD_WANTED

FF_FILTER_FORWARD_WANTED(outlink, inlink)

WhisperContext::vad_min_silence_duration

int64_t vad_min_silence_duration

Definition: af_whisper.c:50

query_formats

static int query_formats(const AVFilterContext *ctx, AVFilterFormatsConfig **cfg_in, AVFilterFormatsConfig **cfg_out)

Definition: af_whisper.c:408

AV_LOG_INFO

#define AV_LOG_INFO

Standard information.

Definition: log.h:221

AV_OPT_TYPE_FLOAT

@ AV_OPT_TYPE_FLOAT

Underlying C type is float.

Definition: opt.h:271

AVFilterLink::sample_rate

int sample_rate

samples per second

Definition: avfilter.h:421

AVIOContext::direct

int direct

avio_read and avio_write should if possible be satisfied directly instead of going through a buffer,...

Definition: avio.h:268

#define i(width, name, range_min, range_max)

Definition: cbs_h2645.c:256

internal.h

AV_TIME_BASE

#define AV_TIME_BASE

Internal time base represented as integer.

Definition: avutil.h:253

av_malloc_array

#define av_malloc_array(a, b)

Definition: tableprint_vlc.h:32

ff_filter_get_nb_threads

int ff_filter_get_nb_threads(AVFilterContext *ctx)

Get number of threads for current filter instance.

Definition: avfilter.c:845

AVSampleFormat

Audio sample formats.

Definition: samplefmt.h:55

FILTER_QUERY_FUNC2

#define FILTER_QUERY_FUNC2(func)

Definition: filters.h:240

FFMIN

#define FFMIN(a, b)

Definition: macros.h:49

OFFSET

#define OFFSET(x)

Definition: af_whisper.c:428

av_rescale

int64_t av_rescale(int64_t a, int64_t b, int64_t c)

Rescale a 64-bit integer with rounding to nearest.

Definition: mathematics.c:129

uninit

static void uninit(AVFilterContext *ctx)

Definition: af_whisper.c:159

language

Undefined Behavior In the C language

Definition: undefined.txt:3

cb_log

static void cb_log(enum ggml_log_level level, const char *text, void *user_data)

Definition: af_whisper.c:73

ret

Definition: filter_design.txt:187

AVClass::class_name

const char * class_name

The name of the class; usually it is the same name as the context structure type to which the AVClass...

Definition: log.h:81

frame

these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame For filters that do not use the this method is called when a frame is wanted on an output For a it should directly call filter_frame on the corresponding output For a if there are queued frames already one of these frames should be pushed If the filter should request a frame on one of its repeatedly until at least one frame has been pushed Return or at least make progress towards producing a frame

Definition: filter_design.txt:265

WhisperContext::audio_buffer

float * audio_buffer

Definition: af_whisper.c:60

FF_COUNT2LAYOUT

#define FF_COUNT2LAYOUT(c)

Encode a channel count as a channel layout.

Definition: formats.h:102

push_last_frame

static int push_last_frame(AVFilterLink *outlink)

Definition: af_whisper.c:346

status