1 /*
2 * Copyright (c) 2025 Vittorio Palmisano
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License
8 * as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <stdio.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24
25 #include <whisper.h>
26
38
40
51
55
59
65
68
72
74 {
78 case GGML_LOG_LEVEL_ERROR:
80 break;
81 case GGML_LOG_LEVEL_WARN:
83 break;
84 }
86 }
87
89 {
91
94
96
97 // Init whisper context
101 }
102
103 struct whisper_context_params params = whisper_context_default_params();
104 params.use_gpu = wctx->
use_gpu;
106
111 }
112
113 // Init buffer
118
119 // Init VAD model context
121 struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
123 // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
126
127 wctx->
vad_params = whisper_vad_default_params();
134 }
135
137
140 if (!strcmp(
"-",
dst))
143
147 }
148
150 }
151
153 "Whisper filter initialized: model: %s lang: %s queue: %" PRId64 " ms\n",
155
156 return 0;
157 }
158
160 {
162
165 "Remaining audio buffer %d samples (%d seconds) after stopping\n",
167 }
168
170 whisper_vad_free(wctx->
ctx_vad);
172 }
173
177 }
178
180
183 }
184
186 {
189
191 return;
192
195
197 "run transcription at %" PRId64 " ms, %d/%d samples (%.2f seconds)...\n",
199
200 struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
203 params.print_special = 0;
204 params.print_progress = 0;
205 params.print_realtime = 0;
206 params.print_timestamps = 0;
207
210 return;
211 }
212
213 const int n_segments = whisper_full_n_segments(wctx->
ctx_wsp);
214 char *segments_text =
NULL;
215
216 for (
int i = 0;
i < n_segments; ++
i) {
217 const char *text = whisper_full_get_segment_text(wctx->
ctx_wsp,
i);
219 text++;
221
224 continue;
225 }
226
227 const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->
ctx_wsp,
i);
228 const int64_t t0_ms = whisper_full_get_segment_t0(wctx->
ctx_wsp,
i) * 10;
229 const int64_t t1_ms = whisper_full_get_segment_t1(wctx->
ctx_wsp,
i) * 10;
230
232 timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);
233
234 if (segments_text) {
235 char *new_text =
av_asprintf(
"%s%s", segments_text, text_cleaned);
237 segments_text = new_text;
238 } else
240
242 const int64_t start_t = timestamp_ms + t0_ms;
243 const int64_t end_t = timestamp_ms + t1_ms;
245
247 buf =
249 ("%d\n%02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 " --> %02" PRId64 ":%02" PRId64 ":%02" PRId64 ",%03" PRId64 "\n%s\n\n",
250 wctx->
index, start_t / 3600000,
251 (start_t / 60000) % 60, (start_t / 1000) % 60,
252 start_t % 1000, end_t / 3600000, (end_t / 60000) % 60,
253 (end_t / 1000) % 60, end_t % 1000, text_cleaned);
254
257 buf =
av_asprintf(
"{\"start\":%" PRId64
",\"end\":%" PRId64
",\"text\":\"%s\"}\n", start_t, end_t, text_cleaned);
258 } else
260
261 if (buf) {
264 }
265 }
266
268 }
269
275 }
277
282 }
285 }
286
288 {
292
295
298 }
299
303 (
AVRational) {inlink->time_base.den, inlink->time_base.num});
306
310 struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->
ctx_vad,
315
316 if (!segments) {
318 } else {
319 int n_segments = whisper_vad_segments_n_segments(segments);
320
321 if (n_segments > 0) {
322 const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
323 const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
324 int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
325
326 if (end_pos <= wctx->audio_buffer_fill_size -
329 "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
332 }
333 }
334
335 whisper_vad_free_segments(segments);
336 }
339
344 }
345
347 {
351 int n_out = 1;
352
354 return 0;
358
360
366
368
370 }
371
373 {
379
381
385
391 }
392
395
398
400 return 0;
401 }
402
404
406 }
407
411 {
416
420
424
426 }
427
/* Byte offset of member x within WhisperContext. */
#define OFFSET(x) offsetof(WhisperContext, x)
/*
 * Combined option flags. The expansion is parenthesized so it always acts
 * as a single operand, e.g. in `~FLAGS` or `FLAGS & mask`.
 */
#define FLAGS (AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
/*
 * 3600000000 does not fit in 32 bits, so the constant is spelled as an
 * explicit 64-bit literal.
 * NOTE(review): presumably one hour expressed in microseconds, used as an
 * upper bound for a duration option -- confirm against the option table.
 */
#define HOURS INT64_C(3600000000)
431
445 };
446
452 };
453
466 };