af_atempo.c 38 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225
  1. /*
  2. * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * tempo scaling audio filter -- an implementation of WSOLA algorithm
  23. *
  24. * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
  25. * from Apprentice Video player by Pavel Koshevoy.
  26. * https://sourceforge.net/projects/apprenticevideo/
  27. *
  28. * An explanation of SOLA algorithm is available at
  29. * http://www.surina.net/article/time-and-pitch-scaling.html
  30. *
  31. * WSOLA is very similar to SOLA, only one major difference exists between
  32. * these algorithms. SOLA shifts audio fragments along the output stream,
  33. * where as WSOLA shifts audio fragments along the input stream.
  34. *
  35. * The advantage of WSOLA algorithm is that the overlap region size is
  36. * always the same, therefore the blending function is constant and
  37. * can be precomputed.
  38. */
  39. #include <float.h>
  40. #include "libavcodec/avfft.h"
  41. #include "libavutil/avassert.h"
  42. #include "libavutil/avstring.h"
  43. #include "libavutil/channel_layout.h"
  44. #include "libavutil/eval.h"
  45. #include "libavutil/opt.h"
  46. #include "libavutil/samplefmt.h"
  47. #include "avfilter.h"
  48. #include "audio.h"
  49. #include "internal.h"
  50. /**
  51. * A fragment of audio waveform
  52. */
  53. typedef struct AudioFragment {
  54. // index of the first sample of this fragment in the overall waveform;
  55. // 0: input sample position
  56. // 1: output sample position
  57. int64_t position[2];
  58. // original packed multi-channel samples:
  59. uint8_t *data;
  60. // number of samples in this fragment:
  61. int nsamples;
  62. // rDFT transform of the down-mixed mono fragment, used for
  63. // fast waveform alignment via correlation in frequency domain:
  64. FFTSample *xdat;
  65. } AudioFragment;
  66. /**
  67. * Filter state machine states
  68. */
  69. typedef enum {
  70. YAE_LOAD_FRAGMENT,
  71. YAE_ADJUST_POSITION,
  72. YAE_RELOAD_FRAGMENT,
  73. YAE_OUTPUT_OVERLAP_ADD,
  74. YAE_FLUSH_OUTPUT,
  75. } FilterState;
  76. /**
  77. * Filter state machine
  78. */
  79. typedef struct ATempoContext {
  80. const AVClass *class;
  81. // ring-buffer of input samples, necessary because some times
  82. // input fragment position may be adjusted backwards:
  83. uint8_t *buffer;
  84. // ring-buffer maximum capacity, expressed in sample rate time base:
  85. int ring;
  86. // ring-buffer house keeping:
  87. int size;
  88. int head;
  89. int tail;
  90. // 0: input sample position corresponding to the ring buffer tail
  91. // 1: output sample position
  92. int64_t position[2];
  93. // first input timestamp, all other timestamps are offset by this one
  94. int64_t start_pts;
  95. // sample format:
  96. enum AVSampleFormat format;
  97. // number of channels:
  98. int channels;
  99. // row of bytes to skip from one sample to next, across multple channels;
  100. // stride = (number-of-channels * bits-per-sample-per-channel) / 8
  101. int stride;
  102. // fragment window size, power-of-two integer:
  103. int window;
  104. // Hann window coefficients, for feathering
  105. // (blending) the overlapping fragment region:
  106. float *hann;
  107. // tempo scaling factor:
  108. double tempo;
  109. // a snapshot of previous fragment input and output position values
  110. // captured when the tempo scale factor was set most recently:
  111. int64_t origin[2];
  112. // current/previous fragment ring-buffer:
  113. AudioFragment frag[2];
  114. // current fragment index:
  115. uint64_t nfrag;
  116. // current state:
  117. FilterState state;
  118. // for fast correlation calculation in frequency domain:
  119. RDFTContext *real_to_complex;
  120. RDFTContext *complex_to_real;
  121. FFTSample *correlation;
  122. // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame
  123. AVFrame *dst_buffer;
  124. uint8_t *dst;
  125. uint8_t *dst_end;
  126. uint64_t nsamples_in;
  127. uint64_t nsamples_out;
  128. } ATempoContext;
  129. #define YAE_ATEMPO_MIN 0.5
  130. #define YAE_ATEMPO_MAX 100.0
  131. #define OFFSET(x) offsetof(ATempoContext, x)
  132. static const AVOption atempo_options[] = {
  133. { "tempo", "set tempo scale factor",
  134. OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 },
  135. YAE_ATEMPO_MIN,
  136. YAE_ATEMPO_MAX,
  137. AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM },
  138. { NULL }
  139. };
  140. AVFILTER_DEFINE_CLASS(atempo);
  141. inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
  142. {
  143. return &atempo->frag[atempo->nfrag % 2];
  144. }
  145. inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
  146. {
  147. return &atempo->frag[(atempo->nfrag + 1) % 2];
  148. }
  149. /**
  150. * Reset filter to initial state, do not deallocate existing local buffers.
  151. */
  152. static void yae_clear(ATempoContext *atempo)
  153. {
  154. atempo->size = 0;
  155. atempo->head = 0;
  156. atempo->tail = 0;
  157. atempo->nfrag = 0;
  158. atempo->state = YAE_LOAD_FRAGMENT;
  159. atempo->start_pts = AV_NOPTS_VALUE;
  160. atempo->position[0] = 0;
  161. atempo->position[1] = 0;
  162. atempo->origin[0] = 0;
  163. atempo->origin[1] = 0;
  164. atempo->frag[0].position[0] = 0;
  165. atempo->frag[0].position[1] = 0;
  166. atempo->frag[0].nsamples = 0;
  167. atempo->frag[1].position[0] = 0;
  168. atempo->frag[1].position[1] = 0;
  169. atempo->frag[1].nsamples = 0;
  170. // shift left position of 1st fragment by half a window
  171. // so that no re-normalization would be required for
  172. // the left half of the 1st fragment:
  173. atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
  174. atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);
  175. av_frame_free(&atempo->dst_buffer);
  176. atempo->dst = NULL;
  177. atempo->dst_end = NULL;
  178. atempo->nsamples_in = 0;
  179. atempo->nsamples_out = 0;
  180. }
  181. /**
  182. * Reset filter to initial state and deallocate all buffers.
  183. */
  184. static void yae_release_buffers(ATempoContext *atempo)
  185. {
  186. yae_clear(atempo);
  187. av_freep(&atempo->frag[0].data);
  188. av_freep(&atempo->frag[1].data);
  189. av_freep(&atempo->frag[0].xdat);
  190. av_freep(&atempo->frag[1].xdat);
  191. av_freep(&atempo->buffer);
  192. av_freep(&atempo->hann);
  193. av_freep(&atempo->correlation);
  194. av_rdft_end(atempo->real_to_complex);
  195. atempo->real_to_complex = NULL;
  196. av_rdft_end(atempo->complex_to_real);
  197. atempo->complex_to_real = NULL;
  198. }
  199. /* av_realloc is not aligned enough; fortunately, the data does not need to
  200. * be preserved */
  201. #define RE_MALLOC_OR_FAIL(field, field_size) \
  202. do { \
  203. av_freep(&field); \
  204. field = av_malloc(field_size); \
  205. if (!field) { \
  206. yae_release_buffers(atempo); \
  207. return AVERROR(ENOMEM); \
  208. } \
  209. } while (0)
  210. /**
  211. * Prepare filter for processing audio data of given format,
  212. * sample rate and number of channels.
  213. */
  214. static int yae_reset(ATempoContext *atempo,
  215. enum AVSampleFormat format,
  216. int sample_rate,
  217. int channels)
  218. {
  219. const int sample_size = av_get_bytes_per_sample(format);
  220. uint32_t nlevels = 0;
  221. uint32_t pot;
  222. int i;
  223. atempo->format = format;
  224. atempo->channels = channels;
  225. atempo->stride = sample_size * channels;
  226. // pick a segment window size:
  227. atempo->window = sample_rate / 24;
  228. // adjust window size to be a power-of-two integer:
  229. nlevels = av_log2(atempo->window);
  230. pot = 1 << nlevels;
  231. av_assert0(pot <= atempo->window);
  232. if (pot < atempo->window) {
  233. atempo->window = pot * 2;
  234. nlevels++;
  235. }
  236. // initialize audio fragment buffers:
  237. RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
  238. RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
  239. RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
  240. RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));
  241. // initialize rDFT contexts:
  242. av_rdft_end(atempo->real_to_complex);
  243. atempo->real_to_complex = NULL;
  244. av_rdft_end(atempo->complex_to_real);
  245. atempo->complex_to_real = NULL;
  246. atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
  247. if (!atempo->real_to_complex) {
  248. yae_release_buffers(atempo);
  249. return AVERROR(ENOMEM);
  250. }
  251. atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
  252. if (!atempo->complex_to_real) {
  253. yae_release_buffers(atempo);
  254. return AVERROR(ENOMEM);
  255. }
  256. RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));
  257. atempo->ring = atempo->window * 3;
  258. RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);
  259. // initialize the Hann window function:
  260. RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
  261. for (i = 0; i < atempo->window; i++) {
  262. double t = (double)i / (double)(atempo->window - 1);
  263. double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
  264. atempo->hann[i] = (float)h;
  265. }
  266. yae_clear(atempo);
  267. return 0;
  268. }
  269. static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
  270. {
  271. const AudioFragment *prev;
  272. ATempoContext *atempo = ctx->priv;
  273. char *tail = NULL;
  274. double tempo = av_strtod(arg_tempo, &tail);
  275. if (tail && *tail) {
  276. av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
  277. return AVERROR(EINVAL);
  278. }
  279. if (tempo < YAE_ATEMPO_MIN || tempo > YAE_ATEMPO_MAX) {
  280. av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [%f, %f] range\n",
  281. tempo, YAE_ATEMPO_MIN, YAE_ATEMPO_MAX);
  282. return AVERROR(EINVAL);
  283. }
  284. prev = yae_prev_frag(atempo);
  285. atempo->origin[0] = prev->position[0] + atempo->window / 2;
  286. atempo->origin[1] = prev->position[1] + atempo->window / 2;
  287. atempo->tempo = tempo;
  288. return 0;
  289. }
  290. /**
  291. * A helper macro for initializing complex data buffer with scalar data
  292. * of a given type.
  293. */
  294. #define yae_init_xdat(scalar_type, scalar_max) \
  295. do { \
  296. const uint8_t *src_end = src + \
  297. frag->nsamples * atempo->channels * sizeof(scalar_type); \
  298. \
  299. FFTSample *xdat = frag->xdat; \
  300. scalar_type tmp; \
  301. \
  302. if (atempo->channels == 1) { \
  303. for (; src < src_end; xdat++) { \
  304. tmp = *(const scalar_type *)src; \
  305. src += sizeof(scalar_type); \
  306. \
  307. *xdat = (FFTSample)tmp; \
  308. } \
  309. } else { \
  310. FFTSample s, max, ti, si; \
  311. int i; \
  312. \
  313. for (; src < src_end; xdat++) { \
  314. tmp = *(const scalar_type *)src; \
  315. src += sizeof(scalar_type); \
  316. \
  317. max = (FFTSample)tmp; \
  318. s = FFMIN((FFTSample)scalar_max, \
  319. (FFTSample)fabsf(max)); \
  320. \
  321. for (i = 1; i < atempo->channels; i++) { \
  322. tmp = *(const scalar_type *)src; \
  323. src += sizeof(scalar_type); \
  324. \
  325. ti = (FFTSample)tmp; \
  326. si = FFMIN((FFTSample)scalar_max, \
  327. (FFTSample)fabsf(ti)); \
  328. \
  329. if (s < si) { \
  330. s = si; \
  331. max = ti; \
  332. } \
  333. } \
  334. \
  335. *xdat = max; \
  336. } \
  337. } \
  338. } while (0)
  339. /**
  340. * Initialize complex data buffer of a given audio fragment
  341. * with down-mixed mono data of appropriate scalar type.
  342. */
  343. static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
  344. {
  345. // shortcuts:
  346. const uint8_t *src = frag->data;
  347. // init complex data buffer used for FFT and Correlation:
  348. memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);
  349. if (atempo->format == AV_SAMPLE_FMT_U8) {
  350. yae_init_xdat(uint8_t, 127);
  351. } else if (atempo->format == AV_SAMPLE_FMT_S16) {
  352. yae_init_xdat(int16_t, 32767);
  353. } else if (atempo->format == AV_SAMPLE_FMT_S32) {
  354. yae_init_xdat(int, 2147483647);
  355. } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
  356. yae_init_xdat(float, 1);
  357. } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
  358. yae_init_xdat(double, 1);
  359. }
  360. }
  361. /**
  362. * Populate the internal data buffer on as-needed basis.
  363. *
  364. * @return
  365. * 0 if requested data was already available or was successfully loaded,
  366. * AVERROR(EAGAIN) if more input data is required.
  367. */
  368. static int yae_load_data(ATempoContext *atempo,
  369. const uint8_t **src_ref,
  370. const uint8_t *src_end,
  371. int64_t stop_here)
  372. {
  373. // shortcut:
  374. const uint8_t *src = *src_ref;
  375. const int read_size = stop_here - atempo->position[0];
  376. if (stop_here <= atempo->position[0]) {
  377. return 0;
  378. }
  379. // samples are not expected to be skipped, unless tempo is greater than 2:
  380. av_assert0(read_size <= atempo->ring || atempo->tempo > 2.0);
  381. while (atempo->position[0] < stop_here && src < src_end) {
  382. int src_samples = (src_end - src) / atempo->stride;
  383. // load data piece-wise, in order to avoid complicating the logic:
  384. int nsamples = FFMIN(read_size, src_samples);
  385. int na;
  386. int nb;
  387. nsamples = FFMIN(nsamples, atempo->ring);
  388. na = FFMIN(nsamples, atempo->ring - atempo->tail);
  389. nb = FFMIN(nsamples - na, atempo->ring);
  390. if (na) {
  391. uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
  392. memcpy(a, src, na * atempo->stride);
  393. src += na * atempo->stride;
  394. atempo->position[0] += na;
  395. atempo->size = FFMIN(atempo->size + na, atempo->ring);
  396. atempo->tail = (atempo->tail + na) % atempo->ring;
  397. atempo->head =
  398. atempo->size < atempo->ring ?
  399. atempo->tail - atempo->size :
  400. atempo->tail;
  401. }
  402. if (nb) {
  403. uint8_t *b = atempo->buffer;
  404. memcpy(b, src, nb * atempo->stride);
  405. src += nb * atempo->stride;
  406. atempo->position[0] += nb;
  407. atempo->size = FFMIN(atempo->size + nb, atempo->ring);
  408. atempo->tail = (atempo->tail + nb) % atempo->ring;
  409. atempo->head =
  410. atempo->size < atempo->ring ?
  411. atempo->tail - atempo->size :
  412. atempo->tail;
  413. }
  414. }
  415. // pass back the updated source buffer pointer:
  416. *src_ref = src;
  417. // sanity check:
  418. av_assert0(atempo->position[0] <= stop_here);
  419. return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
  420. }
  421. /**
  422. * Populate current audio fragment data buffer.
  423. *
  424. * @return
  425. * 0 when the fragment is ready,
  426. * AVERROR(EAGAIN) if more input data is required.
  427. */
  428. static int yae_load_frag(ATempoContext *atempo,
  429. const uint8_t **src_ref,
  430. const uint8_t *src_end)
  431. {
  432. // shortcuts:
  433. AudioFragment *frag = yae_curr_frag(atempo);
  434. uint8_t *dst;
  435. int64_t missing, start, zeros;
  436. uint32_t nsamples;
  437. const uint8_t *a, *b;
  438. int i0, i1, n0, n1, na, nb;
  439. int64_t stop_here = frag->position[0] + atempo->window;
  440. if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
  441. return AVERROR(EAGAIN);
  442. }
  443. // calculate the number of samples we don't have:
  444. missing =
  445. stop_here > atempo->position[0] ?
  446. stop_here - atempo->position[0] : 0;
  447. nsamples =
  448. missing < (int64_t)atempo->window ?
  449. (uint32_t)(atempo->window - missing) : 0;
  450. // setup the output buffer:
  451. frag->nsamples = nsamples;
  452. dst = frag->data;
  453. start = atempo->position[0] - atempo->size;
  454. zeros = 0;
  455. if (frag->position[0] < start) {
  456. // what we don't have we substitute with zeros:
  457. zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
  458. av_assert0(zeros != nsamples);
  459. memset(dst, 0, zeros * atempo->stride);
  460. dst += zeros * atempo->stride;
  461. }
  462. if (zeros == nsamples) {
  463. return 0;
  464. }
  465. // get the remaining data from the ring buffer:
  466. na = (atempo->head < atempo->tail ?
  467. atempo->tail - atempo->head :
  468. atempo->ring - atempo->head);
  469. nb = atempo->head < atempo->tail ? 0 : atempo->tail;
  470. // sanity check:
  471. av_assert0(nsamples <= zeros + na + nb);
  472. a = atempo->buffer + atempo->head * atempo->stride;
  473. b = atempo->buffer;
  474. i0 = frag->position[0] + zeros - start;
  475. i1 = i0 < na ? 0 : i0 - na;
  476. n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
  477. n1 = nsamples - zeros - n0;
  478. if (n0) {
  479. memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
  480. dst += n0 * atempo->stride;
  481. }
  482. if (n1) {
  483. memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
  484. }
  485. return 0;
  486. }
  487. /**
  488. * Prepare for loading next audio fragment.
  489. */
  490. static void yae_advance_to_next_frag(ATempoContext *atempo)
  491. {
  492. const double fragment_step = atempo->tempo * (double)(atempo->window / 2);
  493. const AudioFragment *prev;
  494. AudioFragment *frag;
  495. atempo->nfrag++;
  496. prev = yae_prev_frag(atempo);
  497. frag = yae_curr_frag(atempo);
  498. frag->position[0] = prev->position[0] + (int64_t)fragment_step;
  499. frag->position[1] = prev->position[1] + atempo->window / 2;
  500. frag->nsamples = 0;
  501. }
  502. /**
  503. * Calculate cross-correlation via rDFT.
  504. *
  505. * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
  506. * and transform back via complex_to_real rDFT.
  507. */
  508. static void yae_xcorr_via_rdft(FFTSample *xcorr,
  509. RDFTContext *complex_to_real,
  510. const FFTComplex *xa,
  511. const FFTComplex *xb,
  512. const int window)
  513. {
  514. FFTComplex *xc = (FFTComplex *)xcorr;
  515. int i;
  516. // NOTE: first element requires special care -- Given Y = rDFT(X),
  517. // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
  518. // stores Re(Y[N/2]) in place of Im(Y[0]).
  519. xc->re = xa->re * xb->re;
  520. xc->im = xa->im * xb->im;
  521. xa++;
  522. xb++;
  523. xc++;
  524. for (i = 1; i < window; i++, xa++, xb++, xc++) {
  525. xc->re = (xa->re * xb->re + xa->im * xb->im);
  526. xc->im = (xa->im * xb->re - xa->re * xb->im);
  527. }
  528. // apply inverse rDFT:
  529. av_rdft_calc(complex_to_real, xcorr);
  530. }
  531. /**
  532. * Calculate alignment offset for given fragment
  533. * relative to the previous fragment.
  534. *
  535. * @return alignment offset of current fragment relative to previous.
  536. */
  537. static int yae_align(AudioFragment *frag,
  538. const AudioFragment *prev,
  539. const int window,
  540. const int delta_max,
  541. const int drift,
  542. FFTSample *correlation,
  543. RDFTContext *complex_to_real)
  544. {
  545. int best_offset = -drift;
  546. FFTSample best_metric = -FLT_MAX;
  547. FFTSample *xcorr;
  548. int i0;
  549. int i1;
  550. int i;
  551. yae_xcorr_via_rdft(correlation,
  552. complex_to_real,
  553. (const FFTComplex *)prev->xdat,
  554. (const FFTComplex *)frag->xdat,
  555. window);
  556. // identify search window boundaries:
  557. i0 = FFMAX(window / 2 - delta_max - drift, 0);
  558. i0 = FFMIN(i0, window);
  559. i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
  560. i1 = FFMAX(i1, 0);
  561. // identify cross-correlation peaks within search window:
  562. xcorr = correlation + i0;
  563. for (i = i0; i < i1; i++, xcorr++) {
  564. FFTSample metric = *xcorr;
  565. // normalize:
  566. FFTSample drifti = (FFTSample)(drift + i);
  567. metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);
  568. if (metric > best_metric) {
  569. best_metric = metric;
  570. best_offset = i - window / 2;
  571. }
  572. }
  573. return best_offset;
  574. }
  575. /**
  576. * Adjust current fragment position for better alignment
  577. * with previous fragment.
  578. *
  579. * @return alignment correction.
  580. */
  581. static int yae_adjust_position(ATempoContext *atempo)
  582. {
  583. const AudioFragment *prev = yae_prev_frag(atempo);
  584. AudioFragment *frag = yae_curr_frag(atempo);
  585. const double prev_output_position =
  586. (double)(prev->position[1] - atempo->origin[1] + atempo->window / 2) *
  587. atempo->tempo;
  588. const double ideal_output_position =
  589. (double)(prev->position[0] - atempo->origin[0] + atempo->window / 2);
  590. const int drift = (int)(prev_output_position - ideal_output_position);
  591. const int delta_max = atempo->window / 2;
  592. const int correction = yae_align(frag,
  593. prev,
  594. atempo->window,
  595. delta_max,
  596. drift,
  597. atempo->correlation,
  598. atempo->complex_to_real);
  599. if (correction) {
  600. // adjust fragment position:
  601. frag->position[0] -= correction;
  602. // clear so that the fragment can be reloaded:
  603. frag->nsamples = 0;
  604. }
  605. return correction;
  606. }
  607. /**
  608. * A helper macro for blending the overlap region of previous
  609. * and current audio fragment.
  610. */
  611. #define yae_blend(scalar_type) \
  612. do { \
  613. const scalar_type *aaa = (const scalar_type *)a; \
  614. const scalar_type *bbb = (const scalar_type *)b; \
  615. \
  616. scalar_type *out = (scalar_type *)dst; \
  617. scalar_type *out_end = (scalar_type *)dst_end; \
  618. int64_t i; \
  619. \
  620. for (i = 0; i < overlap && out < out_end; \
  621. i++, atempo->position[1]++, wa++, wb++) { \
  622. float w0 = *wa; \
  623. float w1 = *wb; \
  624. int j; \
  625. \
  626. for (j = 0; j < atempo->channels; \
  627. j++, aaa++, bbb++, out++) { \
  628. float t0 = (float)*aaa; \
  629. float t1 = (float)*bbb; \
  630. \
  631. *out = \
  632. frag->position[0] + i < 0 ? \
  633. *aaa : \
  634. (scalar_type)(t0 * w0 + t1 * w1); \
  635. } \
  636. } \
  637. dst = (uint8_t *)out; \
  638. } while (0)
  639. /**
  640. * Blend the overlap region of previous and current audio fragment
  641. * and output the results to the given destination buffer.
  642. *
  643. * @return
  644. * 0 if the overlap region was completely stored in the dst buffer,
  645. * AVERROR(EAGAIN) if more destination buffer space is required.
  646. */
  647. static int yae_overlap_add(ATempoContext *atempo,
  648. uint8_t **dst_ref,
  649. uint8_t *dst_end)
  650. {
  651. // shortcuts:
  652. const AudioFragment *prev = yae_prev_frag(atempo);
  653. const AudioFragment *frag = yae_curr_frag(atempo);
  654. const int64_t start_here = FFMAX(atempo->position[1],
  655. frag->position[1]);
  656. const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
  657. frag->position[1] + frag->nsamples);
  658. const int64_t overlap = stop_here - start_here;
  659. const int64_t ia = start_here - prev->position[1];
  660. const int64_t ib = start_here - frag->position[1];
  661. const float *wa = atempo->hann + ia;
  662. const float *wb = atempo->hann + ib;
  663. const uint8_t *a = prev->data + ia * atempo->stride;
  664. const uint8_t *b = frag->data + ib * atempo->stride;
  665. uint8_t *dst = *dst_ref;
  666. av_assert0(start_here <= stop_here &&
  667. frag->position[1] <= start_here &&
  668. overlap <= frag->nsamples);
  669. if (atempo->format == AV_SAMPLE_FMT_U8) {
  670. yae_blend(uint8_t);
  671. } else if (atempo->format == AV_SAMPLE_FMT_S16) {
  672. yae_blend(int16_t);
  673. } else if (atempo->format == AV_SAMPLE_FMT_S32) {
  674. yae_blend(int);
  675. } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
  676. yae_blend(float);
  677. } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
  678. yae_blend(double);
  679. }
  680. // pass-back the updated destination buffer pointer:
  681. *dst_ref = dst;
  682. return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
  683. }
  684. /**
  685. * Feed as much data to the filter as it is able to consume
  686. * and receive as much processed data in the destination buffer
  687. * as it is able to produce or store.
  688. */
  689. static void
  690. yae_apply(ATempoContext *atempo,
  691. const uint8_t **src_ref,
  692. const uint8_t *src_end,
  693. uint8_t **dst_ref,
  694. uint8_t *dst_end)
  695. {
  696. while (1) {
  697. if (atempo->state == YAE_LOAD_FRAGMENT) {
  698. // load additional data for the current fragment:
  699. if (yae_load_frag(atempo, src_ref, src_end) != 0) {
  700. break;
  701. }
  702. // down-mix to mono:
  703. yae_downmix(atempo, yae_curr_frag(atempo));
  704. // apply rDFT:
  705. av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
  706. // must load the second fragment before alignment can start:
  707. if (!atempo->nfrag) {
  708. yae_advance_to_next_frag(atempo);
  709. continue;
  710. }
  711. atempo->state = YAE_ADJUST_POSITION;
  712. }
  713. if (atempo->state == YAE_ADJUST_POSITION) {
  714. // adjust position for better alignment:
  715. if (yae_adjust_position(atempo)) {
  716. // reload the fragment at the corrected position, so that the
  717. // Hann window blending would not require normalization:
  718. atempo->state = YAE_RELOAD_FRAGMENT;
  719. } else {
  720. atempo->state = YAE_OUTPUT_OVERLAP_ADD;
  721. }
  722. }
  723. if (atempo->state == YAE_RELOAD_FRAGMENT) {
  724. // load additional data if necessary due to position adjustment:
  725. if (yae_load_frag(atempo, src_ref, src_end) != 0) {
  726. break;
  727. }
  728. // down-mix to mono:
  729. yae_downmix(atempo, yae_curr_frag(atempo));
  730. // apply rDFT:
  731. av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
  732. atempo->state = YAE_OUTPUT_OVERLAP_ADD;
  733. }
  734. if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
  735. // overlap-add and output the result:
  736. if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
  737. break;
  738. }
  739. // advance to the next fragment, repeat:
  740. yae_advance_to_next_frag(atempo);
  741. atempo->state = YAE_LOAD_FRAGMENT;
  742. }
  743. }
  744. }
  745. /**
  746. * Flush any buffered data from the filter.
  747. *
  748. * @return
  749. * 0 if all data was completely stored in the dst buffer,
  750. * AVERROR(EAGAIN) if more destination buffer space is required.
  751. */
  752. static int yae_flush(ATempoContext *atempo,
  753. uint8_t **dst_ref,
  754. uint8_t *dst_end)
  755. {
  756. AudioFragment *frag = yae_curr_frag(atempo);
  757. int64_t overlap_end;
  758. int64_t start_here;
  759. int64_t stop_here;
  760. int64_t offset;
  761. const uint8_t *src;
  762. uint8_t *dst;
  763. int src_size;
  764. int dst_size;
  765. int nbytes;
  766. atempo->state = YAE_FLUSH_OUTPUT;
  767. if (!atempo->nfrag) {
  768. // there is nothing to flush:
  769. return 0;
  770. }
  771. if (atempo->position[0] == frag->position[0] + frag->nsamples &&
  772. atempo->position[1] == frag->position[1] + frag->nsamples) {
  773. // the current fragment is already flushed:
  774. return 0;
  775. }
  776. if (frag->position[0] + frag->nsamples < atempo->position[0]) {
  777. // finish loading the current (possibly partial) fragment:
  778. yae_load_frag(atempo, NULL, NULL);
  779. if (atempo->nfrag) {
  780. // down-mix to mono:
  781. yae_downmix(atempo, frag);
  782. // apply rDFT:
  783. av_rdft_calc(atempo->real_to_complex, frag->xdat);
  784. // align current fragment to previous fragment:
  785. if (yae_adjust_position(atempo)) {
  786. // reload the current fragment due to adjusted position:
  787. yae_load_frag(atempo, NULL, NULL);
  788. }
  789. }
  790. }
  791. // flush the overlap region:
  792. overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
  793. frag->nsamples);
  794. while (atempo->position[1] < overlap_end) {
  795. if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
  796. return AVERROR(EAGAIN);
  797. }
  798. }
  799. // check whether all of the input samples have been consumed:
  800. if (frag->position[0] + frag->nsamples < atempo->position[0]) {
  801. yae_advance_to_next_frag(atempo);
  802. return AVERROR(EAGAIN);
  803. }
  804. // flush the remainder of the current fragment:
  805. start_here = FFMAX(atempo->position[1], overlap_end);
  806. stop_here = frag->position[1] + frag->nsamples;
  807. offset = start_here - frag->position[1];
  808. av_assert0(start_here <= stop_here && frag->position[1] <= start_here);
  809. src = frag->data + offset * atempo->stride;
  810. dst = (uint8_t *)*dst_ref;
  811. src_size = (int)(stop_here - start_here) * atempo->stride;
  812. dst_size = dst_end - dst;
  813. nbytes = FFMIN(src_size, dst_size);
  814. memcpy(dst, src, nbytes);
  815. dst += nbytes;
  816. atempo->position[1] += (nbytes / atempo->stride);
  817. // pass-back the updated destination buffer pointer:
  818. *dst_ref = (uint8_t *)dst;
  819. return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
  820. }
  821. static av_cold int init(AVFilterContext *ctx)
  822. {
  823. ATempoContext *atempo = ctx->priv;
  824. atempo->format = AV_SAMPLE_FMT_NONE;
  825. atempo->state = YAE_LOAD_FRAGMENT;
  826. return 0;
  827. }
  828. static av_cold void uninit(AVFilterContext *ctx)
  829. {
  830. ATempoContext *atempo = ctx->priv;
  831. yae_release_buffers(atempo);
  832. }
  833. static int query_formats(AVFilterContext *ctx)
  834. {
  835. AVFilterChannelLayouts *layouts = NULL;
  836. AVFilterFormats *formats = NULL;
  837. // WSOLA necessitates an internal sliding window ring buffer
  838. // for incoming audio stream.
  839. //
  840. // Planar sample formats are too cumbersome to store in a ring buffer,
  841. // therefore planar sample formats are not supported.
  842. //
  843. static const enum AVSampleFormat sample_fmts[] = {
  844. AV_SAMPLE_FMT_U8,
  845. AV_SAMPLE_FMT_S16,
  846. AV_SAMPLE_FMT_S32,
  847. AV_SAMPLE_FMT_FLT,
  848. AV_SAMPLE_FMT_DBL,
  849. AV_SAMPLE_FMT_NONE
  850. };
  851. int ret;
  852. layouts = ff_all_channel_counts();
  853. if (!layouts) {
  854. return AVERROR(ENOMEM);
  855. }
  856. ret = ff_set_common_channel_layouts(ctx, layouts);
  857. if (ret < 0)
  858. return ret;
  859. formats = ff_make_format_list(sample_fmts);
  860. if (!formats) {
  861. return AVERROR(ENOMEM);
  862. }
  863. ret = ff_set_common_formats(ctx, formats);
  864. if (ret < 0)
  865. return ret;
  866. formats = ff_all_samplerates();
  867. if (!formats) {
  868. return AVERROR(ENOMEM);
  869. }
  870. return ff_set_common_samplerates(ctx, formats);
  871. }
  872. static int config_props(AVFilterLink *inlink)
  873. {
  874. AVFilterContext *ctx = inlink->dst;
  875. ATempoContext *atempo = ctx->priv;
  876. enum AVSampleFormat format = inlink->format;
  877. int sample_rate = (int)inlink->sample_rate;
  878. return yae_reset(atempo, format, sample_rate, inlink->channels);
  879. }
  880. static int push_samples(ATempoContext *atempo,
  881. AVFilterLink *outlink,
  882. int n_out)
  883. {
  884. int ret;
  885. atempo->dst_buffer->sample_rate = outlink->sample_rate;
  886. atempo->dst_buffer->nb_samples = n_out;
  887. // adjust the PTS:
  888. atempo->dst_buffer->pts = atempo->start_pts +
  889. av_rescale_q(atempo->nsamples_out,
  890. (AVRational){ 1, outlink->sample_rate },
  891. outlink->time_base);
  892. ret = ff_filter_frame(outlink, atempo->dst_buffer);
  893. atempo->dst_buffer = NULL;
  894. atempo->dst = NULL;
  895. atempo->dst_end = NULL;
  896. if (ret < 0)
  897. return ret;
  898. atempo->nsamples_out += n_out;
  899. return 0;
  900. }
  901. static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer)
  902. {
  903. AVFilterContext *ctx = inlink->dst;
  904. ATempoContext *atempo = ctx->priv;
  905. AVFilterLink *outlink = ctx->outputs[0];
  906. int ret = 0;
  907. int n_in = src_buffer->nb_samples;
  908. int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);
  909. const uint8_t *src = src_buffer->data[0];
  910. const uint8_t *src_end = src + n_in * atempo->stride;
  911. if (atempo->start_pts == AV_NOPTS_VALUE)
  912. atempo->start_pts = av_rescale_q(src_buffer->pts,
  913. inlink->time_base,
  914. outlink->time_base);
  915. while (src < src_end) {
  916. if (!atempo->dst_buffer) {
  917. atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out);
  918. if (!atempo->dst_buffer) {
  919. av_frame_free(&src_buffer);
  920. return AVERROR(ENOMEM);
  921. }
  922. av_frame_copy_props(atempo->dst_buffer, src_buffer);
  923. atempo->dst = atempo->dst_buffer->data[0];
  924. atempo->dst_end = atempo->dst + n_out * atempo->stride;
  925. }
  926. yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);
  927. if (atempo->dst == atempo->dst_end) {
  928. int n_samples = ((atempo->dst - atempo->dst_buffer->data[0]) /
  929. atempo->stride);
  930. ret = push_samples(atempo, outlink, n_samples);
  931. if (ret < 0)
  932. goto end;
  933. }
  934. }
  935. atempo->nsamples_in += n_in;
  936. end:
  937. av_frame_free(&src_buffer);
  938. return ret;
  939. }
  940. static int request_frame(AVFilterLink *outlink)
  941. {
  942. AVFilterContext *ctx = outlink->src;
  943. ATempoContext *atempo = ctx->priv;
  944. int ret;
  945. ret = ff_request_frame(ctx->inputs[0]);
  946. if (ret == AVERROR_EOF) {
  947. // flush the filter:
  948. int n_max = atempo->ring;
  949. int n_out;
  950. int err = AVERROR(EAGAIN);
  951. while (err == AVERROR(EAGAIN)) {
  952. if (!atempo->dst_buffer) {
  953. atempo->dst_buffer = ff_get_audio_buffer(outlink, n_max);
  954. if (!atempo->dst_buffer)
  955. return AVERROR(ENOMEM);
  956. atempo->dst = atempo->dst_buffer->data[0];
  957. atempo->dst_end = atempo->dst + n_max * atempo->stride;
  958. }
  959. err = yae_flush(atempo, &atempo->dst, atempo->dst_end);
  960. n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
  961. atempo->stride);
  962. if (n_out) {
  963. ret = push_samples(atempo, outlink, n_out);
  964. if (ret < 0)
  965. return ret;
  966. }
  967. }
  968. av_frame_free(&atempo->dst_buffer);
  969. atempo->dst = NULL;
  970. atempo->dst_end = NULL;
  971. return AVERROR_EOF;
  972. }
  973. return ret;
  974. }
  975. static int process_command(AVFilterContext *ctx,
  976. const char *cmd,
  977. const char *arg,
  978. char *res,
  979. int res_len,
  980. int flags)
  981. {
  982. return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
  983. }
  984. static const AVFilterPad atempo_inputs[] = {
  985. {
  986. .name = "default",
  987. .type = AVMEDIA_TYPE_AUDIO,
  988. .filter_frame = filter_frame,
  989. .config_props = config_props,
  990. },
  991. { NULL }
  992. };
  993. static const AVFilterPad atempo_outputs[] = {
  994. {
  995. .name = "default",
  996. .request_frame = request_frame,
  997. .type = AVMEDIA_TYPE_AUDIO,
  998. },
  999. { NULL }
  1000. };
  1001. AVFilter ff_af_atempo = {
  1002. .name = "atempo",
  1003. .description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
  1004. .init = init,
  1005. .uninit = uninit,
  1006. .query_formats = query_formats,
  1007. .process_command = process_command,
  1008. .priv_size = sizeof(ATempoContext),
  1009. .priv_class = &atempo_class,
  1010. .inputs = atempo_inputs,
  1011. .outputs = atempo_outputs,
  1012. };