wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
28 #define UNCHECKED_BITSTREAM_READER 1
29 
30 #include <math.h>
31 
33 #include "libavutil/mem.h"
34 #include "dsputil.h"
35 #include "avcodec.h"
36 #include "internal.h"
37 #include "get_bits.h"
38 #include "put_bits.h"
39 #include "wmavoice_data.h"
40 #include "celp_filters.h"
41 #include "acelp_vectors.h"
42 #include "acelp_filters.h"
43 #include "lsp.h"
44 #include "dct.h"
45 #include "rdft.h"
46 #include "sinewin.h"
47 
48 #define MAX_BLOCKS 8
49 #define MAX_LSPS 16
50 #define MAX_LSPS_ALIGN16 16
51 #define MAX_FRAMES 3
53 #define MAX_FRAMESIZE 160
54 #define MAX_SIGNAL_HISTORY 416
55 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
56 #define SFRAME_CACHE_MAXSIZE 256
58 #define VLC_NBITS 6
60 
61 
65 
69 enum {
72  ACB_TYPE_HAMMING = 2
77 };
80 
84 enum {
86  FCB_TYPE_HARDCODED = 1,
89  FCB_TYPE_AW_PULSES = 2,
93 };
96 
100 static const struct frame_type_desc {
107  uint16_t frame_size;
110 } frame_descs[17] = {
112  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
113  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
114  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
115  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
116  { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
117  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
118  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
119  { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
120  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
121  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
122  { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
123  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
124  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
125  { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
126  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
127  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
128  { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
129 };
130 
134 typedef struct {
141  int8_t vbm_tree[25];
145 
147  int history_nsamples;
150 
152  /* postfilter specific values */
153  int do_apf;
154  int denoise_strength;
156  int denoise_tilt_corr;
158  int dc_level;
160 
162  int lsps;
165  int frame_lsp_bitsize;
167  int sframe_lsp_bitsize;
169 
174  int block_pitch_nbits;
176  int block_pitch_range;
179  int block_delta_pitch_hrange;
183  uint16_t block_conv_table[4];
185 
197  int has_residual_lsps;
201  int skip_bits_next;
206 
213  PutBitContext pb;
218 
228  double prev_lsps[MAX_LSPS];
229  int last_pitch_val;
233  float silence_gain;
235 
237  int aw_pulse_range;
239  int aw_n_pulses[2];
245  int aw_first_pulse_off[2];
248  int aw_next_pulse_off_cache;
250 
256  float gain_pred_err[6];
258  float excitation_history[MAX_SIGNAL_HISTORY];
262  float synth_history[MAX_LSPS];
263 
272  RDFTContext rdft, irdft;
273  DCTContext dct, dst;
275  float sin[511], cos[511];
277  float postfilter_agc;
279  float dcf_mem[2];
281  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
284  float denoise_filter_cache[MAX_FRAMESIZE];
286  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
288  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
290  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
293 
297 
307 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
308 {
309  static const uint8_t bits[] = {
310  2, 2, 2, 4, 4, 4,
311  6, 6, 6, 8, 8, 8,
312  10, 10, 10, 12, 12, 12,
313  14, 14, 14, 14
314  };
315  static const uint16_t codes[] = {
316  0x0000, 0x0001, 0x0002, // 00/01/10
317  0x000c, 0x000d, 0x000e, // 11+00/01/10
318  0x003c, 0x003d, 0x003e, // 1111+00/01/10
319  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
320  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
321  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
322  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
323  };
324  int cntr[8] = { 0 }, n, res;
325 
326  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
327  for (n = 0; n < 17; n++) {
328  res = get_bits(gb, 3);
329  if (cntr[res] > 3) // should be >= 3 + (res == 7))
330  return -1;
331  vbm_tree[res * 3 + cntr[res]++] = n;
332  }
333  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
334  bits, 1, 1, codes, 2, 2, 132);
335  return 0;
336 }
337 
342 {
343  int n, flags, pitch_range, lsp16_flag;
344  WMAVoiceContext *s = ctx->priv_data;
345 
354  if (ctx->extradata_size != 46) {
355  av_log(ctx, AV_LOG_ERROR,
356  "Invalid extradata size %d (should be 46)\n",
357  ctx->extradata_size);
358  return -1;
359  }
360  flags = AV_RL32(ctx->extradata + 18);
361  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
362  s->do_apf = flags & 0x1;
363  if (s->do_apf) {
364  ff_rdft_init(&s->rdft, 7, DFT_R2C);
365  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
366  ff_dct_init(&s->dct, 6, DCT_I);
367  ff_dct_init(&s->dst, 6, DST_I);
368 
369  ff_sine_window_init(s->cos, 256);
370  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
371  for (n = 0; n < 255; n++) {
372  s->sin[n] = -s->sin[510 - n];
373  s->cos[510 - n] = s->cos[n];
374  }
375  }
376  s->denoise_strength = (flags >> 2) & 0xF;
377  if (s->denoise_strength >= 12) {
378  av_log(ctx, AV_LOG_ERROR,
379  "Invalid denoise filter strength %d (max=11)\n",
380  s->denoise_strength);
381  return -1;
382  }
383  s->denoise_tilt_corr = !!(flags & 0x40);
384  s->dc_level = (flags >> 7) & 0xF;
385  s->lsp_q_mode = !!(flags & 0x2000);
386  s->lsp_def_mode = !!(flags & 0x4000);
387  lsp16_flag = flags & 0x1000;
388  if (lsp16_flag) {
389  s->lsps = 16;
390  s->frame_lsp_bitsize = 34;
391  s->sframe_lsp_bitsize = 60;
392  } else {
393  s->lsps = 10;
394  s->frame_lsp_bitsize = 24;
395  s->sframe_lsp_bitsize = 48;
396  }
397  for (n = 0; n < s->lsps; n++)
398  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
399 
400  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
401  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
402  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
403  return -1;
404  }
405 
406  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
407  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
408  pitch_range = s->max_pitch_val - s->min_pitch_val;
409  if (pitch_range <= 0) {
410  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
411  return -1;
412  }
413  s->pitch_nbits = av_ceil_log2(pitch_range);
414  s->last_pitch_val = 40;
416  s->history_nsamples = s->max_pitch_val + 8;
417 
419  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
420  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
421 
422  av_log(ctx, AV_LOG_ERROR,
423  "Unsupported samplerate %d (min=%d, max=%d)\n",
424  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
425 
426  return -1;
427  }
428 
429  s->block_conv_table[0] = s->min_pitch_val;
430  s->block_conv_table[1] = (pitch_range * 25) >> 6;
431  s->block_conv_table[2] = (pitch_range * 44) >> 6;
432  s->block_conv_table[3] = s->max_pitch_val - 1;
433  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
434  if (s->block_delta_pitch_hrange <= 0) {
435  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
436  return -1;
437  }
438  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
440  s->block_conv_table[3] + 1 +
441  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
442  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
443 
444  ctx->channels = 1;
447 
449  ctx->coded_frame = &s->frame;
450 
451  return 0;
452 }
453 
/**
 * Adaptive gain control: scale the postfiltered signal so that its level
 * follows the level of the unfiltered speech synthesis.
 *
 * The target gain is (1 - alpha) * sum|speech_synth| / sum|in|. It is fed,
 * sample by sample, through the one-pole smoother
 * mem = alpha * mem + target, and each output sample is in[i] * mem.
 *
 * If the postfiltered input carries no energy at all (sum|in| == 0), the
 * target gain is taken to be 0 instead of dividing by zero. Otherwise a
 * NaN/Inf would be stored in *gain_mem and corrupt every subsequent call.
 *
 * @param out          output samples (size entries); may alias in
 * @param in           postfiltered input samples (size entries)
 * @param speech_synth reference (unfiltered) synthesis samples
 * @param size         number of samples to process
 * @param alpha        smoothing factor of the gain filter
 * @param gain_mem     smoother state; read on entry, updated on exit
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    int i;
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
    float mem = *gain_mem;

    for (i = 0; i < size; i++) {
        speech_energy     += fabsf(speech_synth[i]);
        postfilter_energy += fabsf(in[i]);
    }

    /* guard against 0/0 (NaN) and x/0 (Inf) on all-zero filter output */
    if (postfilter_energy == 0.0f)
        gain_scale_factor = 0.0f;
    else
        gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;

    for (i = 0; i < size; i++) {
        mem    = alpha * mem + gain_scale_factor;
        out[i] = in[i] * mem;
    }

    *gain_mem = mem;
}
496 
515 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
516  const float *in, float *out, int size)
517 {
518  int n;
519  float optimal_gain = 0, dot;
520  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
521  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
522  *best_hist_ptr;
523 
524  /* find best fitting point in history */
525  do {
526  dot = ff_scalarproduct_float_c(in, ptr, size);
527  if (dot > optimal_gain) {
528  optimal_gain = dot;
529  best_hist_ptr = ptr;
530  }
531  } while (--ptr >= end);
532 
533  if (optimal_gain <= 0)
534  return -1;
535  dot = ff_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
536  if (dot <= 0) // would be 1.0
537  return -1;
538 
539  if (optimal_gain <= dot) {
540  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
541  } else
542  dot = 0.625;
543 
544  /* actual smoothing */
545  for (n = 0; n < size; n++)
546  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
547 
548  return 0;
549 }
550 
/**
 * Compute the tilt of a coefficient set as the ratio of two sums:
 * (lpcs[0] + sum lpcs[i] * lpcs[i + 1]) / (1 + sum lpcs[i]^2).
 *
 * @param lpcs   coefficient array
 * @param n_lpcs number of coefficients in lpcs
 * @return the ratio described above
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    /* 1 + sum of squares */
    const float denom = 1.0 + ff_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
    /* first coefficient + product sum against the array shifted by one */
    const float numer = lpcs[0] +
                        ff_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);

    return numer / denom;
}
570 
/*
 * Derive the denoise filter coefficients (written to coeffs[]) from the
 * array in lpcs[].
 *
 * Visible steps: forward RDFT of lpcs[] in place; conversion of each bin
 * to log10 power while tracking the minimum and maximum; mapping to
 * per-bin gains through wmavoice_energy_table[]; DCT then DST on lpcs[];
 * combination of gains with the s->sin[] / s->cos[] tables into complex
 * pairs in coeffs[]; inverse RDFT; zeroing of coeffs[remainder..127];
 * optional tilt compensation; normalisation of the first `remainder`
 * entries.
 *
 * s         - context; uses rdft, irdft, dct, dst, sin[], cos[] and
 *             denoise_tilt_corr
 * lpcs      - input array, overwritten as scratch space (indices up to 64
 *             are written after the transform)
 * fcb_type  - selects the gain multiplier (FCB_TYPE_HARDCODED vs. others)
 * coeffs    - output array; indices 0..127 are written
 * remainder - number of leading coefficients to keep
 *
 * NOTE(review): the statement assigning `pwr` (listing line 609) is absent
 * from this copy of the file; `pwr` is read right after `idx` is computed.
 */
574 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
575  int fcb_type, float *coeffs, int remainder)
576 {
577  float last_coeff, min = 15.0, max = -15.0;
578  float irange, angle_mul, gain_mul, range, sq;
579  int n, idx;
580 
581  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
582  s->rdft.rdft_calc(&s->rdft, lpcs);
/* log_range(): store log10(assign) in var and widen [min, max] to include it */
583 #define log_range(var, assign) do { \
584  float tmp = log10f(assign); var = tmp; \
585  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
586  } while (0)
/* lpcs[1] is handled separately and parked in last_coeff, because the loop
 * below overwrites lpcs[1] before the value can be stored at lpcs[64] */
587  log_range(last_coeff, lpcs[1] * lpcs[1]);
588  for (n = 1; n < 64; n++)
589  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
590  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
591  log_range(lpcs[0], lpcs[0] * lpcs[0]);
592 #undef log_range
593  range = max - min;
594  lpcs[64] = last_coeff;
595 
596  /* Now, use this spectrum to pick out these frequencies with higher
597  * (relative) power/energy (which we then take to be "not noise"),
598  * and set up a table (still in lpc[]) of (relative) gains per frequency.
599  * These frequencies will be maintained, while others ("noise") will be
600  * decreased in the filter output. */
601  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
602  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
603  (5.0 / 14.7));
604  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
605  for (n = 0; n <= 64; n++) {
606  float pwr;
607 
608  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
610  lpcs[n] = angle_mul * pwr;
611 
612  /* 70.57 =~ 1/log10(1.0331663) */
613  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
614  if (idx > 127) { // fallback if index falls outside table range
615  coeffs[n] = wmavoice_energy_table[127] *
616  powf(1.0331663, idx - 127);
617  } else
618  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
619  }
620 
621  /* calculate the Hilbert transform of the gains, which we do (since this
622  * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
623  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
624  * "moment" of the LPCs in this filter. */
625  s->dct.dct_calc(&s->dct, lpcs);
626  s->dst.dct_calc(&s->dst, lpcs);
627 
628  /* Split out the coefficient indexes into phase/magnitude pairs */
/* every table index is 255 + a value clipped to [-255, 255], i.e. 0..510 */
629  idx = 255 + av_clip(lpcs[64], -255, 255);
630  coeffs[0] = coeffs[0] * s->cos[idx];
631  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
632  last_coeff = coeffs[64] * s->cos[idx];
/* walk n downwards two steps per iteration (odd n first, then even n),
 * alternating the sign applied to lpcs[64]; the loop exits once n hits 0 */
633  for (n = 63;; n--) {
634  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
635  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
636  coeffs[n * 2] = coeffs[n] * s->cos[idx];
637 
638  if (!--n) break;
639 
640  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
641  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
642  coeffs[n * 2] = coeffs[n] * s->cos[idx];
643  }
/* the value computed from coeffs[64] above is stored in slot 1 */
644  coeffs[1] = last_coeff;
645 
646  /* move into real domain */
647  s->irdft.rdft_calc(&s->irdft, coeffs);
648 
649  /* tilt correction and normalize scale */
650  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
651  if (s->denoise_tilt_corr) {
652  float tilt_mem = 0;
653 
654  coeffs[remainder - 1] = 0;
655  ff_tilt_compensation(&tilt_mem,
656  -1.8 * tilt_factor(coeffs, remainder - 1),
657  coeffs, remainder);
658  }
/* scale so that the squared sum of the kept coefficients is (1/64)^2 */
659  sq = (1.0 / 64.0) * sqrtf(1 / ff_scalarproduct_float_c(coeffs, coeffs, remainder));
660  for (n = 0; n < remainder; n++)
661  coeffs[n] *= sq;
662 }
663 
/*
 * Denoise one block of synthesized samples in place (synth_pf[]) and
 * manage the spill-over of filter output between consecutive calls.
 *
 * For any fcb_type other than FCB_TYPE_SILENCE: a tilt-compensated copy of
 * the LPCs is built, calc_input_response() turns it into filter
 * coefficients, and filtering is done by multiplying the RDFT of synth_pf[]
 * with the RDFT of the coefficients, followed by the inverse RDFT.
 * Afterwards (for every fcb_type) the output cached by earlier calls is
 * added to the start of synth_pf[], and for non-silence blocks the filter
 * output beyond `size` is stored in s->denoise_filter_cache[].
 *
 * s        - context (scratch buffers, transforms, filter cache)
 * fcb_type - fixed codebook type of this block
 * synth_pf - samples to filter; indices up to 127 are written, so the
 *            buffer must hold at least 128 floats
 * size     - number of valid samples in synth_pf[]
 * lpcs     - s->lsps LPC coefficients for this block
 *
 * NOTE(review): the final argument/closing of the memmove() call (listing
 * line 736) is absent from this copy of the file.
 */
690 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
691  float *synth_pf, int size,
692  const float *lpcs)
693 {
694  int remainder, lim, n;
695 
696  if (fcb_type != FCB_TYPE_SILENCE) {
697  float *tilted_lpcs = s->tilted_lpcs_pf,
698  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
699 
/* tilted_lpcs[] = { 1.0, lpcs[0 .. lsps-1], 0, ... } padded to 128 entries */
700  tilted_lpcs[0] = 1.0;
701  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
702  memset(&tilted_lpcs[s->lsps + 1], 0,
703  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
704  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
705  tilted_lpcs, s->lsps + 2);
706 
707  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
708  * size is applied to the next frame. All input beyond this is zero,
709  * and thus all output beyond this will go towards zero, hence we can
710  * limit to min(size-1, 127-size) as a performance consideration. */
/* `remainder` is only assigned here; it is only read again in the second
 * non-silence branch at the end of the function */
711  remainder = FFMIN(127 - size, size - 1);
712  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
713 
714  /* apply coefficients (in frequency spectrum domain), i.e. complex
715  * number multiplication */
716  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
717  s->rdft.rdft_calc(&s->rdft, synth_pf);
718  s->rdft.rdft_calc(&s->rdft, coeffs);
/* slots 0 and 1 are multiplied as plain reals, the rest as complex pairs */
719  synth_pf[0] *= coeffs[0];
720  synth_pf[1] *= coeffs[1];
721  for (n = 1; n < 64; n++) {
722  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
723  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
724  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
725  }
726  s->irdft.rdft_calc(&s->irdft, synth_pf);
727  }
728 
729  /* merge filter output with the history of previous runs */
730  if (s->denoise_filter_cache_size) {
731  lim = FFMIN(s->denoise_filter_cache_size, size);
732  for (n = 0; n < lim; n++)
733  synth_pf[n] += s->denoise_filter_cache[n];
734  s->denoise_filter_cache_size -= lim;
735  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
737  }
738 
739  /* move remainder of filter output into a cache for future runs */
740  if (fcb_type != FCB_TYPE_SILENCE) {
/* add onto what is still cached, then append whatever extends past it */
741  lim = FFMIN(remainder, s->denoise_filter_cache_size);
742  for (n = 0; n < lim; n++)
743  s->denoise_filter_cache[n] += synth_pf[size + n];
744  if (lim < remainder) {
745  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
746  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
747  s->denoise_filter_cache_size = remainder;
748  }
749  }
750 }
751 
/*
 * Postfilter one block of synthesized speech.
 *
 * Visible steps: (1) derive an excitation signal from synth[] with the
 * zero-synthesis LPC filter; (2) for fcb_type >= FCB_TYPE_AW_PULSES, try
 * kalman_smoothen() and use its output if it succeeds; (3) re-synthesize
 * speech from the chosen excitation and save the last s->lsps outputs as
 * filter history; (4) wiener_denoise(); (5) adaptive_gain_control() into
 * samples[]; (6) if s->dc_level > 8, an additional filter stage using
 * s->dcf_mem.
 *
 * s           - decoder context
 * synth       - input synthesized speech, `size` samples
 * samples     - output samples, `size` entries
 * size        - block length; asserted to be <= MAX_FRAMESIZE / 2
 * lpcs        - LPC coefficients (s->lsps of them) for this block
 * zero_exc_pf - destination for the derived excitation; also read by
 *               kalman_smoothen(), which accesses negative indices
 * fcb_type    - fixed codebook type of this block
 * pitch       - pitch value handed to kalman_smoothen()
 *
 * NOTE(review): the head of the call inside the dc_level branch (listing
 * line 805) is absent from this copy of the file; only its trailing
 * arguments are visible.
 */
772 static void postfilter(WMAVoiceContext *s, const float *synth,
773  float *samples, int size,
774  const float *lpcs, float *zero_exc_pf,
775  int fcb_type, int pitch)
776 {
/* synth_pf points MAX_LSPS_ALIGN16 floats into the buffer, leaving room in
 * front of it for the filter history written by the memcpy() below */
777  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
778  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
779  *synth_filter_in = zero_exc_pf;
780 
781  assert(size <= MAX_FRAMESIZE / 2);
782 
783  /* generate excitation from input signal */
784  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
785 
/* use the smoothed excitation only if kalman_smoothen() returned 0 */
786  if (fcb_type >= FCB_TYPE_AW_PULSES &&
787  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
788  synth_filter_in = synth_filter_in_buf;
789 
790  /* re-synthesize speech after smoothening, and keep history */
791  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
792  synth_filter_in, size, s->lsps);
793  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
794  sizeof(synth_pf[0]) * s->lsps);
795 
796  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
797 
798  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
799  &s->postfilter_agc);
800 
801  if (s->dc_level > 8) {
802  /* remove ultra-low frequency DC noise / highpass filter;
803  * coefficients are identical to those used in SIPR decoding,
804  * and very closely resemble those used in AMR-NB decoding. */
806  (const float[2]) { -1.99997, 1.0 },
807  (const float[2]) { -1.9330735188, 0.93589198496 },
808  0.93980580475, s->dcf_mem, size);
809  }
810 }
/**
 * Multi-stage vector dequantizer: accumulate one codebook vector per stage
 * into lsps[].
 *
 * lsps[] is cleared first. For each stage n, the vector with index
 * values[n] is taken from that stage's codebook and
 * base_q[n] + mul_q[n] * entry is added to every output element.
 * The codebooks are stored back to back in table; stage n's codebook
 * holds sizes[n] vectors of num bytes each.
 *
 * @param lsps     output array, num entries
 * @param num      vector length
 * @param values   selected vector index per stage
 * @param sizes    number of vectors in each stage's codebook
 * @param n_stages number of stages
 * @param table    concatenated codebooks
 * @param mul_q    per-stage scale factor
 * @param base_q   per-stage offset
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *codebook = table;
    int stage;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        const uint8_t *vec  = &codebook[values[stage] * num];
        const double offset = base_q[stage];
        const double scale  = mul_q[stage];
        int k = 0;

        while (k < num) {
            lsps[k] += offset + scale * vec[k];
            k++;
        }

        /* step over this stage's codebook to reach the next one */
        codebook += sizes[stage] * num;
    }
}
846 
858 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
859 {
860  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
861  static const double mul_lsf[4] = {
862  5.2187144800e-3, 1.4626986422e-3,
863  9.6179549166e-4, 1.1325736225e-3
864  };
865  static const double base_lsf[4] = {
866  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
867  M_PI * -3.3486e-2, M_PI * -5.7408e-2
868  };
869  uint16_t v[4];
870 
871  v[0] = get_bits(gb, 8);
872  v[1] = get_bits(gb, 6);
873  v[2] = get_bits(gb, 5);
874  v[3] = get_bits(gb, 5);
875 
876  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
877  mul_lsf, base_lsf);
878 }
879 
885  double *i_lsps, const double *old,
886  double *a1, double *a2, int q_mode)
887 {
888  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
889  static const double mul_lsf[3] = {
890  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
891  };
892  static const double base_lsf[3] = {
893  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
894  };
895  const float (*ipol_tab)[2][10] = q_mode ?
897  uint16_t interpol, v[3];
898  int n;
899 
900  dequant_lsp10i(gb, i_lsps);
901 
902  interpol = get_bits(gb, 5);
903  v[0] = get_bits(gb, 7);
904  v[1] = get_bits(gb, 6);
905  v[2] = get_bits(gb, 6);
906 
907  for (n = 0; n < 10; n++) {
908  double delta = old[n] - i_lsps[n];
909  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
910  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
911  }
912 
913  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
914  mul_lsf, base_lsf);
915 }
916 
920 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
921 {
922  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
923  static const double mul_lsf[5] = {
924  3.3439586280e-3, 6.9908173703e-4,
925  3.3216608306e-3, 1.0334960326e-3,
926  3.1899104283e-3
927  };
928  static const double base_lsf[5] = {
929  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
930  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
931  M_PI * -1.29816e-1
932  };
933  uint16_t v[5];
934 
935  v[0] = get_bits(gb, 8);
936  v[1] = get_bits(gb, 6);
937  v[2] = get_bits(gb, 7);
938  v[3] = get_bits(gb, 6);
939  v[4] = get_bits(gb, 7);
940 
941  dequant_lsps( lsps, 5, v, vec_sizes, 2,
942  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
943  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
944  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
945  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
946  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
947 }
948 
954  double *i_lsps, const double *old,
955  double *a1, double *a2, int q_mode)
956 {
957  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
958  static const double mul_lsf[3] = {
959  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
960  };
961  static const double base_lsf[3] = {
962  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
963  };
964  const float (*ipol_tab)[2][16] = q_mode ?
966  uint16_t interpol, v[3];
967  int n;
968 
969  dequant_lsp16i(gb, i_lsps);
970 
971  interpol = get_bits(gb, 5);
972  v[0] = get_bits(gb, 7);
973  v[1] = get_bits(gb, 7);
974  v[2] = get_bits(gb, 7);
975 
976  for (n = 0; n < 16; n++) {
977  double delta = old[n] - i_lsps[n];
978  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
979  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
980  }
981 
982  dequant_lsps( a2, 10, v, vec_sizes, 1,
983  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
984  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
985  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
986  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
987  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
988 }
989 
1004  const int *pitch)
1005 {
1006  static const int16_t start_offset[94] = {
1007  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1008  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1009  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1010  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1011  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1012  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1013  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1014  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1015  };
1016  int bits, offset;
1017 
1018  /* position of pulse */
1019  s->aw_idx_is_ext = 0;
1020  if ((bits = get_bits(gb, 6)) >= 54) {
1021  s->aw_idx_is_ext = 1;
1022  bits += (bits - 54) * 3 + get_bits(gb, 2);
1023  }
1024 
1025  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1026  * the distribution of the pulses in each block contained in this frame. */
1027  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1028  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1029  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1030  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1031  offset += s->aw_n_pulses[0] * pitch[0];
1032  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1033  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1034 
1035  /* if continuing from a position before the block, reset position to
1036  * start of block (when corrected for the range over which it can be
1037  * spread in aw_pulse_set1()). */
1038  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1039  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1040  s->aw_first_pulse_off[1] -= pitch[1];
1041  if (start_offset[bits] < 0)
1042  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1043  s->aw_first_pulse_off[0] -= pitch[0];
1044  }
1045 }
1046 
1055  int block_idx, AMRFixed *fcb)
1056 {
1057  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1058  uint16_t *use_mask = use_mask_mem + 2;
1059  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1060  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1061  * of idx are the position of the bit within a particular item in the
1062  * array (0 being the most significant bit, and 15 being the least
1063  * significant bit), and the remainder (>> 4) is the index in the
1064  * use_mask[]-array. This is faster and uses less memory than using a
1065  * 80-byte/80-int array. */
1066  int pulse_off = s->aw_first_pulse_off[block_idx],
1067  pulse_start, n, idx, range, aidx, start_off = 0;
1068 
1069  /* set offset of first pulse to within this block */
1070  if (s->aw_n_pulses[block_idx] > 0)
1071  while (pulse_off + s->aw_pulse_range < 1)
1072  pulse_off += fcb->pitch_lag;
1073 
1074  /* find range per pulse */
1075  if (s->aw_n_pulses[0] > 0) {
1076  if (block_idx == 0) {
1077  range = 32;
1078  } else /* block_idx = 1 */ {
1079  range = 8;
1080  if (s->aw_n_pulses[block_idx] > 0)
1081  pulse_off = s->aw_next_pulse_off_cache;
1082  }
1083  } else
1084  range = 16;
1085  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1086 
1087  /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1088  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1089  * we exclude that range from being pulsed again in this function. */
1090  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1091  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1092  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1093  if (s->aw_n_pulses[block_idx] > 0)
1094  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1095  int excl_range = s->aw_pulse_range; // always 16 or 24
1096  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1097  int first_sh = 16 - (idx & 15);
1098  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1099  excl_range -= first_sh;
1100  if (excl_range >= 16) {
1101  *use_mask_ptr++ = 0;
1102  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1103  } else
1104  *use_mask_ptr &= 0xFFFF >> excl_range;
1105  }
1106 
1107  /* find the 'aidx'th offset that is not excluded */
1108  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1109  for (n = 0; n <= aidx; pulse_start++) {
1110  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1111  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1112  if (use_mask[0]) idx = 0x0F;
1113  else if (use_mask[1]) idx = 0x1F;
1114  else if (use_mask[2]) idx = 0x2F;
1115  else if (use_mask[3]) idx = 0x3F;
1116  else if (use_mask[4]) idx = 0x4F;
1117  else return;
1118  idx -= av_log2_16bit(use_mask[idx >> 4]);
1119  }
1120  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1121  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1122  n++;
1123  start_off = idx;
1124  }
1125  }
1126 
1127  fcb->x[fcb->n] = start_off;
1128  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1129  fcb->n++;
1130 
1131  /* set offset for next block, relative to start of that block */
1132  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1133  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1134 }
1135 
1144  int block_idx, AMRFixed *fcb)
1145 {
1146  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1147  float v;
1148 
1149  if (s->aw_n_pulses[block_idx] > 0) {
1150  int n, v_mask, i_mask, sh, n_pulses;
1151 
1152  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1153  n_pulses = 3;
1154  v_mask = 8;
1155  i_mask = 7;
1156  sh = 4;
1157  } else { // 4 pulses, 1:sign + 2:index each
1158  n_pulses = 4;
1159  v_mask = 4;
1160  i_mask = 3;
1161  sh = 3;
1162  }
1163 
1164  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1165  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1166  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1167  s->aw_first_pulse_off[block_idx];
1168  while (fcb->x[fcb->n] < 0)
1169  fcb->x[fcb->n] += fcb->pitch_lag;
1170  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1171  fcb->n++;
1172  }
1173  } else {
1174  int num2 = (val & 0x1FF) >> 1, delta, idx;
1175 
1176  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1177  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1178  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1179  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1180  v = (val & 0x200) ? -1.0 : 1.0;
1181 
1182  fcb->no_repeat_mask |= 3 << fcb->n;
1183  fcb->x[fcb->n] = idx - delta;
1184  fcb->y[fcb->n] = v;
1185  fcb->x[fcb->n + 1] = idx;
1186  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1187  fcb->n += 2;
1188  }
1189 }
1190 
/**
 * Deterministic pseudo-random index generator.
 *
 * A seed x is formed from block_num * 1877 + frame_cntr (reduced by
 * 0xFFFF once if it reaches that value), then
 *   y = (x % 9) * 5 + 6,  z = (49995 * x) / y
 * is evaluated without a division, and the low 16 bits of z are reduced
 * modulo (1000 - block_size).
 *
 * @param frame_cntr frame counter contribution to the seed
 * @param block_num  block number contribution to the seed
 * @param block_size block size; bounds the result to [0, 1000 - block_size)
 * @return the generated index
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* Lookup table that removes the division from
     *   z = (49995 * x) / y,  y = (x % 9) * 5 + 6.
     * y only takes 9 values, so z can be rewritten as
     *   z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row belongs to one value of y: the first number is 49995 / y,
     * the second is the FASTDIV variant of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 * 93368855U  }  // y = 46
    };
    unsigned int seed = MUL16(block_num, 1877) + frame_cntr;
    unsigned int row, scaled;

    /* max value of the seed is 8*1877+0xFFFE=0x13AA6, so a single
     * subtraction is effectively a modulo (%) */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    row    = seed - 9 * MULH(477218589, seed); // seed % 9
    /* scaled = seed * 49995 / (row * 5 + 6), truncated to 16 bits */
    scaled = (uint16_t) (seed * div_tbl[row][0] +
                         UMULH(seed, div_tbl[row][1]));

    return scaled % (1000 - block_size);
}
1235 
1241  int block_idx, int size,
1242  const struct frame_type_desc *frame_desc,
1243  float *excitation)
1244 {
1245  float gain;
1246  int n, r_idx;
1247 
1248  assert(size <= MAX_FRAMESIZE);
1249 
1250  /* Set the offset from which we start reading wmavoice_std_codebook */
1251  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1252  r_idx = pRNG(s->frame_cntr, block_idx, size);
1253  gain = s->silence_gain;
1254  } else /* FCB_TYPE_HARDCODED */ {
1255  r_idx = get_bits(gb, 8);
1256  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1257  }
1258 
1259  /* Clear gain prediction parameters */
1260  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1261 
1262  /* Apply gain to hardcoded codebook and use that as excitation signal */
1263  for (n = 0; n < size; n++)
1264  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1265 }
1266 
1272  int block_idx, int size,
1273  int block_pitch_sh2,
1274  const struct frame_type_desc *frame_desc,
1275  float *excitation)
1276 {
/* NOTE(review): the first line of this definition (listing line 1271) is
 * absent from this listing. The body uses `s` and `gb`, and synth_block()
 * calls synth_block_fcb_acb() with (s, gb, block_idx, size, block_pitch_sh2,
 * frame_desc, excitation) - confirm against the real file.
 *
 * Builds the excitation of one block as a weighted sum of
 *  - a fixed-codebook pulse vector (pulses[]), and
 *  - an adaptive-codebook signal obtained by interpolating/copying earlier
 *    excitation samples (the code reads excitation[] at negative offsets, so
 *    the caller must provide history in front of the pointer).
 * block_pitch_sh2 is used as "whole lag" (>> 2) plus a 2-bit interpolation
 * index (& 3). The two gains come from one 7-bit codebook index; the fixed
 * gain additionally uses a prediction from s->gain_pred_err, which is then
 * updated. */
1277  static const float gain_coeff[6] = {
1278  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1279  };
1280  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1281  int n, idx, gain_weight;
1282  AMRFixed fcb;
1283 
1284  assert(size <= MAX_FRAMESIZE / 2);
1285  memset(pulses, 0, sizeof(*pulses) * size);
1286 
1287  fcb.pitch_lag = block_pitch_sh2 >> 2;
1288  fcb.pitch_fac = 1.0;
1289  fcb.no_repeat_mask = 0;
1290  fcb.n = 0;
1291 
1292  /* For the other frame types, this is where we apply the innovation
1293  * (fixed) codebook pulses of the speech signal. */
1294  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1295  aw_pulse_set1(s, gb, block_idx, &fcb);
1296  aw_pulse_set2(s, gb, block_idx, &fcb);
1297  } else /* FCB_TYPE_EXC_PULSES */ {
1298  int offset_nbits = 5 - frame_desc->log_n_blocks;
1299 
1300  fcb.no_repeat_mask = -1;
1301  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1302  * (instead of double) for a subset of pulses */
/* Five passes; pass n places pulses at positions n + 5 * pos. The first
 * frame_desc->dbl_pulses passes read a second position whose sign is the
 * first pulse's sign, negated when pos1 < pos2. */
1303  for (n = 0; n < 5; n++) {
1304  float sign;
1305  int pos1, pos2;
1306 
1307  sign = get_bits1(gb) ? 1.0 : -1.0;
1308  pos1 = get_bits(gb, offset_nbits);
1309  fcb.x[fcb.n] = n + 5 * pos1;
1310  fcb.y[fcb.n++] = sign;
1311  if (n < frame_desc->dbl_pulses) {
1312  pos2 = get_bits(gb, offset_nbits);
1313  fcb.x[fcb.n] = n + 5 * pos2;
1314  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1315  }
1316  }
1317  }
1318  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1319 
1320  /* Calculate gain for adaptive & fixed codebook signal.
1321  * see ff_amr_set_fixed_gain(). */
1322  idx = get_bits(gb, 7);
1323  fcb_gain = expf(ff_scalarproduct_float_c(s->gain_pred_err, gain_coeff, 6) -
1324  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1325  acb_gain = wmavoice_gain_codebook_acb[idx];
1326  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1327  -2.9957322736 /* log(0.05) */,
1328  1.6094379124 /* log(5.0) */);
1329 
/* Shift the prediction-error history by gain_weight entries and store the
 * new (clipped) error gain_weight times at the front.
 * NOTE(review): this assumes gain_weight <= 6, i.e. log_n_blocks >= 1 for
 * every frame type that reaches this function - confirm against
 * frame_descs; otherwise (6 - gain_weight) is negative. */
1330  gain_weight = 8 >> frame_desc->log_n_blocks;
1331  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1332  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1333  for (n = 0; n < gain_weight; n++)
1334  s->gain_pred_err[n] = pred_err;
1335 
1336  /* Calculation of adaptive codebook */
1337  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
/* The pitch is interpolated per sample from s->last_pitch_val using
 * s->pitch_diff_sh16 (16 fractional bits). The block is processed in runs of
 * `len` samples; `pitch` and `idx` are recomputed at the start of each run. */
1338  int len;
1339  for (n = 0; n < size; n += len) {
1340  int next_idx_sh16;
1341  int abs_idx = block_idx * size + n;
1342  int pitch_sh16 = (s->last_pitch_val << 16) +
1343  s->pitch_diff_sh16 * abs_idx;
1344  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1345  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1346  idx = idx_sh16 >> 16;
1347  if (s->pitch_diff_sh16) {
1348  if (s->pitch_diff_sh16 > 0) {
1349  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1350  } else
1351  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1352  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1353  1, size - n);
1354  } else
1355  len = size;
1356 
1357  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
/* NOTE(review): listing line 1358 is missing here; the call as shown lacks
 * the arguments between the input pointer and `idx` (presumably the
 * interpolation coefficient table and its precision) - restore from the
 * real file. */
1359  idx, 9, len);
1360  }
1361  } else /* ACB_TYPE_HAMMING */ {
1362  int block_pitch = block_pitch_sh2 >> 2;
1363  idx = block_pitch_sh2 & 3;
1364  if (idx) {
1365  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
/* NOTE(review): listing line 1366 is missing here; as above, the arguments
 * between the input pointer and `idx` are not visible. */
1367  idx, 8, size);
1368  } else
/* Zero fractional part: plain copy from block_pitch samples back. */
1369  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1370  sizeof(float) * size);
1371  }
1372 
1373  /* Interpolate ACB/FCB and use as excitation signal */
1374  ff_weighted_vector_sumf(excitation, excitation, pulses,
1375  acb_gain, fcb_gain, size);
1376 }
1377 
1395  int block_idx, int size,
1396  int block_pitch_sh2,
1397  const double *lsps, const double *prev_lsps,
1398  const struct frame_type_desc *frame_desc,
1399  float *excitation, float *synth)
1400 {
/* NOTE(review): the first line of this definition (listing line 1394) is
 * absent from this listing. The body uses `s` and `gb`, and synth_frame()
 * calls synth_block() with (s, gb, n, block_nsamples, bl_pitch_sh2, lsps,
 * prev_lsps, &frame_descs[bd_idx], excitation ptr, synth ptr) - confirm
 * against the real file.
 *
 * Decodes one block:
 *  1. produce excitation[0..size) - hardcoded codebook when the frame has no
 *     adaptive codebook (ACB_TYPE_NONE), pulse + adaptive codebook otherwise;
 *  2. interpolate between prev_lsps and lsps at the block centre
 *     ((block_idx + 0.5) / n_blocks), take the cosine and convert to LPCs;
 *  3. run the LP synthesis filter with s->lsps coefficients, writing `size`
 *     samples to synth. */
1401  double i_lsps[MAX_LSPS];
1402  float lpcs[MAX_LSPS];
1403  float fac;
1404  int n;
1405 
1406  if (frame_desc->acb_type == ACB_TYPE_NONE)
1407  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1408  else
1409  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1410  frame_desc, excitation);
1411 
1412  /* convert interpolated LSPs to LPCs */
1413  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1414  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1415  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1416  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1417 
1418  /* Speech synthesis */
1419  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1420 }
1421 
/*
 * Decode a single frame of a superframe.
 *
 * Reads the frame-type VLC, derives one pitch value per block (either
 * interpolated from a per-frame pitch, or read per block from the bitstream),
 * synthesizes every block through synth_block() and finally writes 160
 * samples to `samples` - through postfilter() in two halves of 80 samples
 * when s->do_apf is set, by plain copy from `synth` otherwise.
 * Updates s->frame_cntr, s->last_acb_type and s->last_pitch_val for the
 * next frame.
 *
 * ctx        codec context; ctx->priv_data is the WMAVoiceContext
 * gb         bit reader positioned at the frame-type code
 * frame_idx  index of this frame, used to offset into s->zero_exc_pf
 * samples    output, 160 floats
 * lsps       LSF values of this frame
 * prev_lsps  LSF values of the previous frame
 * excitation excitation buffer for this frame (earlier samples are read)
 * synth      synthesis buffer for this frame
 *
 * Returns 0 on success, -1 when the frame-type code maps to a negative
 * index in s->vbm_tree.
 */
1437 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1438  float *samples,
1439  const double *lsps, const double *prev_lsps,
1440  float *excitation, float *synth)
1441 {
1442  WMAVoiceContext *s = ctx->priv_data;
1443  int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1444  int pitch[MAX_BLOCKS], last_block_pitch;
1445 
1446  /* Parse frame type ("frame header"), see frame_descs */
1447  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1448 
1449  if (bd_idx < 0) {
1450  av_log(ctx, AV_LOG_ERROR,
1451  "Invalid frame type VLC code, skipping\n");
1452  return -1;
1453  }
1454 
1455  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1456 
1457  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1458  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1459  /* Pitch is provided per frame, which is interpreted as the pitch of
1460  * the last sample of the last block of this frame. We can interpolate
1461  * the pitch of other blocks (and even pitch-per-sample) by gradually
1462  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1463  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1464  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1465  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1466  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
/* Restart the interpolation from the current pitch when the previous frame
 * had no adaptive codebook, or when the pitch change exceeds
 * (cur + last) / 20. */
1467  if (s->last_acb_type == ACB_TYPE_NONE ||
1468  20 * abs(cur_pitch_val - s->last_pitch_val) >
1469  (cur_pitch_val + s->last_pitch_val))
1470  s->last_pitch_val = cur_pitch_val;
1471 
1472  /* pitch per block */
/* Rounded linear interpolation between last_pitch_val and cur_pitch_val,
 * evaluated at the centre of block n (weight (2n + 1) / (2 * n_blocks)). */
1473  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1474  int fac = n * 2 + 1;
1475 
1476  pitch[n] = (MUL16(fac, cur_pitch_val) +
1477  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1478  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1479  }
1480 
1481  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1482  s->pitch_diff_sh16 =
1483  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1484  }
1485 
1486  /* Global gain (if silence) and pitch-adaptive window coordinates */
1487  switch (frame_descs[bd_idx].fcb_type) {
1488  case FCB_TYPE_SILENCE:
/* NOTE(review): listing line 1489 is missing here; going by the comment
 * above, it presumably reads the silence gain from the bitstream - restore
 * from the real file. */
1490  break;
1491  case FCB_TYPE_AW_PULSES:
1492  aw_parse_coords(s, gb, pitch);
1493  break;
1494  }
1495 
1496  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1497  int bl_pitch_sh2;
1498 
1499  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1500  switch (frame_descs[bd_idx].acb_type) {
1501  case ACB_TYPE_HAMMING: {
1502  /* Pitch is given per block. Per-block pitches are encoded as an
1503  * absolute value for the first block, and then delta values
1504  * relative to this value for all subsequent blocks. The scale of
1505  * this pitch value is semi-logarithmic compared to its use in the
1506  * decoder, so we convert it to normal scale also. */
/* t1, t2, t3 are the number of code values in the three segments of the
 * conversion: quarter steps, half steps and whole steps respectively. */
1507  int block_pitch,
1508  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1509  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1510  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1511 
1512  if (n == 0) {
1513  block_pitch = get_bits(gb, s->block_pitch_nbits);
1514  } else
1515  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
/* NOTE(review): listing line 1516 is missing here; the expression above
 * ends in '+', so its final operand (presumably the delta read from the
 * bitstream) is not visible. */
1517  /* Convert last_ so that any next delta is within _range */
1518  last_block_pitch = av_clip(block_pitch,
/* NOTE(review): listing lines 1519 and 1521 are missing; only part of the
 * clipping bounds of this av_clip() call is visible. */
1520  s->block_pitch_range -
1522 
1523  /* Convert semi-log-style scale back to normal scale */
1524  if (block_pitch < t1) {
1525  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1526  } else {
1527  block_pitch -= t1;
1528  if (block_pitch < t2) {
1529  bl_pitch_sh2 =
1530  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1531  } else {
1532  block_pitch -= t2;
1533  if (block_pitch < t3) {
1534  bl_pitch_sh2 =
1535  (s->block_conv_table[2] + block_pitch) << 2;
1536  } else
1537  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1538  }
1539  }
1540  pitch[n] = bl_pitch_sh2 >> 2;
1541  break;
1542  }
1543 
1544  case ACB_TYPE_ASYMMETRIC: {
1545  bl_pitch_sh2 = pitch[n] << 2;
1546  break;
1547  }
1548 
1549  default: // ACB_TYPE_NONE has no pitch
1550  bl_pitch_sh2 = 0;
1551  break;
1552  }
1553 
1554  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1555  lsps, prev_lsps, &frame_descs[bd_idx],
1556  &excitation[n * block_nsamples],
1557  &synth[n * block_nsamples]);
1558  }
1559 
1560  /* Averaging projection filter, if applicable. Else, just copy samples
1561  * from synthesis buffer */
1562  if (s->do_apf) {
1563  double i_lsps[MAX_LSPS];
1564  float lpcs[MAX_LSPS];
1565 
/* First half (samples 0..79): LPCs from the average of previous and current
 * LSFs. Second half (samples 80..159): LPCs from the current LSFs.
 * NOTE(review): pitch[] is only written on the ASYMMETRIC and HAMMING paths
 * visible above; for ACB_TYPE_NONE frames pitch[0] is passed to postfilter()
 * without a visible assignment - confirm postfilter() ignores it then. */
1566  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1567  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1568  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1569  postfilter(s, synth, samples, 80, lpcs,
1570  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1571  frame_descs[bd_idx].fcb_type, pitch[0]);
1572 
1573  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1574  i_lsps[n] = cos(lsps[n]);
1575  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1576  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1577  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1578  frame_descs[bd_idx].fcb_type, pitch[0]);
1579  } else
1580  memcpy(samples, synth, 160 * sizeof(synth[0]));
1581 
1582  /* Cache values for next frame */
1583  s->frame_cntr++;
1584  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1585  s->last_acb_type = frame_descs[bd_idx].acb_type;
1586  switch (frame_descs[bd_idx].acb_type) {
1587  case ACB_TYPE_NONE:
1588  s->last_pitch_val = 0;
1589  break;
1590  case ACB_TYPE_ASYMMETRIC:
1591  s->last_pitch_val = cur_pitch_val;
1592  break;
1593  case ACB_TYPE_HAMMING:
1594  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1595  break;
1596  }
1597 
1598  return 0;
1599 }
1600 
/**
 * Force a set of LSF values into a valid, increasing arrangement.
 *
 * The first value is raised to at least 0.0015 * pi, every following value
 * to at least its predecessor plus 0.0125 * pi, and the last value is then
 * capped at 0.9985 * pi. Capping the last value can place it below earlier
 * entries, so if any adjacent pair ends up out of order the whole array is
 * re-sorted with an insertion sort.
 * Very similar to ff_set_min_dist_lsf() / ff_sort_nearly_sorted_floats(),
 * but in double.
 *
 * @param lsps array of @p num LSF values, modified in place
 * @param num  number of entries in @p lsps; nothing is done (and nothing is
 *             accessed) when it is smaller than 1
 */
static void stabilize_lsps(double *lsps, int num)
{
    const double min_first = 0.0015 * M_PI;
    const double min_gap   = 0.0125 * M_PI;
    const double max_last  = 0.9985 * M_PI;
    int n, m, l;

    /* Without this guard, lsps[0] and lsps[num - 1] below would be accessed
     * out of bounds for an empty array. */
    if (num < 1)
        return;

    /* Lower bound for the first value and minimum spacing between values.
     * The comparisons are written as "value > bound ? value : bound" so
     * that a NaN input is replaced by the bound. */
    lsps[0] = lsps[0] > min_first ? lsps[0] : min_first;
    for (n = 1; n < num; n++) {
        const double floor_n = lsps[n - 1] + min_gap;

        lsps[n] = lsps[n] > floor_n ? lsps[n] : floor_n;
    }

    /* Upper bound for the last value. */
    if (lsps[num - 1] > max_last)
        lsps[num - 1] = max_last;

    /* Look for the first out-of-order pair; when the array is already
     * sorted there is nothing more to do. */
    for (n = 1; n < num; n++)
        if (lsps[n] < lsps[n - 1])
            break;
    if (n >= num)
        return;

    /* Reorder with a single insertion-sort pass over the whole array. */
    for (m = 1; m < num; m++) {
        double tmp = lsps[m];

        for (l = m - 1; l >= 0 && lsps[l] > tmp; l--)
            lsps[l + 1] = lsps[l];
        lsps[l + 1] = tmp;
    }
}
1641 
1652  WMAVoiceContext *s)
1653 {
/* NOTE(review): the first line of this definition (listing line 1651) is
 * absent from this listing. The body uses `orig_gb`, and synth_superframe()
 * calls check_bits_for_superframe(gb, s) - confirm against the real file.
 *
 * Dry-run over one superframe on a private copy of the bit reader, so the
 * caller's reader is not advanced. It walks the superframe header, the
 * optional LSP data and MAX_FRAMES frames, checking before each part that
 * enough bits are left.
 *
 * Return values as visible below:
 *   0  all checked parts fit into the remaining bits,
 *   1  the data runs out before the superframe is complete,
 *  -1  the first bit is clear ("WMAPro-in-WMAVoice") or a frame-type code
 *      maps to a negative index in s->vbm_tree. */
1654  GetBitContext s_gb, *gb = &s_gb;
1655  int n, need_bits, bd_idx;
1656  const struct frame_type_desc *frame_desc;
1657 
1658  /* initialize a copy */
1659  init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1660  skip_bits_long(gb, get_bits_count(orig_gb));
1661  assert(get_bits_left(gb) == get_bits_left(orig_gb));
1662 
1663  /* superframe header */
1664  if (get_bits_left(gb) < 14)
1665  return 1;
1666  if (!get_bits1(gb))
1667  return -1; // WMAPro-in-WMAVoice superframe
1668  if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1669  if (s->has_residual_lsps) { // residual LSPs (for all frames)
1670  if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1671  return 1;
/* NOTE(review): listing line 1672 is missing here; presumably it skips the
 * s->sframe_lsp_bitsize bits that were just checked - restore from the
 * real file. */
1673  }
1674 
1675  /* frames */
1676  for (n = 0; n < MAX_FRAMES; n++) {
1677  int aw_idx_is_ext = 0;
1678 
1679  if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1680  if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
/* NOTE(review): listing line 1681 is missing here; presumably it skips the
 * s->frame_lsp_bitsize bits that were just checked. */
1682  }
1683  bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1684  if (bd_idx < 0)
1685  return -1; // invalid frame type VLC code
1686  frame_desc = &frame_descs[bd_idx];
1687  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1688  if (get_bits_left(gb) < s->pitch_nbits)
1689  return 1;
1690  skip_bits_long(gb, s->pitch_nbits);
1691  }
1692  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1693  skip_bits(gb, 8);
1694  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
/* A 6-bit value of 0x36 or more is followed by 2 further bits here; in that
 * case no extra 2 bits are counted in the block section below. */
1695  int tmp = get_bits(gb, 6);
1696  if (tmp >= 0x36) {
1697  skip_bits(gb, 2);
1698  aw_idx_is_ext = 1;
1699  }
1700  }
1701 
1702  /* blocks */
1703  if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1704  need_bits = s->block_pitch_nbits +
1705  (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1706  } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1707  need_bits = 2 * !aw_idx_is_ext;
1708  } else
1709  need_bits = 0;
1710  need_bits += frame_desc->frame_size;
1711  if (get_bits_left(gb) < need_bits)
1712  return 1;
1713  skip_bits_long(gb, need_bits);
1714  }
1715 
1716  return 0;
1717 }
1718 
/*
 * Decode one superframe (three frames, up to 480 samples) into s->frame.
 *
 * Works on local excitation/synthesis buffers that are seeded from the
 * history kept in the context and written back to it at the end. If a
 * cached partial superframe is pending (s->sframe_cache_size > 0) the data
 * is read through a local bit reader instead of s->gb.
 *
 * Sets *got_frame_ptr to 1 when a frame was produced, and to 0 when the
 * data is incomplete or a frame fails to decode.
 *
 * Return values as visible below:
 *   0                     superframe decoded,
 *   1                     not enough bits for a complete superframe,
 *   AVERROR_PATCHWELCOME  first bit clear (WMAPro-in-WMAVoice),
 *  -1                     sample count above 480,
 *   other non-zero        propagated from ff_get_buffer() or synth_frame().
 */
1736 static int synth_superframe(AVCodecContext *ctx, int *got_frame_ptr)
1737 {
1738  WMAVoiceContext *s = ctx->priv_data;
1739  GetBitContext *gb = &s->gb, s_gb;
1740  int n, res, n_samples = 480;
1741  double lsps[MAX_FRAMES][MAX_LSPS];
1742  const double *mean_lsf = s->lsps == 16 ?
/* NOTE(review): listing line 1743 is missing here; the two alternatives of
 * this conditional (presumably the mean LSF tables for 16 and 10 LSPs) are
 * not visible - restore from the real file. */
1744  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1745  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1746  float *samples;
1747 
/* Seed the working buffers with the tail of the previous superframe. */
1748  memcpy(synth, s->synth_history,
1749  s->lsps * sizeof(*synth));
1750  memcpy(excitation, s->excitation_history,
1751  s->history_nsamples * sizeof(*excitation));
1752 
1753  if (s->sframe_cache_size > 0) {
1754  gb = &s_gb;
/* NOTE(review): listing line 1755 is missing here; as shown, s_gb is never
 * initialised. Presumably the missing line sets it up over the cached
 * superframe data - restore from the real file. */
1756  s->sframe_cache_size = 0;
1757  }
1758 
/* NOTE(review): only the return value 1 is handled here; a -1 from
 * check_bits_for_superframe() falls through to the parsing below. */
1759  if ((res = check_bits_for_superframe(gb, s)) == 1) {
1760  *got_frame_ptr = 0;
1761  return 1;
1762  }
1763 
1764  /* First bit is speech/music bit, it differentiates between WMAVoice
1765  * speech samples (the actual codec) and WMAVoice music samples, which
1766  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1767  * the wild yet. */
1768  if (!get_bits1(gb)) {
1769  av_log_missing_feature(ctx, "WMAPro-in-WMAVoice", 1);
1770  return AVERROR_PATCHWELCOME;
1771  }
1772 
1773  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1774  if (get_bits1(gb)) {
1775  if ((n_samples = get_bits(gb, 12)) > 480) {
1776  av_log(ctx, AV_LOG_ERROR,
1777  "Superframe encodes >480 samples (%d), not allowed\n",
1778  n_samples);
1779  return -1;
1780  }
1781  }
1782  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1783  if (s->has_residual_lsps) {
1784  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1785 
1786  for (n = 0; n < s->lsps; n++)
1787  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1788 
1789  if (s->lsps == 10) {
1790  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1791  } else /* s->lsps == 16 */
1792  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1793 
/* The dequantizer fills lsps[2] directly; the values for frames 0 and 1
 * are derived from its a1/a2 outputs. All three get the mean added. */
1794  for (n = 0; n < s->lsps; n++) {
1795  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1796  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1797  lsps[2][n] += mean_lsf[n];
1798  }
1799  for (n = 0; n < 3; n++)
1800  stabilize_lsps(lsps[n], s->lsps);
1801  }
1802 
1803  /* get output buffer */
/* The buffer is requested for the full 480 samples and nb_samples is then
 * lowered to the count signalled in the bitstream. */
1804  s->frame.nb_samples = 480;
1805  if ((res = ff_get_buffer(ctx, &s->frame)) < 0) {
1806  av_log(ctx, AV_LOG_ERROR, "get_buffer() failed\n");
1807  return res;
1808  }
1809  s->frame.nb_samples = n_samples;
1810  samples = (float *)s->frame.data[0];
1811 
1812  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1813  for (n = 0; n < 3; n++) {
1814  if (!s->has_residual_lsps) {
1815  int m;
1816 
1817  if (s->lsps == 10) {
1818  dequant_lsp10i(gb, lsps[n]);
1819  } else /* s->lsps == 16 */
1820  dequant_lsp16i(gb, lsps[n]);
1821 
1822  for (m = 0; m < s->lsps; m++)
1823  lsps[n][m] += mean_lsf[m];
1824  stabilize_lsps(lsps[n], s->lsps);
1825  }
1826 
/* Frame n writes behind the history part of each working buffer; its
 * "previous" LSFs are those of frame n - 1, or s->prev_lsps for n == 0. */
1827  if ((res = synth_frame(ctx, gb, n,
1828  &samples[n * MAX_FRAMESIZE],
1829  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1830  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1831  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1832  *got_frame_ptr = 0;
1833  return res;
1834  }
1835  }
1836 
1837  /* Statistics? FIXME - we don't check for length, a slight overrun
1838  * will be caught by internal buffer padding, and anything else
1839  * will be skipped, not read. */
1840  if (get_bits1(gb)) {
1841  res = get_bits(gb, 4);
1842  skip_bits(gb, 10 * (res + 1));
1843  }
1844 
1845  *got_frame_ptr = 1;
1846 
1847  /* Update history */
1848  memcpy(s->prev_lsps, lsps[2],
1849  s->lsps * sizeof(*s->prev_lsps));
1850  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1851  s->lsps * sizeof(*synth));
1852  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1853  s->history_nsamples * sizeof(*excitation));
1854  if (s->do_apf)
1855  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1856  s->history_nsamples * sizeof(*s->zero_exc_pf));
1857 
1858  return 0;
1859 }
1860 
1869 {
/* NOTE(review): the signature line of this definition (listing line 1868)
 * is absent from this listing. The body uses `s`, and the packet decoder
 * calls parse_packet_header(s) - confirm against the real file.
 *
 * Parses the header at the start of a packet from s->gb: 4 bits are skipped
 * (sequence number), 1 bit is stored in s->has_residual_lsps, and 6-bit
 * superframe-count fields are read for as long as the field equals 0x3F.
 *
 * Returns 1 when the reader runs short of bits, 0 otherwise. */
1870  GetBitContext *gb = &s->gb;
1871  unsigned int res;
1872 
1873  if (get_bits_left(gb) < 11)
1874  return 1;
1875  skip_bits(gb, 4); // packet sequence number
1876  s->has_residual_lsps = get_bits1(gb);
1877  do {
1878  res = get_bits(gb, 6); // number of superframes per packet
1879  // (minus first one if there is spillover)
/* Requires room for the spillover field, plus another 6-bit count when this
 * one was the escape value 0x3F. */
1880  if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1881  return 1;
1882  } while (res == 0x3F);
/* NOTE(review): listing line 1883 is missing here; presumably it reads
 * s->spillover_nbits using the s->spillover_bitsize bits checked above -
 * restore from the real file. */
1884 
1885  return 0;
1886 }
1887 
1903 static void copy_bits(PutBitContext *pb,
1904  const uint8_t *data, int size,
1905  GetBitContext *gb, int nbits)
1906 {
1907  int rmn_bytes, rmn_bits;
1908 
1909  rmn_bits = rmn_bytes = get_bits_left(gb);
1910  if (rmn_bits < nbits)
1911  return;
1912  if (nbits > pb->size_in_bits - put_bits_count(pb))
1913  return;
1914  rmn_bits &= 7; rmn_bytes >>= 3;
1915  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1916  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1917  avpriv_copy_bits(pb, data + size - rmn_bytes,
1918  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1919 }
1920 
1933  int *got_frame_ptr, AVPacket *avpkt)
1934 {
/* NOTE(review): the first line of this definition (listing line 1932:
 * return type, name and leading parameters) is absent from this listing.
 * The body uses `ctx` (codec context) and `data` (output AVFrame).
 *
 * Packet-level decode entry: decodes at most one superframe per call.
 *  - Only the last (size <= ctx->block_align) part of the packet is looked
 *    at; a part of exactly block_align bytes is treated as the start of a
 *    new packet and its header is parsed.
 *  - If the header signals spillover bits and a partial superframe is
 *    cached, the spillover is appended to the cache and decoded first.
 *  - Otherwise a superframe is decoded from the current position; if it is
 *    incomplete, the remaining bits are cached for the next packet.
 * Returns the number of bytes consumed, or a negative error code. Bits
 * consumed beyond a byte boundary are remembered in s->skip_bits_next.
 *
 * NOTE(review): the size loop below assumes ctx->block_align > 0
 * (presumably validated at init - confirm); with 0 it would not terminate. */
1935  WMAVoiceContext *s = ctx->priv_data;
1936  GetBitContext *gb = &s->gb;
1937  int size, res, pos;
1938 
1939  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1940  * header at each ctx->block_align bytes. However, Libav's ASF demuxer
1941  * feeds us ASF packets, which may concatenate multiple "codec" packets
1942  * in a single "muxer" packet, so we artificially emulate that by
1943  * capping the packet size at ctx->block_align. */
1944  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1945  if (!size) {
1946  *got_frame_ptr = 0;
1947  return 0;
1948  }
1949  init_get_bits(&s->gb, avpkt->data, size << 3);
1950 
1951  /* size == ctx->block_align is used to indicate whether we are dealing with
1952  * a new packet or a packet of which we already read the packet header
1953  * previously. */
1954  if (size == ctx->block_align) { // new packet header
/* NOTE(review): parse_packet_header() as visible in this file returns 0 or
 * 1, never a negative value, so this check does not catch its "not enough
 * bits" result - confirm whether that is intended. */
1955  if ((res = parse_packet_header(s)) < 0)
1956  return res;
1957 
1958  /* If the packet header specifies a s->spillover_nbits, then we want
1959  * to push out all data of the previous packet (+ spillover) before
1960  * continuing to parse new superframes in the current packet. */
1961  if (s->spillover_nbits > 0) {
1962  if (s->sframe_cache_size > 0) {
1963  int cnt = get_bits_count(gb);
1964  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1965  flush_put_bits(&s->pb);
/* NOTE(review): listing line 1966 is missing here; presumably it adds the
 * spillover bits to s->sframe_cache_size before the cached superframe is
 * decoded - restore from the real file. */
1967  if ((res = synth_superframe(ctx, got_frame_ptr)) == 0 &&
1968  *got_frame_ptr) {
1969  cnt += s->spillover_nbits;
1970  s->skip_bits_next = cnt & 7;
1971  *(AVFrame *)data = s->frame;
1972  return cnt >> 3;
1973  } else
1974  skip_bits_long (gb, s->spillover_nbits - cnt +
1975  get_bits_count(gb)); // resync
1976  } else
1977  skip_bits_long(gb, s->spillover_nbits); // resync
1978  }
1979  } else if (s->skip_bits_next)
1980  skip_bits(gb, s->skip_bits_next);
1981 
1982  /* Try parsing superframes in current packet */
1983  s->sframe_cache_size = 0;
1984  s->skip_bits_next = 0;
1985  pos = get_bits_left(gb);
1986  if ((res = synth_superframe(ctx, got_frame_ptr)) < 0) {
1987  return res;
1988  } else if (*got_frame_ptr) {
1989  int cnt = get_bits_count(gb);
1990  s->skip_bits_next = cnt & 7;
1991  *(AVFrame *)data = s->frame;
1992  return cnt >> 3;
1993  } else if ((s->sframe_cache_size = pos) > 0) {
1994  /* rewind bit reader to start of last (incomplete) superframe... */
1995  init_get_bits(gb, avpkt->data, size << 3);
1996  skip_bits_long(gb, (size << 3) - pos);
1997  assert(get_bits_left(gb) == pos);
1998 
1999  /* ...and cache it for spillover in next packet */
/* NOTE(review): listing line 2000 is missing here; presumably it
 * (re)initialises s->pb over the superframe cache before the copy below -
 * restore from the real file. */
2001  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
2002  // FIXME bad - just copy bytes as a whole and use the
2003  // skip_bits_next field
2004  }
2005 
2006  return size;
2007 }
2008 
2010 {
/* NOTE(review): the signature line of this definition (listing line 2009)
 * is absent from this listing; the body uses `ctx`.
 *
 * Teardown: when the postfilter is enabled (s->do_apf), releases the two
 * RDFT contexts and the two DCT contexts held in the decoder context.
 * Always returns 0. */
2011  WMAVoiceContext *s = ctx->priv_data;
2012 
2013  if (s->do_apf) {
2014  ff_rdft_end(&s->rdft);
2015  ff_rdft_end(&s->irdft);
2016  ff_dct_end(&s->dct);
2017  ff_dct_end(&s->dst);
2018  }
2019 
2020  return 0;
2021 }
2022 
2024 {
/* NOTE(review): the signature line of this definition (listing line 2023)
 * is absent from this listing; the body uses `ctx`, and the codec
 * descriptor below references wmavoice_flush as its .flush callback.
 *
 * Resets all inter-packet decoder state: the postfilter AGC value, the
 * superframe cache and bit-skip bookkeeping, the excitation / synthesis /
 * gain-prediction histories, and - when the postfilter is enabled - its
 * filter memories. s->prev_lsps is reset to values evenly spaced over
 * (0, pi): pi * (n + 1) / (s->lsps + 1). */
2025  WMAVoiceContext *s = ctx->priv_data;
2026  int n;
2027 
2028  s->postfilter_agc = 0;
2029  s->sframe_cache_size = 0;
2030  s->skip_bits_next = 0;
2031  for (n = 0; n < s->lsps; n++)
2032  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2033  memset(s->excitation_history, 0,
2034  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2035  memset(s->synth_history, 0,
2036  sizeof(*s->synth_history) * MAX_LSPS);
2037  memset(s->gain_pred_err, 0,
2038  sizeof(s->gain_pred_err));
2039 
2040  if (s->do_apf) {
/* Only the last s->lsps entries of the first MAX_LSPS_ALIGN16 slots of
 * synth_filter_out_buf are cleared. */
2041  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2042  sizeof(*s->synth_filter_out_buf) * s->lsps);
2043  memset(s->dcf_mem, 0,
2044  sizeof(*s->dcf_mem) * 2);
2045  memset(s->zero_exc_pf, 0,
2046  sizeof(*s->zero_exc_pf) * s->history_nsamples);
2047  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2048  }
2049 }
2050 
/* NOTE(review): the opening line of this initializer (listing line 2051) and
 * listing lines 2056-2058 are absent from this listing; presumably they
 * hold the declaration and the init/close/decode callbacks - restore from
 * the real file.
 * Codec descriptor for the "wmavoice" audio decoder: private context is a
 * WMAVoiceContext, and wmavoice_flush is registered as the flush callback. */
2052  .name = "wmavoice",
2053  .type = AVMEDIA_TYPE_AUDIO,
2054  .id = AV_CODEC_ID_WMAVOICE,
2055  .priv_data_size = sizeof(WMAVoiceContext),
2059  .capabilities = CODEC_CAP_SUBFRAMES | CODEC_CAP_DR1,
2060  .flush = wmavoice_flush,
2061  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2062 };