Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Voxware MetaSound decoder |
||
3 | * Copyright (c) 2013 Konstantin Shishkov |
||
4 | * based on TwinVQ decoder |
||
5 | * Copyright (c) 2009 Vitor Sessak |
||
6 | * |
||
7 | * This file is part of FFmpeg. |
||
8 | * |
||
9 | * FFmpeg is free software; you can redistribute it and/or |
||
10 | * modify it under the terms of the GNU Lesser General Public |
||
11 | * License as published by the Free Software Foundation; either |
||
12 | * version 2.1 of the License, or (at your option) any later version. |
||
13 | * |
||
14 | * FFmpeg is distributed in the hope that it will be useful, |
||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
17 | * Lesser General Public License for more details. |
||
18 | * |
||
19 | * You should have received a copy of the GNU Lesser General Public |
||
20 | * License along with FFmpeg; if not, write to the Free Software |
||
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
22 | */ |
||
23 | |||
24 | #include <math.h> |
||
25 | #include <stdint.h> |
||
26 | |||
27 | #define BITSTREAM_READER_LE |
||
28 | #include "libavutil/channel_layout.h" |
||
29 | #include "libavutil/float_dsp.h" |
||
30 | #include "avcodec.h" |
||
31 | #include "get_bits.h" |
||
32 | #include "fft.h" |
||
33 | #include "internal.h" |
||
34 | #include "lsp.h" |
||
35 | #include "sinewin.h" |
||
36 | |||
37 | #include "twinvq.h" |
||
38 | #include "metasound_data.h" |
||
39 | |||
/**
 * Accumulate a train of scaled peak-shape samples into a speech buffer.
 *
 * Samples are consumed from shape strictly in order. The first width / 2
 * samples form a peak centered around zero; every following peak is placed
 * around the rounded position of a multiple of the period.
 *
 * @param period   spacing between peak centers, in samples
 * @param width    number of shape samples consumed by one full peak
 * @param shape    peak shape samples (len entries)
 * @param ppc_gain factor every shape sample is multiplied by
 * @param speech   buffer the scaled samples are added to
 * @param len      number of samples available in shape
 */
static void add_peak(float period, int width, const float *shape,
                     float ppc_gain, float *speech, int len)
{
    const float *const shape_end = shape + len;
    const int n_peaks = ROUNDED_DIV(len, width);
    const int first   = -width / 2;
    const int last    = (width + 1) / 2;
    int peak, off, center;

    /* Only the right half of the peak centered around zero is written. */
    for (off = 0; off < width / 2; off++)
        speech[off] += ppc_gain * *shape++;

    /* Full peaks around each rounded multiple of the period. */
    peak = 1;
    while (peak < n_peaks) {
        center = (int)(peak * period + 0.5);
        for (off = first; off < last; off++)
            speech[center + off] += ppc_gain * *shape++;
        peak++;
    }

    /* Final peak: additionally stop once the shape samples are used up. */
    center = (int)(peak * period + 0.5);
    for (off = first; off < last && shape < shape_end; off++)
        speech[center + off] += ppc_gain * *shape++;
}
||
61 | |||
62 | static void decode_ppc(TwinVQContext *tctx, int period_coef, int g_coef, |
||
63 | const float *shape, float *speech) |
||
64 | { |
||
65 | const TwinVQModeTab *mtab = tctx->mtab; |
||
66 | int isampf = tctx->avctx->sample_rate / 1000; |
||
67 | int ibps = tctx->avctx->bit_rate / (1000 * tctx->avctx->channels); |
||
68 | int width; |
||
69 | |||
70 | float ratio = (float)mtab->size / isampf; |
||
71 | float min_period, max_period, period_range, period; |
||
72 | float some_mult; |
||
73 | |||
74 | float pgain_base, pgain_step, ppc_gain; |
||
75 | |||
76 | if (tctx->avctx->channels == 1) { |
||
77 | min_period = log2(ratio * 0.2); |
||
78 | max_period = min_period + log2(6); |
||
79 | } else { |
||
80 | min_period = (int)(ratio * 0.2 * 400 + 0.5) / 400.0; |
||
81 | max_period = (int)(ratio * 0.2 * 400 * 6 + 0.5) / 400.0; |
||
82 | } |
||
83 | period_range = max_period - min_period; |
||
84 | period = min_period + period_coef * period_range / |
||
85 | ((1 << mtab->ppc_period_bit) - 1); |
||
86 | if (tctx->avctx->channels == 1) |
||
87 | period = powf(2.0, period); |
||
88 | else |
||
89 | period = (int)(period * 400 + 0.5) / 400.0; |
||
90 | |||
91 | switch (isampf) { |
||
92 | case 8: some_mult = 2.0; break; |
||
93 | case 11: some_mult = 3.0; break; |
||
94 | case 16: some_mult = 3.0; break; |
||
95 | case 22: some_mult = ibps == 32 ? 2.0 : 4.0; break; |
||
96 | case 44: some_mult = 8.0; break; |
||
97 | default: some_mult = 4.0; |
||
98 | } |
||
99 | |||
100 | width = (int)(some_mult / (mtab->size / period) * mtab->ppc_shape_len); |
||
101 | if (isampf == 22 && ibps == 32) |
||
102 | width = (int)((2.0 / period + 1) * width + 0.5); |
||
103 | |||
104 | pgain_base = tctx->avctx->channels == 2 ? 25000.0 : 20000.0; |
||
105 | pgain_step = pgain_base / ((1 << mtab->pgain_bit) - 1); |
||
106 | ppc_gain = 1.0 / 8192 * |
||
107 | twinvq_mulawinv(pgain_step * g_coef + pgain_step / 2, |
||
108 | pgain_base, TWINVQ_PGAIN_MU); |
||
109 | |||
110 | add_peak(period, width, shape, ppc_gain, speech, mtab->ppc_shape_len); |
||
111 | } |
||
112 | |||
113 | static void dec_bark_env(TwinVQContext *tctx, const uint8_t *in, int use_hist, |
||
114 | int ch, float *out, float gain, |
||
115 | enum TwinVQFrameType ftype) |
||
116 | { |
||
117 | const TwinVQModeTab *mtab = tctx->mtab; |
||
118 | int i, j; |
||
119 | float *hist = tctx->bark_hist[ftype][ch]; |
||
120 | float val = ((const float []) { 0.4, 0.35, 0.28 })[ftype]; |
||
121 | int bark_n_coef = mtab->fmode[ftype].bark_n_coef; |
||
122 | int fw_cb_len = mtab->fmode[ftype].bark_env_size / bark_n_coef; |
||
123 | int idx = 0; |
||
124 | |||
125 | if (tctx->avctx->channels == 1) |
||
126 | val = 0.5; |
||
127 | for (i = 0; i < fw_cb_len; i++) |
||
128 | for (j = 0; j < bark_n_coef; j++, idx++) { |
||
129 | float tmp2 = mtab->fmode[ftype].bark_cb[fw_cb_len * in[j] + i] * |
||
130 | (1.0 / 2048); |
||
131 | float st; |
||
132 | |||
133 | if (tctx->avctx->channels == 1) |
||
134 | st = use_hist ? |
||
135 | tmp2 + val * hist[idx] + 1.0 : tmp2 + 1.0; |
||
136 | else |
||
137 | st = use_hist ? (1.0 - val) * tmp2 + val * hist[idx] + 1.0 |
||
138 | : tmp2 + 1.0; |
||
139 | |||
140 | hist[idx] = tmp2; |
||
141 | if (st < 0.1) |
||
142 | st = 0.1; |
||
143 | |||
144 | twinvq_memset_float(out, st * gain, |
||
145 | mtab->fmode[ftype].bark_tab[idx]); |
||
146 | out += mtab->fmode[ftype].bark_tab[idx]; |
||
147 | } |
||
148 | } |
||
149 | |||
150 | static void read_cb_data(TwinVQContext *tctx, GetBitContext *gb, |
||
151 | uint8_t *dst, enum TwinVQFrameType ftype) |
||
152 | { |
||
153 | int i; |
||
154 | |||
155 | for (i = 0; i < tctx->n_div[ftype]; i++) { |
||
156 | int bs_second_part = (i >= tctx->bits_main_spec_change[ftype]); |
||
157 | |||
158 | *dst++ = get_bits(gb, tctx->bits_main_spec[0][ftype][bs_second_part]); |
||
159 | *dst++ = get_bits(gb, tctx->bits_main_spec[1][ftype][bs_second_part]); |
||
160 | } |
||
161 | } |
||
162 | |||
163 | static int metasound_read_bitstream(AVCodecContext *avctx, TwinVQContext *tctx, |
||
164 | const uint8_t *buf, int buf_size) |
||
165 | { |
||
166 | TwinVQFrameData *bits = &tctx->bits; |
||
167 | const TwinVQModeTab *mtab = tctx->mtab; |
||
168 | int channels = tctx->avctx->channels; |
||
169 | int sub; |
||
170 | GetBitContext gb; |
||
171 | int i, j, k; |
||
172 | |||
173 | init_get_bits(&gb, buf, buf_size * 8); |
||
174 | |||
175 | bits->window_type = get_bits(&gb, TWINVQ_WINDOW_TYPE_BITS); |
||
176 | |||
177 | if (bits->window_type > 8) { |
||
178 | av_log(avctx, AV_LOG_ERROR, "Invalid window type, broken sample?\n"); |
||
179 | return AVERROR_INVALIDDATA; |
||
180 | } |
||
181 | |||
182 | bits->ftype = ff_twinvq_wtype_to_ftype_table[tctx->bits.window_type]; |
||
183 | |||
184 | sub = mtab->fmode[bits->ftype].sub; |
||
185 | |||
186 | if (bits->ftype != TWINVQ_FT_SHORT) |
||
187 | get_bits(&gb, 2); |
||
188 | |||
189 | read_cb_data(tctx, &gb, bits->main_coeffs, bits->ftype); |
||
190 | |||
191 | for (i = 0; i < channels; i++) |
||
192 | for (j = 0; j < sub; j++) |
||
193 | for (k = 0; k < mtab->fmode[bits->ftype].bark_n_coef; k++) |
||
194 | bits->bark1[i][j][k] = |
||
195 | get_bits(&gb, mtab->fmode[bits->ftype].bark_n_bit); |
||
196 | |||
197 | for (i = 0; i < channels; i++) |
||
198 | for (j = 0; j < sub; j++) |
||
199 | bits->bark_use_hist[i][j] = get_bits1(&gb); |
||
200 | |||
201 | if (bits->ftype == TWINVQ_FT_LONG) { |
||
202 | for (i = 0; i < channels; i++) |
||
203 | bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS); |
||
204 | } else { |
||
205 | for (i = 0; i < channels; i++) { |
||
206 | bits->gain_bits[i] = get_bits(&gb, TWINVQ_GAIN_BITS); |
||
207 | for (j = 0; j < sub; j++) |
||
208 | bits->sub_gain_bits[i * sub + j] = |
||
209 | get_bits(&gb, TWINVQ_SUB_GAIN_BITS); |
||
210 | } |
||
211 | } |
||
212 | |||
213 | for (i = 0; i < channels; i++) { |
||
214 | bits->lpc_hist_idx[i] = get_bits(&gb, mtab->lsp_bit0); |
||
215 | bits->lpc_idx1[i] = get_bits(&gb, mtab->lsp_bit1); |
||
216 | |||
217 | for (j = 0; j < mtab->lsp_split; j++) |
||
218 | bits->lpc_idx2[i][j] = get_bits(&gb, mtab->lsp_bit2); |
||
219 | } |
||
220 | |||
221 | if (bits->ftype == TWINVQ_FT_LONG) { |
||
222 | read_cb_data(tctx, &gb, bits->ppc_coeffs, 3); |
||
223 | for (i = 0; i < channels; i++) { |
||
224 | bits->p_coef[i] = get_bits(&gb, mtab->ppc_period_bit); |
||
225 | bits->g_coef[i] = get_bits(&gb, mtab->pgain_bit); |
||
226 | } |
||
227 | } |
||
228 | |||
229 | return (get_bits_count(&gb) + 7) / 8; |
||
230 | } |
||
231 | |||
/**
 * Stream parameters associated with one MetaSound codec tag.
 *
 * The field order is relied upon by positional initializers.
 */
typedef struct MetasoundProps {
    uint32_t tag;         /* codec tag identifying the stream flavour */
    int      bit_rate;    /* multiplied by 1000 to get the codec bit rate */
    int      channels;    /* number of audio channels */
    int      sample_rate; /* sample rate in Hz */
} MetasoundProps;
||
238 | |||
239 | static const MetasoundProps codec_props[] = { |
||
240 | { MKTAG('V','X','0','3'), 6, 1, 8000 }, |
||
241 | { MKTAG('V','X','0','4'), 12, 2, 8000 }, |
||
242 | |||
243 | { MKTAG('V','O','X','i'), 8, 1, 8000 }, |
||
244 | { MKTAG('V','O','X','j'), 10, 1, 11025 }, |
||
245 | { MKTAG('V','O','X','k'), 16, 1, 16000 }, |
||
246 | { MKTAG('V','O','X','L'), 24, 1, 22050 }, |
||
247 | { MKTAG('V','O','X','q'), 32, 1, 44100 }, |
||
248 | { MKTAG('V','O','X','r'), 40, 1, 44100 }, |
||
249 | { MKTAG('V','O','X','s'), 48, 1, 44100 }, |
||
250 | { MKTAG('V','O','X','t'), 16, 2, 8000 }, |
||
251 | { MKTAG('V','O','X','u'), 20, 2, 11025 }, |
||
252 | { MKTAG('V','O','X','v'), 32, 2, 16000 }, |
||
253 | { MKTAG('V','O','X','w'), 48, 2, 22050 }, |
||
254 | { MKTAG('V','O','X','x'), 64, 2, 44100 }, |
||
255 | { MKTAG('V','O','X','y'), 80, 2, 44100 }, |
||
256 | { MKTAG('V','O','X','z'), 96, 2, 44100 }, |
||
257 | |||
258 | { 0, 0, 0, 0 } |
||
259 | }; |
||
260 | |||
261 | static av_cold int metasound_decode_init(AVCodecContext *avctx) |
||
262 | { |
||
263 | int isampf, ibps; |
||
264 | TwinVQContext *tctx = avctx->priv_data; |
||
265 | uint32_t tag; |
||
266 | const MetasoundProps *props = codec_props; |
||
267 | |||
268 | if (!avctx->extradata || avctx->extradata_size < 16) { |
||
269 | av_log(avctx, AV_LOG_ERROR, "Missing or incomplete extradata\n"); |
||
270 | return AVERROR_INVALIDDATA; |
||
271 | } |
||
272 | |||
273 | tag = AV_RL32(avctx->extradata + 12); |
||
274 | |||
275 | for (;;) { |
||
276 | if (!props->tag) { |
||
277 | av_log(avctx, AV_LOG_ERROR, "Could not find tag %08X\n", tag); |
||
278 | return AVERROR_INVALIDDATA; |
||
279 | } |
||
280 | if (props->tag == tag) { |
||
281 | avctx->sample_rate = props->sample_rate; |
||
282 | avctx->channels = props->channels; |
||
283 | avctx->bit_rate = props->bit_rate * 1000; |
||
284 | isampf = avctx->sample_rate / 1000; |
||
285 | break; |
||
286 | } |
||
287 | props++; |
||
288 | } |
||
289 | |||
290 | if (avctx->channels <= 0 || avctx->channels > TWINVQ_CHANNELS_MAX) { |
||
291 | av_log(avctx, AV_LOG_ERROR, "Unsupported number of channels: %i\n", |
||
292 | avctx->channels); |
||
293 | return AVERROR_INVALIDDATA; |
||
294 | } |
||
295 | avctx->channel_layout = avctx->channels == 1 ? AV_CH_LAYOUT_MONO |
||
296 | : AV_CH_LAYOUT_STEREO; |
||
297 | |||
298 | ibps = avctx->bit_rate / (1000 * avctx->channels); |
||
299 | |||
300 | switch ((avctx->channels << 16) + (isampf << 8) + ibps) { |
||
301 | case (1 << 16) + ( 8 << 8) + 8: |
||
302 | tctx->mtab = &ff_metasound_mode0808; |
||
303 | break; |
||
304 | case (1 << 16) + (16 << 8) + 16: |
||
305 | tctx->mtab = &ff_metasound_mode1616; |
||
306 | break; |
||
307 | case (1 << 16) + (44 << 8) + 32: |
||
308 | tctx->mtab = &ff_metasound_mode4432; |
||
309 | break; |
||
310 | case (2 << 16) + (44 << 8) + 48: |
||
311 | tctx->mtab = &ff_metasound_mode4448s; |
||
312 | break; |
||
313 | default: |
||
314 | av_log(avctx, AV_LOG_ERROR, |
||
315 | "This version does not support %d kHz - %d kbit/s/ch mode.\n", |
||
316 | isampf, isampf); |
||
317 | return AVERROR(ENOSYS); |
||
318 | } |
||
319 | |||
320 | avctx->block_align = (avctx->bit_rate * tctx->mtab->size |
||
321 | / avctx->sample_rate + 7) / 8; |
||
322 | |||
323 | tctx->codec = TWINVQ_CODEC_METASOUND; |
||
324 | tctx->read_bitstream = metasound_read_bitstream; |
||
325 | tctx->dec_bark_env = dec_bark_env; |
||
326 | tctx->decode_ppc = decode_ppc; |
||
327 | |||
328 | return ff_twinvq_decode_init(avctx); |
||
329 | } |
||
330 | |||
331 | AVCodec ff_metasound_decoder = { |
||
332 | .name = "metasound", |
||
333 | .long_name = NULL_IF_CONFIG_SMALL("Voxware MetaSound"), |
||
334 | .type = AVMEDIA_TYPE_AUDIO, |
||
335 | .id = AV_CODEC_ID_METASOUND, |
||
336 | .priv_data_size = sizeof(TwinVQContext), |
||
337 | .init = metasound_decode_init, |
||
338 | .close = ff_twinvq_decode_close, |
||
339 | .decode = ff_twinvq_decode_frame, |
||
340 | .capabilities = CODEC_CAP_DR1, |
||
341 | .sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP, |
||
342 | AV_SAMPLE_FMT_NONE }, |
||
343 | };><>><>><>><>><>><>><>><>><>><>=>>>>>>>>>>>>>>>>>><>><>>>>>> |