Edinburgh Speech Tools 2.4-release
EST_sigpr_frame.h
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1995,1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33
34#ifndef __EST_SIGPR_FRAME_H__
35#define __EST_SIGPR_FRAME_H__
36
37#include "EST_FMatrix.h"
38
39
40
41/**@name Linear Prediction functions
42Including, generation of coefficients from the signal, reflection
43coefficients, line spectral frequencies, areas.
44*/
45//@{
46
47/** Produce the full set of linear prediction coefficients from a
48 frame of speech waveform.
49
50@param sig: the frame of input waveform (input)
51@param acf: the autocorrelation coefficients (output)
52@param ref: the reflection coefficients (output)
53@param lpc: the LPC coefficients (output)
54
55The order of the lpc analysis is given as the size of the
56<parameter>lpc</parameter> vector - 1. The coefficients are placed in
57locations 1 to order, and the energy is placed in location 0.
58*/
59
60void sig2lpc(const EST_FVector &sig, EST_FVector &acf,
61 EST_FVector &ref, EST_FVector &lpc);
62
63
64/** Calculate cepstral coefficients from lpc coefficients.
65
66It is possible to calculate a set of cepstral coefficients from
67lpc coefficients using the relationship:
68
69\[c_{k}= a_{k} + \frac{1}{k}\sum_{i=1}^{k-1} i c_{i} a_{k-i}\]
70
71The order of the cepstral analysis can be different from the lpc
72order. If the cepstral order is greater, interpolation is used (TODO:
73add equation). Both orders are taken from the lengths of the
74respective vectors. Note that these cepstral coefficients take on the
75assumptions (and errors) of the lpc model and hence will not be the
76same as cepstral coefficients calculated using DFT functions.
77
78@param lpc: the LPC coefficients (input)
79@param cep: the cepstral coefficients (output)
80*/
81
82void lpc2cep(const EST_FVector &lpc, EST_FVector &cep);
83
84
85
86/** Produce a set of linear prediction coefficients from a
87 frame of speech waveform. {\tt sig} is the frame of input waveform,
88 and {\tt lpc} are the LPC coefficients. The
89 {\bf order} of the lpc analysis is given as the size of the {\tt lpc}
90 vector - 1. The coefficients are placed in locations 1 to order, and
91 the energy is placed in location 0.
92*/
93void sig2lpc(const EST_FVector &sig, EST_FVector &lpc);
94
95/** Produce a set of reflection coefficients from a
96 frame of speech waveform. {\tt sig} is the frame of input waveform,
97 and {\tt ref} are the reflection coefficients. The
98 {\bf order} of the analysis is given as the size of the {\tt ref}
99 vector - 1. The coefficients are placed in locations 1 to order, and
100 the energy is placed in location 0.
 NOTE(review): the order and layout description matches the one given for
 sig2lpc; confirm that it also applies to {\tt ref}.
101*/
102void sig2ref(const EST_FVector &sig, EST_FVector &ref);
103
104
105/**@name Area Functions
106Using the analogy of the lossless tube, the
107cross-sectional areas of the sections of this tube are related to the reflection coefficients and can be calculated from the following relationship:
108
109\[\frac{A_{i+1}}{A_{i}} = \frac{1 - k_{i}}{1 + k_{i}} \]
110
111*/
112//@{
113/** Calculate the areas according to the formula given for this group.

@param ref: the reflection coefficients (input)
@param area: the cross-sectional areas (output)
*/
114void ref2truearea(const EST_FVector &ref, EST_FVector &area);
115
116/** An approximation of the area is calculated by skipping the denominator
117in the formula given for this group.

@param ref: the reflection coefficients (input)
@param area: the approximate cross-sectional areas (output)
*/
118void ref2area(const EST_FVector &ref, EST_FVector &area);
119
120/** Calculate the logs of the areas.

@param ref: the reflection coefficients (input)
@param logarea: the logs of the areas (output)
*/
121void ref2logarea(const EST_FVector &ref, EST_FVector &logarea);
122//@}
123
124/** Calculate the reflection coefficients from the lpc
125coefficients. Note that in the standard linear prediction analysis,
126the reflection coefficients are generated as a by-product.

@param lpc: the LPC coefficients (input)
@param ref: the reflection coefficients (output)

127@see sig2lpc */
128
129void lpc2ref(const EST_FVector &lpc, EST_FVector &ref);
130
131/** Calculate the linear prediction coefficients from the reflection
132coefficients.
133NOTE(review): the equation given here,
134\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
135is the frame power formula documented for sig2pow and does not describe
this conversion; it looks like a placeholder. TODO: replace it with the
recursion actually used.

@param ref: the reflection coefficients (input)
@param lpc: the LPC coefficients (output)

136@see lpc2ref*/
137
138void ref2lpc(const EST_FVector &ref, EST_FVector &lpc);
139
140/** Calculate line spectral frequencies from linear prediction coefficients.
141NOTE(review): the equation given here,
142\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
143is the frame power formula documented for sig2pow and does not describe
this conversion; it looks like a placeholder. TODO: replace it with the
method actually used.

@param lpc: the LPC coefficients (input)
@param lsf: the line spectral frequencies (output)

144@see lsf2lpc
145*/
146
147void lpc2lsf(const EST_FVector &lpc, EST_FVector &lsf);
148
149/** Calculate linear prediction coefficients from line spectral frequencies.
150NOTE(review): the equation given here,
151\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
152is the frame power formula documented for sig2pow and does not describe
this conversion; it looks like a placeholder. TODO: replace it with the
method actually used.

@param lsf: the line spectral frequencies (input)
@param lpc: the LPC coefficients (output)

153@see lpc2lsf
154*/
155
156void lsf2lpc(const EST_FVector &lsf, EST_FVector &lpc);
157//@}
158
/** Convert a frame from one representation to another.
NOTE(review): inferred from the name and signature only. The type names
accepted in in_type and out_type, and which conversions are supported, are
defined by the implementation and are not visible in this header; confirm
before relying on a particular pair.

@param in_frame: the frame to convert (input)
@param in_type: name of the representation held in in_frame
@param out_frame: the converted frame (output)
@param out_type: name of the representation wanted in out_frame
*/
159void frame_convert(const EST_FVector &in_frame, const EST_String &in_type,
160 EST_FVector &out_frame, const EST_String &out_type);
161
162
163
164// end of lpc functions
165
166/**@name Energy and power frame functions
167*/
168
169//@{
170
171/** Calculate the power for a frame of speech. This is defined as
172\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
where $a_{i}$ is taken to be the i-th of the $n$ samples in the frame.

@param frame: the frame of speech. NOTE(review): passed by non-const
reference although it is described only as input; confirm whether the
implementation modifies it.
@param power: receives the calculated power (output)
173*/
174
175
176void sig2pow(EST_FVector &frame, float &power);
177
178/** Calculate the root mean square energy for a frame of speech. This
179is defined as \[energy=\sqrt{\frac{1}{n}\sum_{i=1}^{n}a_{i}^2}\]
where $a_{i}$ is taken to be the i-th of the $n$ samples in the frame.

@param frame: the frame of speech. NOTE(review): passed by non-const
reference although it is described only as input; confirm whether the
implementation modifies it.
@param rms_energy: receives the calculated RMS energy (output)
*/
180
181void sig2rms(EST_FVector &frame, float &rms_energy);
182
183//@}
184// end of power and energy
185
186/**@name Frame based filter bank and cepstral analysis
187
188These functions are \Ref{Frame based signal processing functions}.
189*/
190
191//@{
192
193/** Calculate the (log) energy (or power) in each channel of a Mel
194scale filter bank for a frame of speech. The filters are triangular, are
195evenly spaced and are all of equal width, on a Mel scale. The upper and lower
196cutoffs of each filter are at the centre frequencies of the adjacent filters.
197The Mel scale is described under {\tt Hz2Mel}.
198

@param sig: the frame of input waveform (input)
@param fbank_frame: the filter bank channel values (output). Presumably
the number of channels is taken from the length of this vector -- TODO confirm.
@param sample_rate: the sampling rate of {\tt sig} (presumably in Hz)
@param use_power_rather_than_energy: select power (squared FFT magnitude)
instead of energy (FFT magnitude)
@param take_log: select log channel values rather than linear ones

199@see Hz2Mel
200@see sig2fft
201@see fft2fbank
202*/
203
204void sig2fbank(const EST_FVector &sig,
205 EST_FVector &fbank_frame,
206 const float sample_rate,
207 const bool use_power_rather_than_energy,
208 const bool take_log);
209
210/** Calculate the energy (or power) spectrum of a frame of speech. The FFT
211order is determined by the number of samples in the frame of speech, and is
212a power of 2. Note that the FFT vector returned corresponds to frequencies
213from 0 to half the sample rate. Energy is the magnitude of the FFT; power is
214the squared magnitude.
215

@param sig: the frame of input waveform (input)
@param fft_vec: the energy or power spectrum (output)
@param use_power_rather_than_energy: select power (squared magnitude)
instead of energy (magnitude)

216@see fft2fbank
217@see sig2fbank
218*/
219
220void sig2fft(const EST_FVector &sig,
221 EST_FVector &fft_vec,
222 const bool use_power_rather_than_energy);
223
224/** Given a Mel filter bank description, bin the FFT coefficients
225to compute the output of the filters. The first and last elements of
226{\tt mel_fbank_frequencies} define the lower and upper bound of
227the first and last filters respectively and the intervening elements
228give the filter centre frequencies. That is, {\tt mel_fbank_frequencies} has
229two more elements than {\tt fbank_vec}.
230

@param fft_frame: the FFT coefficients to be binned (input)
@param fbank_vec: the filter outputs, one per filter (output)
@param Hz_per_fft_coeff: the frequency resolution of {\tt fft_frame},
in Hz per FFT coefficient
@param mel_fbank_frequencies: the filter bank description explained above
(presumably expressed in Mel, going by the name -- TODO confirm)

231@see fastFFT
232@see sig2fft
233@see sig2fbank
234@see fbank2melcep
235*/
236
237void fft2fbank(const EST_FVector &fft_frame,
238 EST_FVector &fbank_vec,
239 const float Hz_per_fft_coeff,
240 const EST_FVector &mel_fbank_frequencies);
241
242/** Compute the discrete cosine transform of log Mel-scale filter bank output
243to get the Mel cepstral coefficients for a frame of speech.
244Optional liftering (filtering in the cepstral domain) can be applied to
245normalise the magnitudes of the coefficients. This is useful because,
246typically, the higher order cepstral coefficients are significantly
247smaller than the lower ones and it is often desirable to normalise
248the means and variances across coefficients.
249
250The lifter (cepstral filter) used is:
251\[c_i' = \left\{ 1 + \frac{L}{2} \sin \frac{\pi i}{L} \right\} \; c_i\]
252
253A typical value of L used in speech recognition is 22. A value of L=0 is taken
254to mean no liftering. This is equivalent to L=1, because $\sin(\pi i) = 0$
for every integer $i$, which makes the lifter weight exactly 1.
255

@param fbank_vec: the log Mel-scale filter bank output (input)
@param mfcc: the Mel cepstral coefficients (output)
@param liftering_parameter: the value of L in the lifter formula
@param include_c0: defaults to false. Presumably selects whether the
zeroth cepstral coefficient is included in {\tt mfcc} -- TODO confirm.

256@see sig2fft
257@see fft2fbank
258@see sig2fbank
259*/
260
261void fbank2melcep(const EST_FVector &fbank_vec,
262 EST_FVector &mfcc,
263 const float liftering_parameter,
264 const bool include_c0 = false);
265
266/** Make a triangular Mel scale filter. The filter is centred at
267{\tt this_mel_centre} and
268extends from {\tt this_mel_low} to {\tt this_mel_high}. {\tt half_fft_order}
269is the length of a power/energy spectrum covering 0Hz to half the sampling
270frequency with a resolution of {\tt Hz_per_fft_coeff}.
271
272The routine returns a vector of weights to be applied to the energy/power
273spectrum starting at element {\tt fft_index_start}.
274The number of points (FFT coefficients) covered
275by the filter is given by the length of the returned vector {\tt filter}.
276

@param this_mel_centre: the centre of the filter (on the Mel scale,
going by the name)
@param this_mel_low: the lower edge of the filter
@param this_mel_high: the upper edge of the filter
@param Hz_per_fft_coeff: the resolution of the spectrum, in Hz per
FFT coefficient
@param half_fft_order: the length of the spectrum the filter applies to
@param fft_index_start: the spectrum element at which the weights
start (output)
@param filter: the filter weights (output)

277@see fft2fbank
278@see Hz2Mel
279@see Mel2Hz
280*/
281
282void make_mel_triangular_filter(const float this_mel_centre,
283 const float this_mel_low,
284 const float this_mel_high,
285 const float Hz_per_fft_coeff,
286 const int half_fft_order,
287 int &fft_index_start,
288 EST_FVector &filter);
289
290/**@name Frequency conversion functions
291
292These functions are used in \Ref{Frame based filter bank and cepstral analysis}.
293*/
294
295//@{
296
297/** Convert Hertz to Mel. The Mel scale is defined by
298\[f_{\mbox{Mel}} = 1127 \; \log( 1 + \frac{f_{\mbox{Hertz}}}{700} )\]
where $\log$ is presumably the natural logarithm (the constant 1127 is
the one normally paired with it) -- TODO confirm against the implementation.
299

@param frequency_in_Hertz: the frequency to convert, in Hertz
@return the corresponding frequency in Mel

300@see Mel2Hz
301@see Frequency conversion functions
302*/
303
304float Hz2Mel(float frequency_in_Hertz);
305
306/**
307Convert Mel to Hertz.
308
Assuming this is the exact inverse of the mapping documented for
{\tt Hz2Mel} (TODO confirm), the conversion is
\[f_{\mbox{Hertz}} = 700 \; ( e^{f_{\mbox{Mel}} / 1127} - 1 )\]

@param frequency_in_Mel: the frequency to convert, in Mel
@return the corresponding frequency in Hertz

309@see Hz2Mel
310*/
311
312float Mel2Hz(float frequency_in_Mel);
313
314//@}
315// end of frequency conversion functions
316
317//@}
318// end of filter bank and cepstral analysis
319
320
321
322
323#endif /* __EST_SIGPR_FRAME_H__ */