Edinburgh Speech Tools 2.4-release
EST_sigpr_utt.h
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1995,1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33
34#ifndef __EST_SIGPR_UTT_H__
35#define __EST_SIGPR_UTT_H__
36
37#include "sigpr/EST_sigpr_frame.h"
38#include "sigpr/EST_Window.h"
39#include "EST_Track.h"
40#include "EST_Wave.h"
41
42#define DEFAULT_WINDOW_NAME "hamming"
43#define DEFAULT_FRAME_FACTOR 2.0
44
45/* Note: some of these functions deliberately don't have
46 doc++ style comments, mainly because they are, or will be
47 superseded soon.
48*/
49
50/**@name Functions for use with frame based processing
51
52In the following functions, the input is a \Ref{EST_Wave} waveform,
53and the output is a (usually multi-channel) \Ref{EST_Track}. The
54track must be set up appropriately beforehand. This means the track
55must be resized accordingly with the correct numbers of frames and
56channels.
57
58The positions of the frames are found by examination of the {\bf time}
59array in the EST_Track, which must be filled prior to the function
60call. The usual requirement is for fixed frame analysis, where each
61analysis frame is, say, 10ms after the previous one.
62
63A common alternative is to perform pitch-synchronous
64analysis where the time shift is related to the local pitch period.
65
66*/
67
68//@{
69
70/** Produce a single set of coefficients from a waveform. The type of
71 coefficient required is given in the argument <parameter>type</parameter>.
72 Possible types are:
73
74<variablelist>
75
76<varlistentry><term>lpc</term><listitem>linear predictive coding</listitem></varlistentry>
77
78<varlistentry><term>cep</term><listitem>cepstrum coding from lpc coefficients</listitem></varlistentry>
79
80<varlistentry><term>melcep</term><listitem>Mel scale cepstrum coding via fbank</listitem></varlistentry>
81
82<varlistentry><term>fbank</term><listitem>Mel scale log filterbank analysis</listitem></varlistentry>
83
84<varlistentry><term>lsf</term><listitem>line spectral frequencies</listitem></varlistentry>
85
86<varlistentry><term>ref</term><listitem>Linear prediction reflection coefficients</listitem></varlistentry>
87
88<varlistentry><term>power</term><listitem></listitem></varlistentry>
89
90<varlistentry><term>f0</term><listitem>srpd algorithm</listitem></varlistentry>
91
92<varlistentry><term>energy</term><listitem>root mean square energy</listitem></varlistentry>
93
94</variablelist>
95
96The order of the analysis is calculated from the number of
97channels in <parameter>a</parameter>. The positions of the analysis
98windows must be given by filling in the track's time array.
99
100This function windows the waveform at the intervals given by the track
101time array. The length of each window is <parameter>factor</parameter>
102* the local time shift. The windowing function is given by
103<parameter>wf</parameter>.
104
105@param sig: input waveform
106@param a: output coefficients. These have been pre-allocated and the
107 number of channels in a indicates the order of the analysis.
108@param type: the types of coefficients to be produced. "lpc", "cep" etc
109@param factor: the frame length factor, i.e. the analysis frame length
110 will be this times the local pitch period.
111
112@param wf: function for windowing. See \Ref{Windowing mechanisms}
113*/
114
115void sig2coef(EST_Wave &sig, EST_Track &a, EST_String type,
116 float factor = 2.0,
117 EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME));
118
119/** Produce multiple coefficients from a waveform by repeated calls to
120 sig2coef.
121
122@param sig: input waveform
123@param fv: output coefficients. These have been pre-allocated and the
124 number of channels in fv indicates the order of the analysis.
125@param op: Features structure containing options for analysis order,
126 frame shift etc.
127@param slist: list of types of coefficients required, from the set of
128possible types that sig2coef can take.
129*/
130
131void sigpr_base(EST_Wave &sig, EST_Track &fv, EST_Features &op,
132 const EST_StrList &slist);
133
134/** Calculate the power for each frame of the waveform.
135
136@param sig: input waveform
137@param a: output power track. Frame positions are taken from its time array, which must be filled in before the call.
138@param factor: the frame length factor, i.e. the analysis frame length
139 will be this times the local pitch period.
140*/
141
142void power(EST_Wave &sig, EST_Track &a, float factor);
143
144/** Calculate the rms energy for each frame of the waveform.
145
146This function calls
147\Ref{sig2energy}
148
149
150@param sig: input waveform
151@param a: output coefficients
152@param factor: the frame length factor, i.e. the analysis frame length
153 will be this times the local pitch period. No default is declared, so it must always be supplied.
154
155*/
156
157void energy(EST_Wave &sig, EST_Track &a, float factor);
158
159
160/** Mel scale filter bank analysis. The Mel scale triangular filters
161are computed via an FFT (see \Ref{fastFFT}). This routine is required
162for Mel cepstral analysis (see \Ref{melcep}). The analysis of each
163frame is done by \Ref{sig2fbank}.
164
165A typical filter bank analysis for speech recognition might use log
166energy outputs from 20 filters.
167
168@param sig: input waveform
169@param fbank: the output. The number of filters is determined from the
170 size of this track (presumably its number of channels -- TODO confirm).
171@param factor: the frame length factor, i.e. the analysis frame length
172 will be this times the local pitch period
173@param wf: function for windowing. See \Ref{Windowing mechanisms}
174@param up: whether the filterbank analysis should use
175 power rather than energy.
176@param take_log: whether to take logs of the filter outputs
177
178@see sig2fbank
179@see melcep
180*/
181
182void fbank(EST_Wave &sig,
183 EST_Track &fbank,
184 const float factor,
185 EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
186 const bool up = false,
187 const bool take_log = true);
188
189/** Mel scale cepstral analysis via filter bank analysis. Cepstral
190parameters are computed for each frame of speech. The analysis
191requires \Ref{fbank}. The cepstral analysis of the filterbank outputs
192is performed by \Ref{fbank2melcep}.
193
194A typical Mel cepstral coefficient (MFCC) analysis for speech recognition
195might use 12 cepstral coefficients computed from a 20 channel filterbank.
196
197
198@param sig: input waveform
199@param mfcc_track: the output
200@param factor: the frame length factor, i.e. the analysis frame length
201 will be this times the local pitch period
202@param fbank_order: the number of Mel scale filters used for the analysis
203@param liftering_parameter: for filtering in the cepstral domain
204 See \Ref{fbank2melcep}
205@param wf: function for windowing. See \Ref{Windowing mechanisms}
206@param include_c0: whether the zero'th cepstral coefficient is to be included
207@param up: whether the filterbank analysis should use
208 power rather than energy.
209
210@see fbank
211@see fbank2melcep
212*/
213
214void melcep(EST_Wave &sig,
215 EST_Track &mfcc_track,
216 float factor,
217 int fbank_order,
218 float liftering_parameter,
219 EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
220 const bool include_c0 = false,
221 const bool up = false);
222
223//@}
224
225
226/**@name Pitch/F0 Detection Algorithm functions
227
228These functions are used to produce a track of fundamental frequency
229(F0) against time of a waveform.
230*/
231
232//@{
233
234
235/** Top level pitch (F0) detection algorithm. Returns a track
236containing evenly spaced frames of speech, each containing an F0 value
237for that point.
238
239At present, only the \Ref{srpd} pitch tracker is implemented, so
240this is always called regardless of what <parameter>method</parameter>
241is set to.
242
243@param sig: input waveform
244@param fz: output f0 contour
245@param op: parameters for pitch tracker
246@param method: pda method to be used.
247*/
248
249
250void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method="");
251
252
253/** Top level intonation contour detection algorithm. Returns a track
254containing evenly spaced frames of speech, each containing an F0 for that point. {\tt icda} differs from \Ref{pda} in that the contour is
255smoothed, and unvoiced portions have interpolated F0
256values.
257
258@param sig: input waveform
259@param fz: output f0 contour
260@param speech: Interpolation is controlled by the <tt>speech</tt> track. When
261a point has a positive value in the speech track, it is a candidate
262for interpolation.
263@param op: parameters for pitch tracker. NOTE(review): this is an EST_Option, whereas pda takes an EST_Features -- confirm the difference is intended.
264@param method: pda method to be used.
265*/
266
267void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech,
268 EST_Option &op, EST_String method = "");
269
270/** Create a set of sensible defaults for use in pda and icda.
271@param al: features structure for the defaults (presumably filled in by this call -- TODO confirm)
272*/
273void default_pda_options(EST_Features &al);
274
275
276/** Super resolution pitch tracker.
277
278srpd is a pitch detection algorithm that produces a fundamental
279frequency contour from a speech waveform. At present only the super
280resolution pitch determination algorithm is implemented. See (Medan,
281Yair, and Chazan, 1991) and (Bagshaw et al., 1993) for a detailed
282description of the algorithm. </para><para>
283
284Frames of data are read in from <parameter>sig</parameter> in
285chronological order such that each frame is shifted in time from its
286predecessor by <parameter>pda_frame_shift</parameter>. Each frame is
287analysed in turn.
288
289</para><para>
290
291The maximum and minimum signal amplitudes are initially found over the
292duration of two segments, each of length N_min samples. If the sum of
293their absolute values is below two times
294<parameter>noise_floor</parameter>, the frame is classified as
295representing silence and no coefficients are calculated. Otherwise, a
296cross correlation coefficient is calculated for all n from a period in
297samples corresponding to <parameter>min_pitch</parameter>
298to a period in samples corresponding to
299<parameter>max_pitch</parameter>, in steps
300of <parameter>decimation_factor</parameter>. In calculating the
301coefficient only one in <parameter>decimation_factor</parameter>
302samples of the two segments are used. Such down-sampling permits rapid
303estimates of the coefficients to be calculated over the range
304N_min <= n <= N_max. This results in a cross-correlation track for the
305frame being analysed. </para><para>
306
307Local maxima of the track with a coefficient value above a specified
308threshold form candidates for the fundamental period. The threshold is
309adaptive and dependent upon the values <parameter>v2uv_coef_thresh</parameter>,
310<parameter>min_v2uv_coef_thresh</parameter>, and
311<parameter>v2uv_coef_thresh_ratio</parameter>. If the previously
312analysed frame was classified as unvoiced or silent (which is the
313initial state) then the threshold is set to
314<parameter>v2uv_coef_thresh</parameter>. Otherwise, the previous
315frame was classified as being voiced, and the threshold is set equal
316to [\-r] <parameter>v2uv_coef_thresh_ratio</parameter>
317times the cross-correlation coefficient
318value at the point of the previous fundamental period in the former
319coefficients track. This product is not permitted to drop below
320<parameter>v2uv_coef_thresh</parameter>.
321NOTE(review): min_v2uv_coef_thresh is not otherwise used in this description; the floor above may be meant to be min_v2uv_coef_thresh -- confirm.
322</para><para>
323
324If no candidates for the fundamental period are found, the frame is classified
325as being unvoiced. Otherwise, the candidates are further processed to identify
326the most likely true pitch period. During this additional processing, a
327threshold given by <parameter>anti_doubling_thresh</parameter> is used.
328
329</para><para>
330
331If the <parameter>peak_tracking</parameter> flag is set to true,
332biasing is applied to the cross-correlation track as described in
333(Bagshaw et al., 1993). </para><para> </para><para>
334
335
336@param sig: input waveform
337@param options: options regarding pitch tracking parameters
338@param options.min_pitch: minimum permitted F0 value
339@param options.max_pitch: maximum permitted F0 value
340@param options.pda_frame_shift: analysis frame shift
341@param options.pda_frame_length: analysis frame length
342@param options.lpf_cutoff: cut off frequency for low pass filtering
343@param options.lpf_order: order of low pass filtering (must be odd)
344@param options.decimation (called decimation_factor in the description above -- TODO confirm the actual option name)
345@param options.noise_floor
346@param options.min_v2uv_coef_thresh
347@param options.v2uv_coef_thresh_ratio
348@param options.v2uv_coef_thresh
349@param options.anti_doubling_thresh
350@param options.peak_tracking
351@param fz: output F0 contour
352*/
353void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &options);
354
355/** Smooth selected parts of an F0 contour. Interpolation is
356controlled by the <tt>speech</tt> track: a point with a positive
357value in that track is a candidate for interpolation.
358@param speech: track selecting the points that may be interpolated. NOTE(review): presumably c is the input contour and sm receives the smoothed result -- TODO confirm. */
359void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Features &options,
360 EST_Track &sm);
361
362/** Smooth all the points in an F0 contour (presumably c, smoothed in place using the settings in op -- TODO confirm). */
363void smooth_portion(EST_Track &c, EST_Option &op);
364
365//@}
366
367
368/**@name Delta and Acceleration coefficients
369
370Produce delta and acceleration coefficients from a set of coefficients
371or the waveform.
372*/
373
374//@{
375
376/** Produce a set of delta coefficients for a track
377
378The delta function is used to produce a set of coefficients which
379estimate the rate of change of a set of parameters. The output track
380<parameter>d</parameter> must be set up beforehand, i.e. it must have
381the same number of frames and channels as <parameter>tr</parameter>.
382
383@param tr: input track of base coefficients
384@param d: output track of delta coefficients.
385@param regression_length: number of previous frames on which the delta
386 estimation is calculated.
387*/
388
389void delta(EST_Track &tr, EST_Track &d, int regression_length = 3);
390
391/** Produce multiple sets of delta coefficients from a waveform.
392
393 Calculate specified types of delta coefficients. This function is
394 used when the base types of coefficients haven't been calculated.
395 This function calls sig2coef to calculate the base types from which
396 the deltas are calculated, and hence the requirements governing the
397 setup of <parameter>fv</parameter> for sig2coef also hold here.
398
399@param sig: input waveform
400@param fv: output coefficients. These have been pre-allocated and the
401 number of channels in fv indicates the order of the analysis.
402@param op: Features structure containing options for analysis order,
403 frame shift etc.
404@param slist: list of types of delta coefficients required.
405*/
406
407void sigpr_delta(EST_Wave &sig, EST_Track &fv, EST_Features &op,
408 const EST_StrList &slist);
409
410/** Produce multiple sets of acceleration coefficients from a waveform
411
412 Calculate specified types of acceleration coefficients. This function
413 is used when the base types of coefficient haven't been calculated.
414 This function calls sig2coef to calculate the base types from which
415 the acceleration coefficients are derived, and hence the requirements governing the
416 setup of <parameter>fv</parameter> for sig2coef also hold here.
417
418@param sig: input waveform
419@param fv: output coefficients. These have been pre-allocated and the
420 number of channels in fv indicates the order of the analysis.
421@param op: Features structure containing options for analysis order,
422 frame shift etc.
423@param slist: list of types of acceleration coefficients required.
424
425
426The delta function is used to produce a set of coefficients which
427estimate the rate of change of a set of parameters.
428*/
429
430void sigpr_acc(EST_Wave &sig, EST_Track &fv, EST_Features &op,
431 const EST_StrList &slist);
432
433//@}
434
435/* Convert a track containing coefficients of one type to a track
436containing coefficients of another.
437
438@param in_track input set of coefficients
439@param out_track output set of coefficients
440@param out_type name of desired output coefficients.
441@param in_type optional: often it is possible to determine the type of
442the input coefficients from the channel names. If this is not possible or
443these names should be ignored, the {\tt in_type} parameter can be used.
444
445*/
446
447void convert_track(EST_Track &in_track, EST_Track &out_track,
448 const EST_String &out_type,
449 const EST_String &in_type = "");
450
451
452
453#endif /* __EST_SIGPR_UTT_H__ */
454
static Func * creator(const char *name, bool report_error=false)
Return the creation function for the given window type.
Definition: EST_Window.cc:216