Edinburgh Speech Tools 2.4-release
pitchmark_main.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Paul Taylor */
34/* Date : 1997, 1998, 1999 */
35/*-----------------------------------------------------------------------*/
36/* Pitchmarking program */
37/*************************************************************************/
38
39#include <cstdlib>
40#include <iostream>
41#include <fstream>
42#include "EST_unix.h"
43#include "EST_cmd_line_options.h"
44#include "EST_cmd_line.h"
45#include "EST_speech_class.h"
46#include "sigpr/EST_pitchmark.h"
47
48
49void set_options(EST_Features &op, EST_Option &al);
50
51static EST_write_status save_msec(EST_Track &pm, EST_String filename);
52static EST_write_status save_ogi_bin(EST_Track &pm, EST_String filename,
53 int sr);
54void pm_to_label(EST_Track &pm, EST_Relation &lab);
55
56
57/*void pm_to_label(EST_Track &pm, EST_Relation &lab);
58void find_pm(EST_Wave &sig, EST_Track &pm);
59
60void pm_min_check(EST_Track &pm, float min);
61void pm_sanity_check(EST_Track &pm, float new_end,
62 float max, float min, float def);
63
64void pm_fill(EST_Track &pm, float new_end, float max,
65 float min, float def);
66
67void pm_to_f0(EST_Track &pm, EST_Track &f0);
68*/
69
70
71/** @name <command> pitchmark </command> <emphasis> Find instants of glottal closure in Laryngograph file</emphasis>
72
73 * @id pitchmark-manual
74 * @toc */
75
76//@{
77
78
79/**@name Synopsis
80 */
81//@{
82
83//@synopsis
84
85/**
86<command>pitchmark</command> locates instants of glottal closure in a
87laryngograph waveform, and performs post-processing to produce even
88pitchmarks. EST does not currently provide any means of pitchmarking a
89speech waveform.
90
91Pitchmarking is performed by calling the
92<function>pitchmark()</function> function, which carries out the
93following operations:
94
95<orderedlist> <listitem><para>Double low pass filter the signal. This
96removes noise in the signal. The parameter
97<parameter>lx_lf</parameter> specifies the low pass cutoff frequency,
98and <parameter>lx_lo</parameter> specifies the order. Double filtering
99(feeding the waveform through the filter, then reversing the waveform
100and feeding it through again) is performed to reduce any phase shift
101between the input and output of the filtering operation.
102</para></listitem>
103
104<listitem><para>Double high pass filter the signal. This removes the
105very low frequency swell that is often observed in laryngograph
106waveforms. The parameter <parameter>lx_hf</parameter> specifies the high pass cutoff frequency,
107and <parameter>lx_ho</parameter> specifies the order.
108Double filtering is performed to reduce any phase shift
109between the input and output of the filtering operation.
110</para></listitem>
111
112<listitem><para>Calculate the delta signal. The filtered waveform is
113differentiated using the <function>delta()</function>
114function.</para></listitem>
115
116<listitem><para>Low pass filter the delta signal. Some noise may still
117be present in the signal, and this is removed by further low pass
118filtering. Experimentation has shown that simple mean smoothing is
119often more effective than FIR smoothing at this point. The parameter
120<parameter>mo</parameter> is used to specify the size of the mean
121smoothing window. If FIR smoothing is chosen, the parameter
122<parameter>df_lf</parameter> specifies the low pass cutoff frequency,
123and <parameter>df_lo</parameter> specifies the order. Double filtering
124is again used to avoid phase distortion.
125
126</para></listitem>
127
128<listitem><para>Pick zero crossings. Now simple zero-crossing is used
129to find the pitchmarks themselves. </para></listitem>
130
131</orderedlist>
132
133<command>pitchmark</command> also performs post-processing on the pitchmarks.
134This can be used to eliminate pitchmarks which occur too closely together,
135or to provide estimated evenly spaced pitchmarks during unvoiced regions.
136The -fill option switches <action>this facility on</action>,
137and -min, -max, -def,
138-end and -wave_end control its operation.
139
140*/
141
142//@}
143
144/**@name OPTIONS
145 */
146//@{
147
148//@options
149
150//@}
151
152
153int main (int argc, char *argv[])
154{
155 EST_Track pm;
156 EST_Wave lx;
157 EST_Option al;
158 EST_Features op;
159 EST_String out_file("-");
160 EST_StrList files;
161
162 parse_command_line
163 (argc, argv,
164 EST_String("[input file] -o [output file] [options]")+
165 "Summary: pitchmark laryngograph (lx) files\n"
166 "use \"-\" to make input and output files stdin/out\n"
167 "-h Options help\n\n"+
168 options_wave_input()+
169 options_track_output()+
170 "-lx_lf <int> lx low frequency cutoff\n\n"
171 "-lx_lo <int> lx low order\n\n"
172 "-lx_hf <int> lx high frequency cutoff\n\n"
173 "-lx_ho <int> lx high order\n\n"
174 "-df_lf <int> df low frequeny cutoff\n\n"
175 "-df_lo <int> df low order\n\n"
176 "-med_o <int> median smoothing order\n\n"
177 "-mean_o <int> mean smoothing order\n\n"
178 "-inv Invert polarity of lx signal. Often the lx signal \n"
179 " is upside down. This option inverts the signal prior to \n"
180 " processing.\n\n"
181 "-fill Insert and remove pitchmarks according to min, max\n"
182 " and def period values. Often it is desirable to place limits\n"
183 " on the values of the pitchmarks. This option enforces a \n"
184 " minimum and maximum pitch period (specified by -man and -max).\n"
185 " If the maximum pitch setting is low enough, this will \n"
186 " esnure that unvoiced regions have evenly spaced pitchmarks \n\n"
187 "-min <float> Minimum allowed pitch period, in seconds\n\n"
188 "-max <float> Maximum allowed pitch period, in seconds\n\n"
189 "-def <float> Default pitch period in seconds, used for a guide\n"
190 " as to what length pitch periods should be in unvoiced \n"
191 " sections \n\n"
192 "-pm <ifile> Input is raw pitchmark file. This option is \n"
193 " used to perform filling operations on an already existing \n"
194 " set of pitchmarks \n\n"
195 "-f0 <ofile> Calculate F0 from pitchmarks and save to file\n\n"
196 "-end <float> Specify the end time of the last pitchmark, for use \n"
197 " with the -fill option\n\n"
198 "-wave_end Use the end of a waveform to specify when the \n"
199 " last pitchmark position should be. The waveform file is only \n"
200 " read to determine its end, no processing is performed\n\n"
201 "-inter Output intermediate waveforms. This will output the \n"
202 " signal at various stages of processing. Examination of these \n"
203 " waveforms is extremely useful in setting the parameters for \n"
204 " similar waveforms\n\n"
205 "-style <string> \"track\" or \"lab\"\n\n", files, al);
206
207 set_options(op, al);
208
209 out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";
210
211 if (!al.present("-pm") || (al.present("-pm") && al.present("-wave_end")))
212 if (read_wave(lx, files.first(), al) != read_ok)
213 exit(-1);
214
215 if (al.present("-pm"))
216 pm.load(al.val("-pm"));
217 else
218 {
219 if (al.present("-inv"))
220 invert(lx);
221 pm = pitchmark(lx, op);
222 }
223
224 // this allows the end to be aligned with the end of a waveform
225 op.set("pm_end", lx.end());
226
227 if (al.present("-f0"))
228 {
229 EST_Track f0;
230 pm_to_f0(pm, f0);
231 f0.save(al.val("-f0"));
232 }
233
234 // various options for filling he gaps between distant pitchmarks
235 // and removing pitchmarks that are too close together
236
237 if (al.present("-fill"))
238 {
239 pm_fill(pm, op.F("pm_end"), op.F("max_period"),
240 op.F("min_period"), op.F("def_period"));
241 pm_fill(pm, op.F("pm_end"), op.F("max_period"),
242 op.F("min_period"), op.F("def_period"));
243 }
244 else if (al.present("-min"))
245 pm_min_check(pm, al.fval("-min"));
246
247 if (al.present("-style"))
248 {
249 // label format
250 if (al.val("-style").contains("lab"))
251 {
252 EST_Relation lab;
253 pm_to_label(pm, lab);
254 if (lab.save(out_file + ".pm_lab") != write_ok)
255 exit(-1);
256 }
257 // save file in "traditional" milli-second format
258 if (al.val("-style").contains("msec"))
259 save_msec(pm, out_file + ".pm");
260
261 // ogi binary integer sample point format
262 if (al.val("-style").contains("ogi_bin"))
263 save_ogi_bin(pm, out_file + ".pmv", lx.sample_rate());
264 }
265 else if (pm.save(out_file, al.val("-otype", 0)) != write_ok)
266 {
267 cerr << "pitchmark: failed to write output to \""
268 << out_file << "\"" << endl;
269 exit(-1);
270 }
271 return 0;
272}
273
274static EST_write_status save_msec(EST_Track &pm, EST_String filename)
275{
276 ostream *outf;
277
278 if (filename == "-")
279 outf = &cout;
280 else
281 outf = new ofstream(filename);
282
283 if (!(*outf))
284 return write_fail;
285
286 outf->precision(5);
287 outf->setf(ios::fixed, ios::floatfield);
288 outf->width(8);
289
290 for (int i = 0; i < pm.num_frames(); ++i)
291 *outf << pm.t(i) * 1000.0 << endl;
292
293 return write_ok;
294}
295
296static EST_write_status save_ogi_bin(EST_Track &pm, EST_String filename, int sr)
297{
298 int *d;
299 FILE *fp;
300 int i;
301
302 d = new int[pm.num_frames()];
303
304 for (i = 0; i < pm.num_frames(); ++i)
305 d[i] = int(pm.t(i) * (float) sr);
306
307 if ((fp = fopen(filename, "wb")) == NULL)
308 return misc_write_error;
309
310 if (fwrite(d, pm.num_frames(), sizeof(int), fp) != 1)
311 {
312 fclose(fp);
313 return misc_write_error;
314 }
315 delete d;
316
317 return write_ok;
318}
319
320void override_lib_ops(EST_Option &op, EST_Option &al)
321{
322 op.override_ival("lx_low_frequency", 400);
323 op.override_ival("lx_low_order", 19);
324 op.override_ival("lx_high_frequency", 40);
325 op.override_ival("lx_high_order", 19);
326 op.override_ival("df_low_frequency", 1000);
327 op.override_ival("df_low_order", 19);
328 op.override_fval("min_period", 0.003);
329 op.override_fval("max_period", 0.02);
330 op.override_fval("def_period", 0.01);
331 op.override_fval("pm_end", -1.0);
332
333 if (al.present("-lx_lf"))
334 op.override_ival("lx_low_frequency", al.ival("-lx_lf", 0));
335 if (al.present("-lx_lo"))
336 op.override_ival("lx_low_order", al.ival("-lx_lo", 0));
337 if (al.present("-lx_hf"))
338 op.override_ival("lx_high_frequency", al.ival("-lx_hf", 0));
339 if (al.present("-lx_ho"))
340 op.override_ival("lx_high_order", al.ival("-lx_ho", 0));
341 if (al.present("-med_o"))
342 op.override_ival("median_order", al.ival("-med_o", 0));
343 if (al.present("-mean_o"))
344 op.override_ival("mean_order", al.ival("-mean_o", 0));
345 if (al.present("-df_lf"))
346 op.override_ival("df_low_frequency", al.ival("-df_lf", 0));
347 if (al.present("-df_lo"))
348 op.override_ival("df_low_order", al.ival("-df_lo", 0));
349 if (al.present("-min"))
350 op.override_fval("min_period", al.fval("-min", 0));
351 if (al.present("-max"))
352 op.override_fval("max_period", al.fval("-max", 0));
353 if (al.present("-def"))
354 op.override_fval("def_period", al.fval("-def", 0));
355 if (al.present("-end"))
356 op.override_fval("pm_end", al.fval("-end", 0));
357 if (al.present("-inter"))
358 op.override_ival("pm_debug", 1);
359}
360
361void set_options(EST_Features &op, EST_Option &al)
362{
363 op.set("lx_low_frequency", LX_LOW_FREQUENCY);
364 op.set("lx_low_order", LX_LOW_ORDER);
365 op.set("lx_high_frequency", LX_HIGH_FREQUENCY);
366 op.set("lx_high_order", LX_HIGH_ORDER);
367 op.set("df_low_frequency", DF_LOW_FREQUENCY);
368 op.set("df_low_order", DF_LOW_ORDER);
369 op.set("min_period", MIN_PERIOD);
370 op.set("max_period", MAX_PERIOD);
371 op.set("def_period", DEF_PERIOD);
372 op.set("pm_end", PM_END);
373
374 if (al.present("-lx_lf"))
375 op.set("lx_low_frequency", al.ival("-lx_lf", 0));
376 if (al.present("-lx_lo"))
377 op.set("lx_low_order", al.ival("-lx_lo", 0));
378 if (al.present("-lx_hf"))
379 op.set("lx_high_frequency", al.ival("-lx_hf", 0));
380 if (al.present("-lx_ho"))
381 op.set("lx_high_order", al.ival("-lx_ho", 0));
382 if (al.present("-med_o"))
383 op.set("median_order", al.ival("-med_o", 0));
384 if (al.present("-mean_o"))
385 op.set("mean_order", al.ival("-mean_o", 0));
386 if (al.present("-df_lf"))
387 op.set("df_low_frequency", al.ival("-df_lf", 0));
388 if (al.present("-df_lo"))
389 op.set("df_low_order", al.ival("-df_lo", 0));
390 if (al.present("-min"))
391 op.set("min_period", al.fval("-min", 0));
392 if (al.present("-max"))
393 op.set("max_period", al.fval("-max", 0));
394 if (al.present("-def"))
395 op.set("def_period", al.fval("-def", 0));
396 if (al.present("-end"))
397 op.set("pm_end", al.fval("-end", 0));
398 if (al.present("-inter"))
399 op.set("pm_debug", 1);
400}
401
402/** @name Examples
403</para>
404<formalpara><title>Basic Pitchmarking</title>
405<para>
406<screen>
407$ pitchmark kdt_010.lar -o kdt_010.pm -otype est
408</screen>
409</para>
410</formalpara>
411
412<formalpara><title>Pitchmarking with unvoiced regions
413filled</title> <para> The following fills unvoiced regions with pitch
414periods that are about 0.01 seconds long. It also post-processes the
set of pitchmarks and ensures that none are above 0.02 seconds long and
416none below 0.003. A final unvoiced region extending to the end of the
417wave is specified by using the -wave_end option.
418</para> </formalpara><para>
419<screen>
420$ pitchmark kdt_010.lar -o kdt_010.pm -otype est -fill -min 0.003 \
421 -max 0.02 -def 0.01 -wave_end
422</screen>
423
424*/
425
426//@{
427//@}
void set(const EST_String &name, int ival)
Definition: EST_Features.h:185
const float F(const EST_String &path) const
Definition: EST_Features.h:135
int override_ival(const EST_String rkey, const int rval)
add to end of list or overwrite. If rval is empty, do nothing
Definition: EST_Option.cc:66
float fval(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:98
int override_fval(const EST_String rkey, const float rval)
add to end of list or overwrite. If rval is empty, do nothing
Definition: EST_Option.cc:56
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:76
EST_write_status save(const EST_String &filename, bool evaluate_ff=false) const
int contains(const char *s, int pos=-1) const
Does it contain this substring?
Definition: EST_String.h:375
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
const int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
const T & first() const
return const reference to first item in list
Definition: EST_TList.h:146
EST_read_status load(const EST_String name, float ishift=0.0, float startt=0.0)
Definition: EST_Track.cc:1309
float & t(int i=0)
return time position of frame i
Definition: EST_Track.h:477
EST_write_status save(const EST_String name, const EST_String EST_filetype="")
Definition: EST_Track.cc:1230
int num_frames() const
return number of frames in track
Definition: EST_Track.h:650
int sample_rate() const
return the sampling rate (frequency)
Definition: EST_Wave.h:147
float end()
return the time position of the last sample.
Definition: EST_Wave.h:153