Edinburgh Speech Tools 2.4-release
kkcompile.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996-1998 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : December 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* A Koskenniemi/Kay/Kaplan rule compiler to WFST using the techniques */
38/* Ritchie et al.'s "Computational Morphology" (but followed through to */
39/* make real WFSTs). */
40/* */
41/*=======================================================================*/
42#include <iostream>
43#include "EST_WFST.h"
44#include "EST_cutils.h"
45
46ostream &operator << (ostream &s, const EST_WFST &w)
47{
48 (void)w;
49 return s << "<<EST_WFST>>";
50}
51
52Declare_TList(EST_WFST)
53
54#if defined(INSTANTIATE_TEMPLATES)
55#include "../base_class/EST_TList.cc"
56
57Instantiate_TList(EST_WFST)
58
59#endif
60
61static LISP expand_fp(const EST_String p,LISP fp);
62static LISP find_feasible_pairs(LISP rules);
63static LISP all_but(LISP rulepair,LISP fp);
64static LISP expand_sets(LISP sets,LISP fp);
65static LISP inline_sets(LISP l, LISP sets);
66static void full_kkcompile(LISP inalpha,LISP outalpha,
67 LISP fp, LISP rules, LISP sets,
68 EST_WFST &all_wfst);
69
70void kkcompile(LISP ruleset, EST_WFST &all_wfst)
71{
72 // Build a transducer from given kkrule (Kay/Kaplan/Koskenniemi)
73 // Rules are of the form LeftContext Map RightContext
74
75 // The WFST is recognizing all string except the rulepair unless
76 // its in the proper context.
77 LISP fp; // feasible pairs, those pairs with rules (rather than IxO)
78 LISP inalpha = siod_nth(1,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
79 LISP outalpha = siod_nth(2,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
80 LISP sets = cdr(siod_assoc_str("Sets",ruleset));
81 LISP rules = cdr(siod_assoc_str("Rules",ruleset));
82
83 fp = find_feasible_pairs(rules);
84 sets = expand_sets(sets,fp);
85
86 full_kkcompile(inalpha,outalpha,fp,rules,sets,all_wfst);
87}
88
89static void full_kkcompile(LISP inalpha,LISP outalpha,
90 LISP fp, LISP rules, LISP sets,
91 EST_WFST &all_wfst)
92{
93 wfst_list rulelist;
94 LISP r;
95
96 for (r=rules; r != NIL; r=cdr(r))
97 {
98 EST_WFST r_wfst,base_wfst,det_wfst;
99 rulelist.append(r_wfst);
100 EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
101 cout << "Rule: " << siod_llength(rules)-siod_llength(r) << endl;
102 pprint(car(r));
103 base_wfst.kkrule_compile(inalpha,outalpha,fp,car(r),sets);
104 cout << " base " << base_wfst.summary() << endl;
105 det_wfst.determinize(base_wfst);
106 cout << " determinized " << det_wfst.summary() << endl;
107 rr_wfst.minimize(det_wfst);
108 cout << " minimized " << rr_wfst.summary() << endl;
109 }
110
111 cout << "WFST: intersecting " << rulelist.length() << " rules" << endl;
112 EST_Litem *p,*nnp;
113 int i;
114 for (i=0,p=rulelist.head(); p->next() != 0; p=nnp)
115 {
116 EST_WFST r_wfst,base_wfst,det_wfst;
117 EST_WFST mmm;
118 rulelist.append(r_wfst);
119 EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
120 cout << "intersecting " << i << " and " << i+1 << " " <<
121 rulelist.length()-2 << " left" << endl;
122 cout << " " << rulelist(p).summary() << " and " << endl;
123 cout << " " << rulelist(p->next()).summary() << " becomes " << endl;
124 mmm.intersection(rulelist(p),rulelist(p->next()));
125 cout << " " << mmm.summary() << " minimizes to " << endl;
126 rr_wfst.minimize(mmm);
127 cout << " " << rr_wfst.summary() << endl;
128 nnp=p->next()->next();
129 i+=2;
130 rulelist.remove(p->next());
131 rulelist.remove(p);
132 }
133
134 all_wfst = rulelist.first();
135
136}
137
138static LISP expand_sets(LISP sets,LISP fp)
139{
140 // Expand sets into regexes that represent them. Single
141 // char values are converted to disjunctions of feasible pairs
142 // that have the same surface character
143 LISP s,es=NIL,e,ne;
144
145 for (s=sets; s != NIL; s=cdr(s))
146 {
147 for (ne=NIL,e=cdr(car(s)); e != NIL; e=cdr(e))
148 {
149 EST_String ss = get_c_string(car(e));
150 if (ss.contains("/"))
151 ne = cons(car(e),ne);
152 else
153 ne = append(expand_fp(ss,fp),ne);
154 }
155 if (ne == NIL)
156 {
157 cerr << "WFST: kkcompile: set " << get_c_string(car(car(s))) <<
158 " has no feasible pairs" << endl;
159 }
160
161 else if (siod_llength(ne) == 1)
162 es = cons(cons(car(car(s)),ne),es);
163 else
164 es = cons(cons(car(car(s)),
165 cons(cons(rintern("or"),reverse(ne)),
166 NIL)),es);
167 }
168
169 return reverse(es);
170}
171
172static LISP expand_fp(const EST_String p,LISP fp)
173{
174 // Find all fp's that have this p as their surface char
175 LISP m=NIL,f;
176 EST_Regex rg(EST_String("^")+p+"/.*");
177
178 for (f=fp; f != NIL; f=cdr(f))
179 {
180 EST_String ss = get_c_string(car(f));
181 if ((p == ss) || (ss.matches(rg)))
182 m = cons(car(f),m);
183 }
184 return m;
185}
186
187static LISP find_feasible_pairs(LISP rules)
188{
189 // Find the set of pairs that have rules associated with them
190 // This effectively defines the transducer alphabet.
191 LISP fp = NIL;
192 LISP r;
193
194 for (r=rules; r != NIL; r=cdr(r))
195 {
196 if (siod_member_str(get_c_string(siod_nth(0,car(r))),fp) == NIL)
197 fp = cons(siod_nth(0,car(r)),fp);
198 }
199 return fp;
200}
201
202static int surface_coercion(LISP rt)
203{
204 return (streq("<=",get_c_string(rt)));
205}
206
207static int context_restriction(LISP rt)
208{
209 return (streq("=>",get_c_string(rt)));
210}
211
212static int composite(LISP rt)
213{
214 return (streq("<=>",get_c_string(rt)));
215}
216
217static LISP inline_sets(LISP l, LISP sets)
218{
219 // Replace any set name with the regex equivalent
220 LISP s;
221 if (l == NIL)
222 return NIL;
223 else if (consp(l))
224 return cons(inline_sets(car(l),sets),inline_sets(cdr(l),sets));
225 else if ((s=siod_assoc_str(get_c_string(l),sets)) != NIL)
226 return car(cdr(s));
227 else
228 return l;
229}
230
231void EST_WFST::kkrule_compile(LISP inalpha, LISP outalpha, LISP fp,
232 LISP rule,LISP sets)
233{
234 // Build a WFST to transduce this particular rule
235 // Accepts any other combination of feasible pairs too
236 LISP leftcontext = inline_sets(siod_nth(2,rule),sets);
237 LISP rulepair = siod_nth(0,rule);
238 LISP ruletype = siod_nth(1,rule);
239 LISP rightcontext = inline_sets(siod_nth(4,rule),sets);
240 LISP p;
241 int i;
242 int end_LC,end_RP,end_NOTRP,end_RC,err_state;
243
244 // Initialize alphabets
245 init(inalpha,outalpha); // should be passed as discretes
246
247 p_start_state = add_state(wfst_final); // empty WFST
248 // Add transitions for all pairs except rulepair
249 for (p=fp; p != NIL; p=cdr(p))
250 if ((!equal(rulepair,car(p))) ||
251 (surface_coercion(ruletype)))
252 build_wfst(p_start_state,p_start_state,car(p));
253
254 // build for LC
255 if (leftcontext)
256 {
257 end_LC = add_state(wfst_final);
258 build_wfst(p_start_state,end_LC,leftcontext);
259 // for all states in LC mark final & add epsilon to p_start_state
260 for (i=end_LC; i < p_num_states; i++)
261 {
262 build_wfst(i,p_start_state,epsilon_label());
263 p_states[i]->set_type(wfst_final);
264 }
265 }
266 else // no LC
267 end_LC = p_start_state;
268
269 // build for RP and RC from end_LC
270 if (composite(ruletype) || context_restriction(ruletype))
271 {
272 if (rightcontext)
273 {
274 end_RP = add_state(wfst_nonfinal);
275 build_wfst(end_LC,end_RP,rulepair);
276 // build for RC from end map to p_start_state
277 build_wfst(end_RP,p_start_state,rightcontext);
278 err_state = add_state(wfst_error);
279 for (i=end_RP; i < err_state; i++)
280 { // for everything other that the correct path go to err_state
281 // without this explicit error state the epsilon to start
282 // allows almost everything
283 if (transition(i,get_c_string(epsilon_label()))
284 != WFST_ERROR_STATE)
285 break; // not a state require extra transitions
286 for (p=fp; p != NIL; p=cdr(p))
287 if (transition(i,get_c_string(car(p))) == WFST_ERROR_STATE)
288 build_wfst(i,err_state,car(p));
289 build_wfst(i,p_start_state,epsilon_label());
290 p_states[i]->set_type(wfst_licence);
291 }
292 }
293 else // no RC, so end back at start
294 build_wfst(end_LC,p_start_state,rulepair);
295 }
296
297 // Build for notRP and RC from end_LC
298 if (composite(ruletype) || surface_coercion(ruletype))
299 {
300 LISP abrp = all_but(rulepair,fp);
301 if (abrp)
302 {
303 if (rightcontext)
304 {
305 end_RC = add_state(wfst_error);
306 end_NOTRP = add_state(wfst_nonfinal);
307 build_wfst(end_LC,end_NOTRP,abrp);
308 // build for RC from end RP to error state
309 build_wfst(end_NOTRP,end_RC,rightcontext);
310 // for all states in RC except final one mark final & add
311 // epsilon to p_start_state
312 for (i=end_NOTRP; i < p_num_states; i++)
313 {
314 build_wfst(i,p_start_state,epsilon_label());
315 p_states[i]->set_type(wfst_final);
316 }
317 }
318 else // no RC,
319 {
320 end_RC = add_state(wfst_error);
321 build_wfst(end_LC,end_RC,abrp);
322 }
323 }
324 }
325}
326
327static LISP all_but(LISP rulepair,LISP fp)
328{
329 // Returns pairs that have the same surface symbol as rulepair
330 // but different lexical symbol
331 LISP r,notrp=NIL;
332 EST_String s,l,p,sr,lr,rr;
333
334 p = get_c_string(rulepair);
335 if (p.contains("/"))
336 {
337 s = p.before("/");
338 l = p.after("/");
339 }
340 else
341 {
342 s = p;
343 l = p;
344 }
345
346 for (r=fp; r != NIL; r = cdr(r))
347 {
348 rr = get_c_string(car(r));
349 if (rr.contains("/"))
350 {
351 sr = rr.before("/");
352 lr = rr.after("/");
353 }
354 else
355 {
356 sr = rr;
357 lr = rr;
358 }
359 if ((l != lr) && (s == sr))
360 notrp = cons(car(r),notrp);
361 }
362
363 if (siod_llength(notrp) > 1)
364 notrp = cons(strintern("or"),notrp);
365 return notrp;
366}
367
368void intersect(wfst_list &wl, EST_WFST &all)
369{
370 // Intersect the wfst's in wl into all
371
372 all.intersection(wl);
373
374}
375
EST_String before(int pos, int len=0) const
Part before position.
Definition: EST_String.h:286
int contains(const char *s, int pos=-1) const
Does it contain this substring?
Definition: EST_String.h:375
EST_String after(int pos, int len=1) const
Part after pos+len.
Definition: EST_String.h:318
int matches(const char *e, int pos=0) const
Exactly match this string?
Definition: EST_String.cc:652
const T & last() const
return const reference to last item in list
Definition: EST_TList.h:149
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:191
EST_Litem * remove(EST_Litem *ptr)
Definition: EST_TList.h:175
const T & first() const
return const reference to first item in list
Definition: EST_TList.h:146
int add_state(enum wfst_state_type state_type)
Add a new state, returns new name.
Definition: EST_WFST.cc:652
void init(int init_num_states=10)
Clear with (estimation of number of states required)
Definition: EST_WFST.cc:145
void build_wfst(int start, int end, LISP regex)
Basic regex constructor.
Definition: wfst_regex.cc:140
LISP epsilon_label() const
LISP for on epsilon symbols.
Definition: EST_WFST.h:216
void minimize(const EST_WFST &a)
Build minimized form of a.
Definition: wfst_ops.cc:484
void intersection(EST_TList< EST_WFST > &wl)
Definition: wfst_ops.cc:356
void determinize(const EST_WFST &a)
Build determinized form of a.
Definition: wfst_ops.cc:164
int transition(int state, int in, int out) const
Find (first) new state given in and out symbols.
Definition: EST_WFST.cc:260