// Edinburgh Speech Tools 2.4-release
// EST_Ngrammar.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Simon King & Alan W Black */
34/* Date : February 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* An EST_Ngram class for building and manipulating bigrams trigrams etc */
38/* */
39/*=======================================================================*/
40#include <iostream>
41#include <fstream>
42#include <cstring>
43#include <cmath>
44#include <climits>
45#include <cfloat>
46
47using namespace std;
48
49#include "EST_Ngrammar.h"
50#include "EST_Pathname.h"
51#include "EST_Token.h"
52#include "EST_io_aux.h"
53
54
55const EST_DiscreteProbDistribution PSTnullProbDistribution;
56static EST_String NOVOCAB("NOVOCAB");
57
58// **********************************************************************
59
// Copy constructor: duplicate another state's id and probability
// distribution.
EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState &s)
{
    clear();
    init(s.id(),s.pdf_const());
}
65
// Copy-from-pointer constructor; `s` must be non-NULL (it is
// dereferenced unconditionally).
EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState *const s)
{
    clear();
    init(s->id(),s->pdf_const()); // copy
}
71
// Destructor: reset id and release the pdf's storage.
EST_NgrammarState::~EST_NgrammarState()
{
    clear();
}
76
// Reset to the "uninitialised" state: invalid id, empty pdf.
void EST_NgrammarState::clear()
{
    p_id = -1;
    p_pdf.clear();
}
82
// Default initialisation: invalid id, default-initialised pdf.
void EST_NgrammarState::init()
{
    p_id=-1;
    p_pdf.init();
}
88
// Initialise with a state id and the discrete vocabulary the pdf is
// defined over.
void EST_NgrammarState::init(int id,EST_Discrete *d)
{
    p_id = id;
    p_pdf.init(d);
}
94
95
// Initialise with a state id and an existing distribution (copied).
void EST_NgrammarState::init(int id, const EST_DiscreteProbDistribution &pdf)
{
    p_id = id;
    p_pdf = pdf; // copy it
}
101
102
103ostream& operator<<(ostream& s, const EST_NgrammarState &a)
104{
105 s << "(" << a.id() << ": " << a.pdf_const() << " )";
106 return s;
107}
108
109// **********************************************************************
110
// Destructor: release this node's pdf and its child-trie storage.
// NOTE(review): child states themselves appear to be freed via zap()/
// remove_child(), not here — confirm ownership before relying on this.
EST_BackoffNgrammarState::~EST_BackoffNgrammarState()
{
    p_pdf.clear();
    children.clear();
}
116
117
// Reset backoff weight and empty the pdf (children are untouched).
void EST_BackoffNgrammarState::clear()
{
    backoff_weight=0;
    p_pdf.clear();
}
123
// Default initialisation: zero weight, default pdf.
void EST_BackoffNgrammarState::init()
{
    backoff_weight=0;
    p_pdf.init();
}
129
// Initialise a node at depth `level` of the backoff tree with a pdf
// over vocabulary `d`.
void EST_BackoffNgrammarState::init(const EST_Discrete *d,int level)
{
    backoff_weight=0;
    p_level = level;
    p_pdf.init(d);
}
136
// Initialise a node at depth `level` with a copy of an existing pdf.
void EST_BackoffNgrammarState::init(const EST_DiscreteProbDistribution &pdf, int level)
{
    backoff_weight=0;
    p_level = level;
    p_pdf = pdf; // copy it
}
143
144bool EST_BackoffNgrammarState::accumulate(const EST_StrVector &words,
145 const double count)
146{
147
148// int i;
149// cerr << "accumulate ";
150// for(i=0;i<words.n();i++)
151// {
152// if(words.n()-1-p_level == i)
153// cerr <<"*";
154// cerr << words(i);
155// if(words.n()-1-p_level == i)
156// cerr <<"*";
157// }
158// cerr << endl;
159
160 // update pdf at this node
161 p_pdf.cumulate(words(words.n()-1-p_level),count);
162
163 // then go down
164 if (words.n()-1-p_level > 0) // not at bottom of tree
165 {
166
168
169 // have we got the child
170 s = get_child(words(words.n()-1-p_level));
171 if (s==NULL)
172 // have to extend the tree
173 s = add_child(p_pdf.get_discrete(),words);
174 return s->accumulate(words,count);
175
176 }
177 else
178 {
179 return true;
180 }
181}
182
183bool EST_BackoffNgrammarState::accumulate(const EST_IVector &words,
184 const double count)
185{
186
187// int i;
188// cerr << "accumulate level " << p_level << " : ";
189// for(i=0;i<words.n();i++)
190// {
191// if(words.n()-1-p_level == i)
192// cerr <<"*";
193// cerr << p_pdf.item_name(words(i));
194// if(words.n()-1-p_level == i)
195// cerr <<"*";
196// cerr << " ";
197// }
198// cerr << endl;
199
200 // update pdf at this node
201 p_pdf.cumulate(words(words.n()-1-p_level),count);
202
203 //cerr << *this << endl;
204 //cerr << endl;
205
206 // then go down
207 if (words.n()-1-p_level > 0) // not at bottom of tree
208 {
209
211
212 // have we got the child
213 s = get_child(words(words.n()-1-p_level));
214 if (s==NULL)
215 // have to extend the tree
216 s = add_child(p_pdf.get_discrete(),words);
217
218 // get pointer again in case we built more than one level
219 s = get_child(words(words.n()-1-p_level));
220 if (s==NULL)
221 {
222 cerr << "Failed to extend tree - unknown reason !" << endl;
223 return false;
224 }
225 return s->accumulate(words,count);
226 }
227 else
228 {
229 return true;
230 }
231}
232
233
235EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
236 const EST_StrVector &words)
237{
239
240 if (words.n()-1-p_level > 0) // more history still to go
241 {
242 // see if we can go down the tree
243 s = get_child(words(words.n()-1-p_level));
244 if (s != NULL)
245 return s->add_child(d,words);
246 else
247 {
248 // construct tree as we go
250 new_child->init(d,p_level+1);
251 children.add(words(words.n()-1-p_level), (void*)new_child);
252 return new_child->add_child(d,words);
253 }
254 }
255 else
256 {
257 // this is the node we are trying to add !
258 return this;
259 }
260}
261
262
263
265EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
266 const EST_IVector &words)
267{
269
270 if (words.n()-1-p_level > 0) // more history still to go
271 {
272 // see if we can go down the tree
273 s = get_child(words(words.n()-1-p_level));
274 if (s != NULL)
275 return s->add_child(d,words);
276 else
277 {
278 // construct tree as we go
280 new_child->init(d,p_level+1);
281 children.add(p_pdf.get_discrete()->name(words(words.n()-1-p_level)), (void*)new_child);
282 return new_child->add_child(d,words);
283 }
284 }
285 else
286 {
287 // this is the node we are trying to add !
288 return this;
289 }
290}
291
// Recursively destroy `child` (named `name` in the child trie) and all
// its descendants, then delete it.
void EST_BackoffNgrammarState::remove_child(EST_BackoffNgrammarState *child,
					    const EST_String &name)
{

    child->zap();
    // can't remove from StringTrie, but can set pointer to NULL
    children.add(name,NULL);
    delete child;
}
301
302void EST_BackoffNgrammarState::print_freqs(ostream &os,
303 const int order,
304 EST_String followers)
305{
306 // not right - just print out, then recurse through children
307 // change to use 'backoff_traverse'
308
309 EST_Litem *k;
310 double freq;
311 EST_String name;
312 for (k=p_pdf.item_start();
313 !p_pdf.item_end(k);
314 k = p_pdf.item_next(k))
315 {
316 p_pdf.item_freq(k,name,freq);
318 if (p_level==order-1)
319 {
320 if(freq>0)
321 os << name << " " << followers
322 << ": " << freq << endl;
323 }
324 else if (s!=NULL)
325 s->print_freqs(os,order,name+" "+followers);
326
327 }
328}
329
330bool EST_BackoffNgrammarState::ngram_exists(const EST_StrVector &words,
331 const double threshold) const
332{
334 s = get_state(words);
335 if (s != NULL)
336 {
337 // return true for unigrams (state level 0)
338 // even if freq < threshold
339 return (bool)((s->level()==0) ||
340 ( s->frequency(words(0)) > threshold ));
341 }
342 else
343 return false;
344}
345
346const EST_BackoffNgrammarState *const
347EST_BackoffNgrammarState::get_state(const EST_StrVector &words) const
348{
350 if (words.n()-1-p_level > 0) // more history still to go
351 {
352 s = get_child(words(words.n()-1-p_level));
353 if (s != NULL)
354 {
355 //cerr << "traversing from " << *this << endl;
356 //cerr << " to " << *s << endl << endl;
357 return s->get_state(words);
358 }
359 else
360 {
361 //cerr << "got NULL" << endl;
362 return NULL;
363 }
364 }
365 else
366 {
367 //cerr << "got " << *this << endl;
368 return this;
369 }
370}
371
// Recursively delete this state's children, then empty its own trie
// and pdf (the node itself is deleted by the caller, see remove_child).
void EST_BackoffNgrammarState::zap()
{

    // recursively delete this state and all its children
    EST_Litem *k;
    double freq;
    EST_String name;
    for (k=p_pdf.item_start();
	 !p_pdf.item_end(k);
	 k = p_pdf.item_next(k))
    {
	p_pdf.item_freq(k,name,freq);
	EST_BackoffNgrammarState *child = get_child(name);
	if (child!=NULL)
	    remove_child(child,name);
    }

    children.clear();
    p_pdf.clear();

}
393
394
395const double EST_BackoffNgrammarState::get_backoff_weight(const EST_StrVector &words) const
396{
398 if (words.n()-1-p_level >= 0)
399 {
400 s = get_child(words(words.n()-1-p_level));
401 if (s != NULL)
402 return s->get_backoff_weight(words);
403 else
404 {
405 // if there is no node, the weight would have been 1 anyway
406 //cerr << "Couldn't get weight for " << words << endl;
407 return 1; // default
408 }
409 }
410
411 else
412 {
413 // reached node
414
415/*
416 cerr << "got bw for ";
417 for(int i=0;i<words.n();i++)
418 cerr << words(i) << " ";
419 cerr << " at level " << p_level << " = " << backoff_weight << endl;
420*/
421 return backoff_weight;
422 }
423}
424
425
426bool EST_BackoffNgrammarState::set_backoff_weight(const EST_StrVector &words, const double w)
427{
429 if (words.n()-1-p_level >= 0)
430 {
431 s = get_child(words(words.n()-1-p_level));
432 if (s != NULL)
433 return s->set_backoff_weight(words,w);
434 else
435 {
436 // we can't set the weight because the node
437 // doesn't exist - this is an error if the weight
438 // is not 1
439 if (w != 1)
440 {
441 cerr << "Couldn't set weight for " << words
442 << " to " << w << endl;
443 return false;
444 }
445 else
446 return true; // a weight of 1 does not need storing
447 }
448 }
449 else
450 {
451 // reached node
452 backoff_weight = w;
453 return true;
454 }
455}
456
457void EST_BackoffNgrammarState::frequency_of_frequencies(EST_DVector &ff)
458{
459 int max=ff.n();
460 EST_Litem *k;
461 double freq;
462 EST_String name;
463 for (k=p_pdf.item_start();
464 !p_pdf.item_end(k);
465 k = p_pdf.item_next(k))
466 {
467 p_pdf.item_freq(k,name,freq);
468 if(freq < max)
469 ff[(int)(freq+0.5)] += 1;
470 }
471}
472
// Stream a backoff state as "(backoff level:<L> weight:<W> <pdf> )".
ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a)
{
    s << "(backoff level:" << a.p_level
      << " weight:" << a.backoff_weight << " " << a.pdf_const() << " )";

    return s;
}
480
481// **********************************************************************
482
// Set every member to its documented default; called before any real
// initialisation.
void EST_Ngrammar::default_values()
{
    p_representation = EST_Ngrammar::sparse;
    p_entry_type = EST_Ngrammar::frequencies; // by default
    sparse_representation.clear();
    allow_oov=false;
    p_num_samples = 0;
    p_num_states = 0;
    p_states = 0;
    vocab = 0;
    pred_vocab = 0;
    backoff_threshold = 1.0;
    backoff_unigram_floor_freq = 0.0;
}
497
// Destructor: frees the dense/sparse state array.
// NOTE(review): vocab/pred_vocab/backoff_representation allocated in
// the init_* methods are not freed here — confirm intended ownership.
EST_Ngrammar::~EST_Ngrammar()
{
    delete [] p_states;
}
502
// Placeholder: clearing a built grammar is not yet implemented.
void EST_Ngrammar::clear()
{
    // to do
}
507
// Initialise an order-o grammar whose predictors and predictees share
// one vocabulary built from `wordlist`.
bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
			const EST_StrList &wordlist)
{
    return (bool)(init_vocab(wordlist) && p_init(o,r));
}
513
// Initialise an order-o grammar with separate predictor (`wordlist`)
// and predictee (`predlist`) vocabularies.
bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
			const EST_StrList &wordlist,
			const EST_StrList &predlist)
{
    return (bool)(init_vocab(wordlist,predlist) && p_init(o,r));
}
520
// Initialise from an existing vocabulary object (not copied; caller
// retains ownership), used for both predictors and predictees.
bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
			EST_Discrete &v)
{
    vocab = &v;
    pred_vocab = vocab;
    vocab_pdf.init(pred_vocab);
    return p_init(o,r);
}
529
530bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
532{
533 vocab = &v;
534 pred_vocab = &pv;
535 vocab_pdf.init(pred_vocab);
536 return p_init(o,r);
537}
538
539bool EST_Ngrammar::p_init(int o, EST_Ngrammar::representation_t r)
540{
541 if (o <= 0)
542 {
543 cerr << "EST_Ngrammar order must be > 0" << endl;
544 return false;
545 }
546
547 p_order = o;
548 p_representation = r;
549 p_number_of_sentences = 0;
550
551 switch(p_representation)
552 {
553
554 case EST_Ngrammar::sparse:
555 sparse_representation.init(p_order);
556 return true;
557 break;
558
559 case EST_Ngrammar::dense:
560 return init_dense_representation();
561 break;
562
563 case EST_Ngrammar::backoff:
564 return init_backoff_representation();
565 break;
566
567 default:
568 cerr << "Unknown internal representation requested for EST_Ngrammar"
569 << endl;
570 return false;
571 break;
572 }
573}
574
// Allocate the dense representation: a flattened (vocab^(order-1))
// array of states, one per possible history.
bool EST_Ngrammar::init_dense_representation()
{
    // allocate a flattened N-dimensional matrix of states
    int i;

    if (vocab->length() <= 0)
    {
	cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
	     << endl;
	return false;
    }

    // number of histories = |vocab|^(order-1)
    p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
    p_states = new EST_NgrammarState[p_num_states];
    for (i=0; i < p_num_states; i++)
	p_states[i].init(i,pred_vocab);

    return true;
}
594
595bool EST_Ngrammar::init_sparse_representation()
596{
597
598 if (vocab->length() <= 0)
599 {
600 cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
601 << endl;
602 return false;
603 }
604
605 p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
606 p_states = new EST_NgrammarState[p_num_states];
607
608 return (bool)(p_states != NULL);
609}
610
// Allocate the root node of the backoff tree (level 0, over `vocab`).
bool EST_Ngrammar::init_backoff_representation()
{

    // nothing much to do
    backoff_representation = new EST_BackoffNgrammarState;
    backoff_representation->init(vocab,0);
    return true;
}
619
620
// Decode a dense-state index back into its (order-1)-word history; the
// first word is the most significant digit (matches
// 'find_dense_state_index').  The last word of the ngram cannot be
// recovered and is left unset.
// NOTE(review): the returned vector is heap-allocated and never freed
// by this class — the caller leaks it unless it deletes &result.
const EST_StrVector &EST_Ngrammar::make_ngram_from_index(const int index) const
{
    int i,ind=index;
    EST_StrVector *ngram = new EST_StrVector;
    ngram->resize(p_order-1); // exclude last word

    // matches 'find_dense_state_index' so first word is most significant
    // also, cannot fill in last word

    for(i=p_order-2;i>=0;i--)
    {
#if defined(sun) && ! defined(__svr4__)
/* SunOS */
	int rem = ind%vocab->length();
	int quot = ind/vocab->length();
	(*ngram)[i] = wordlist_index(rem);
	ind = quot;
#else
	div_t d = div(ind,vocab->length());
	(*ngram)[i] = wordlist_index(d.rem);
	ind = d.quot;
#endif
    }

    return *ngram;
}
647
648
649
650
// Build the vocabulary from `word_list`; predictors and predictees
// share it.
bool EST_Ngrammar::init_vocab(const EST_StrList &word_list)
{
    vocab = new EST_Discrete();
    if(!vocab->init(word_list))
	return false;

    pred_vocab = vocab; // same thing in this case
    vocab_pdf.init(pred_vocab);

    return (bool)(vocab != NULL);
}
662
// Build separate predictor (`word_list`) and predictee (`pred_list`)
// vocabularies.
bool EST_Ngrammar::init_vocab(const EST_StrList &word_list,
			      const EST_StrList &pred_list)
{
    vocab = new EST_Discrete();
    if(!vocab->init(word_list))
	return false;
    pred_vocab = new EST_Discrete();
    if(!pred_vocab->init(pred_list))
	return false;
    vocab_pdf.init(pred_vocab);

    return (bool)(vocab != NULL);
}
676
677bool EST_Ngrammar::check_vocab(const EST_StrList &word_list)
678{
679 EST_Discrete *comp_vocab = new EST_Discrete();
680
681 if(!comp_vocab->init(word_list))
682 {
683 delete comp_vocab;
684 return false;
685 }
686
687 if(*vocab != *comp_vocab)
688 {
689 delete comp_vocab;
690 return false;
691 }
692
693 delete comp_vocab;
694 return true;
695}
696
// Map a predictor-vocabulary index back to its word.
const EST_String & EST_Ngrammar::wordlist_index(int i) const
{
    return vocab->name(i);
}
701
// Look up `word` in the predictee vocabulary; falls back to the OOV
// marker when allow_oov is set.  Returns -1 when no index is found.
int EST_Ngrammar::predlist_index(const EST_String &word) const
{

    if (word=="") // can't lookup
	return -1;

    int i;
    i = pred_vocab->index(word);
    if(i >= 0 )
	return i;

    cerr << "Word \"" << word << "\" is not in the predictee word list" << endl;

    if (allow_oov)
    {
	i = pred_vocab->index(OOV_MARKER);
	if(i >= 0)
	    return i;

	cerr << "Even " << OOV_MARKER << " is not in the predictee word list !" << endl;
    }

    return -1;
}
726
727
// Map a predictee-vocabulary index back to its word.
const EST_String & EST_Ngrammar::predlist_index(int i) const
{
    return pred_vocab->name(i);
}
732
// Look up `word` in the predictor vocabulary; falls back to the OOV
// marker when allow_oov is set.  Returns -1 when no index is found.
// `report` suppresses the diagnostic messages when false.
int EST_Ngrammar::wordlist_index(const EST_String &word, const bool report) const
{

    if (word=="") // can't lookup
	return -1;

    int i;
    i = vocab->index(word);
    if(i >= 0 )
	return i;

    if(report)
	cerr << "Word \"" << word << "\" is not in the word list" << endl;

    if (allow_oov)
    {
	i = vocab->index(OOV_MARKER);
	if(i >= 0)
	    return i;
	if(report)
	    cerr << "Even " << OOV_MARKER << " is not in the word list !" << endl;
    }

    return -1;
}
758
// Build the grammar from a list of text files.
//  prev/prev_prev : sentence-start markers, last : sentence-end marker
//  input_format   : "", "sentence_per_line", "sentence_per_file",
//                   "ngram_per_line"
//  oov_mode       : "skip_ngram" | "skip_sentence" | "skip_file" |
//                   "use_oov_marker" — how out-of-vocabulary words are
//                   handled (possibly via a preprocessed temp file)
//  mincount/maxcount : passed on to compute_backoff_weights() for
//                   backoff grammars
bool EST_Ngrammar::build(const EST_StrList &filenames,
			 const EST_String &prev,
			 const EST_String &prev_prev,
			 const EST_String &last,
			 const EST_String &input_format,
			 const EST_String &oov_mode,
			 const int mincount,
			 const int maxcount)
{

    p_sentence_start_marker = prev;
    p_sentence_end_marker = last;


    // when backing off, safest modes ...
    if( (p_representation == EST_Ngrammar::backoff) &&
	(oov_mode != "skip_file") &&
	(oov_mode != "skip_sentence"))
	cerr << "Warning : building a backoff grammar" << endl
	     << " with oov_mode '" << oov_mode
	     << "' is not recommended !" << endl;

    // validate oov_mode
    if( (oov_mode != "skip_ngram") &&
	(oov_mode != "skip_sentence") &&
	(oov_mode != "skip_file") &&
	(oov_mode != "use_oov_marker") )
    {
	cerr << "Unknown oov_mode '" << oov_mode << "'" << endl;
	return false;
    }

    if( (oov_mode == "skip_sentence") &&
	(input_format == "ngram_per_line"))
    {
	cerr << "Sorry, with input format 'ngram_per_line' you cannot " << endl
	     << " select oov_mode 'skip_sentence'" << endl;
	return false;
    }

    if(oov_mode == "use_oov_marker")
	allow_oov = true;
    else
	allow_oov = false;

    bool skip_this;
    EST_String new_filename;
    EST_Litem *p;
    for (p = filenames.head(); p; p = p->next())
    {
	cerr << "Building from " << filenames(p) << endl;

	// decide whether this file needs OOV preprocessing; this may
	// produce a filtered temporary file in new_filename
	skip_this=false;
	if( ((oov_mode == "skip_sentence") &&
	     (input_format == "sentence_per_file")) ||
	    (oov_mode == "skip_file") )
	    skip_this = oov_preprocess(filenames(p),new_filename,
				       "skip if found");

	else if( ((oov_mode == "skip_sentence") &&
		  (input_format == "sentence_per_line")) ||
		 ((oov_mode == "skip_ngram") &&
		  (input_format == "ngram_per_line")) )
	    oov_preprocess(filenames(p),new_filename,"eliminate lines");

	else
	    new_filename = filenames(p);


	if(skip_this)
	{
	    cerr << "Skipping " << filenames(p)
		 << " (out of vocabulary words found)" << endl;
	}
	else
	{

	    // accumulate counts from this (possibly filtered) file
	    switch(p_representation)
	    {
	    case EST_Ngrammar::sparse:
	    {
		if (input_format != "")
		{
		    cerr << "Can't build sparse ngram from '" << input_format;
		    cerr << "' data" << endl;
		    return false;
		}
		else if (!build_sparse(new_filename,prev,prev_prev,last))
		    return false;
	    }
	    break;

	    case EST_Ngrammar::dense:
		if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
		    return false;
		break;

	    case EST_Ngrammar::backoff:
		if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
		    return false;
		break;

	    default:
		cerr << "Unknown internal representation set for EST_Ngrammar"
		     << endl;
		return false;
		break;
	    }
	}

	// remove the temporary file if preprocessing created one
	if((new_filename != filenames(p)) &&
	   (new_filename != "") &&
	   !delete_file(new_filename) )
	{
	    cerr << "Warning : couldn't remove temporary file : "
		 << new_filename << endl;
	}

    } // loop round files

    if (p_representation == EST_Ngrammar::backoff)
	return compute_backoff_weights(mincount,maxcount);

    return true;
}
883
// Record one observation of the ngram `words` (weighted by `count`):
// updates the marginal predictee pdf and the per-history state (or the
// backoff tree, depending on representation).
void EST_Ngrammar::accumulate(const EST_StrVector &words,
			      const double count)
{
    if (words.n() < p_order)
	cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
    else
    {
	p_num_samples++;
	const EST_String &w = lastword(words);
	vocab_pdf.cumulate(w,count);

	switch(p_representation)
	{
	case EST_Ngrammar::dense:
	case EST_Ngrammar::sparse:
	    find_state(words).cumulate(w,count);
	    break;

	case EST_Ngrammar::backoff:
	    backoff_representation->accumulate(words,count);
	    break;

	default:
	    cerr << "EST_Ngrammar::accumulate : invalid representation !"
		 << endl;
	    break;
	}
    } // else
}
913
// Integer-index version of accumulate(): record one observation of the
// ngram `words` (vocabulary indices), weighted by `count`.
void EST_Ngrammar::accumulate(const EST_IVector &words,
			      const double count)
{

    /*
      int i;
      for(i=0;i<words.n();i++)
      {
      cerr << vocab_pdf.item_name(words(i));
      cerr << " ";
      }
      cerr << endl;
    */

    if (words.n() < p_order)
	cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
    else
    {
	p_num_samples++;
	vocab_pdf.cumulate(words(p_order-1),count);

	switch(p_representation)
	{

	case EST_Ngrammar::dense:
	case EST_Ngrammar::sparse:
	    find_state(words).cumulate(words(p_order-1),count);
	    break;

	case EST_Ngrammar::backoff:
	    backoff_representation->accumulate(words,count);
	    break;

	default:
	    cerr << "EST_Ngrammar::accumulate : invalid representation !"
		 << endl;
	    break;
	} // switch
    }
}
954
955
// Does the grammar contain this ngram?  Dense grammars contain every
// ngram; sparse never report one; backoff grammars consult the tree
// (unigrams use threshold 0, longer ngrams use backoff_threshold).
bool EST_Ngrammar::ngram_exists(const EST_StrVector &words) const
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
	return false;
	break;

    case EST_Ngrammar::dense:
	return true; // always exists !
	break;

    case EST_Ngrammar::backoff:
    {
	if(words.n()==1)
	    return backoff_representation->ngram_exists(words,0);
	else
	    return backoff_representation->ngram_exists(words,backoff_threshold);
    }
    break;

    default:
	cerr << "ngram_exists: unknown ngrammar representation" << endl;
	break;
    }
    return false;
}
983
// Backoff-only variant: does the ngram exist with frequency above the
// given threshold?
bool EST_Ngrammar::ngram_exists(const EST_StrVector &words, const double threshold) const
{
    if (p_representation != EST_Ngrammar::backoff)
    {
	cerr << "Not a backoff grammar !" << endl;
	return false;
    }

    return backoff_representation->ngram_exists(words,threshold);

}
995
996
997const double EST_Ngrammar::get_backoff_weight(const EST_StrVector &words) const
998{
999 if(p_representation == EST_Ngrammar::backoff)
1000 return backoff_representation->get_backoff_weight(words);
1001 else
1002 {
1003 cerr << "Can't get backoff weight - not a backed off ngrammar !" << endl;
1004 return 0;
1005 }
1006}
1007
// Backoff-only: store a backoff weight for the context `words`.
bool EST_Ngrammar::set_backoff_weight(const EST_StrVector &words, const double w)
{
    if(p_representation == EST_Ngrammar::backoff)
	return backoff_representation->set_backoff_weight(words,w);
    else
    {
	cerr << "Can't set backoff weight - not a backed off ngrammar !" << endl;
	return false;
    }
}
1018
1019
// Delegate building of the sparse representation to its own class.
bool EST_Ngrammar::build_sparse(const EST_String &filename,
				const EST_String &prev,
				const EST_String &prev_prev,
				const EST_String &last)
{
    sparse_representation.build(filename,prev,prev_prev,last);
    return true;
}
1028
1029
1030void EST_Ngrammar::fill_window_start(EST_IVector &window,
1031 const EST_String &prev,
1032 const EST_String &prev_prev) const
1033{
1034 int i;
1035
1036 for (i=0; i<window.n()-1; i++)
1037 window[i] = wordlist_index(prev_prev);
1038 window[i++] = wordlist_index(prev);
1039}
1040
1041void EST_Ngrammar::fill_window_start(EST_StrVector &window,
1042 const EST_String &prev,
1043 const EST_String &prev_prev) const
1044{
1045 int i;
1046
1047 for (i=0; i<window.n()-1; i++)
1048 window[i] = prev_prev;
1049 window[i++] = prev;
1050}
1051
// Scan `filename` (or stdin for "-") for out-of-vocabulary words.
//  what = "skip if found"    : return true as soon as any OOV word is
//                              seen (caller skips the whole file);
//  what = "eliminate lines"  : copy the input to a temporary file,
//                              dropping lines containing OOV words.
// new_filename receives the file the caller should actually read
// (original, temp copy, or "" when the input is discarded).
// NOTE(review): returns true for "skip" and false otherwise in most
// paths — callers only use the result in "skip if found" mode.
bool EST_Ngrammar::oov_preprocess(const EST_String &filename,
				  EST_String &new_filename,
				  const EST_String &what)
{
    ostream *ost=0;
    EST_TokenStream ts;
    new_filename="";
    int bad_line_count=0;
    int good_line_count=0;

    // if filename is stdin we have to make a temporary file
    // if we are eliminating lines we also need a temporary file

    // what = "skip if found" | "eliminate lines"

    bool write_out = false;
    if( (what == "eliminate lines") || (filename == "-") )
	write_out = true;

    // open the original files for reading
    if (filename == "-")
    {
	if( ts.open(stdin, FALSE) == -1)
	{
	    cerr << "EST_Ngrammar:: failed to open stdin";
	    cerr << " for reading" << endl;
	    return false;
	}
    }
    else if (ts.open(filename) == -1){
	cerr << "EST_Ngrammar: failed to open file \"" << filename
	     << "\" for reading" << endl;
	return false; // hmmm ?
    }

    // open the output file if necessary
    if(write_out)
    {
	new_filename = make_tmp_filename();
	ost = new ofstream(new_filename);

	if(!(*ost))
	{
	    cerr << "Ngrammar: couldn't create temporary file \""
		 << new_filename << "\"" << endl;
	    new_filename="";
	    return false;
	}
    }
    else
	new_filename = filename;

    EST_String s,this_line;
    bool bad_line=false;
    while (!ts.eof())
    {
	s=ts.get().string();

	if (!bad_line && (s != ""))
	{
	    // wordlist_index(s,false): quiet lookup, -1 means OOV
	    if(wordlist_index(s,false) < 0)
	    {

		if(what == "eliminate lines")
		{
		    bad_line=true;
		}
		else // skip file
		{
		    if(write_out)
		    {
			// input was stdin, but we are now discarding it
			delete ost;
			if(!delete_file(new_filename))
			    cerr << "Warning : couldn't delete temporary file '"
				 << new_filename << "'" << endl;
		    }
		    new_filename="";
		    return false;
		}

	    }
	    else
		this_line += s + " ";
	}

	// write out the accumulated line at end-of-line
	if(ts.eoln())
	{
	    if(bad_line)
	    {
		bad_line_count++;
	    }
	    else
	    {
		if(write_out)
		{
		    // this_line
		    *ost << this_line << endl;
		    good_line_count++;
		}
	    }
	    bad_line=false;
	    this_line = "";
	}

    }
    cerr << "skipped " << bad_line_count << " and kept "
	 << good_line_count << " lines from file " << filename << endl;
    return true;
}
1163
// Accumulate ngram counts from one file (or stdin for "-").
//  input_format "" / "sentence_per_line" : sliding window, end-of-line
//      is end-of-sentence (the window is closed with `last` and
//      restarted with prev/prev_prev);
//  "sentence_per_file" : sliding window across the whole file;
//  "ngram_per_line"    : each line is one complete ngram, no sliding.
// Words not in the vocabulary poison the window for the next p_order
// tokens (bad_word counts down) so no ngram containing them is counted.
bool EST_Ngrammar::build_ngram(const EST_String &filename,
			       const EST_String &prev,
			       const EST_String &prev_prev,
			       const EST_String &last,
			       const EST_String &input_format)
{
    p_entry_type = EST_Ngrammar::frequencies;
    int bad_word=0;
    EST_String s;
    EST_TokenStream ts;
    int eoln_is_eos = FALSE;
    int sliding_window = TRUE;
    int count=0;
    clear();

    // set flags according to the requested input format
    if ( (input_format == "") || (input_format == "sentence_per_line") )
    {
	// may do something here later
	eoln_is_eos = TRUE;
	sliding_window = TRUE;
    }
    else if (input_format == "sentence_per_file")
    {
	eoln_is_eos = FALSE;
	sliding_window = TRUE;
	p_number_of_sentences = 1;
    }
    else if(input_format == "ngram_per_line")
    {
	eoln_is_eos = FALSE;
	sliding_window = FALSE;
	p_number_of_sentences = 1;
    }
    else
    {
	cerr << "Can't build from '" << input_format << "' data" << endl;
	return false;
    }

    EST_IVector window(p_order);

    if (filename == "-")
    {
	if( ts.open(stdin, FALSE) == -1)
	{
	    cerr << "EST_Ngrammar:: failed to open stdin";
	    cerr << " for reading" << endl;
	    return false;
	}
    }
    else if (ts.open(filename) == -1){
	cerr << "EST_Ngrammar: failed to open \"" << filename
	     << "\" for reading" << endl;
	return false;
    }

    // default input format is one sentence per line
    // full stops and commas etc. are taken literally
    // i.e. the user must do the preprocessing

    // we assume that all of prev,prev_prev,last are either
    // null or set, not a mixture of the two

    // at start window contains
    // [prev_prev, prev_prev, ....., prev_prev, prev]
    // This is not added to the ngram model though
    if(sliding_window)
    {
	fill_window_start(window,prev,prev_prev);
	if (window(p_order-1) == -1)
	    bad_word = p_order;
	else if( (p_order>1) && (window(p_order-2) == -1))
	    bad_word = p_order-1;
	else
	    bad_word=0;

	if(bad_word > 0)
	    cerr << "at start : bad word at " << bad_word << endl;

    }
    while (!ts.eof())
    {
	s=ts.get().string();

	if (s != "")
	{
	    if(sliding_window)
	    {
		// shift the window left and append the new word
		slide(window,-1);
		if (bad_word > 0)
		    bad_word--;

		window[p_order-1] = wordlist_index(s);
		if (window(p_order-1) < 0)
		{
		    cerr << "EST_Ngrammar::build_ngram " <<
			" word \"" << s << "\" is not in vocabulary, skipping"
			 << endl;
		    bad_word = p_order;
		}
		if (bad_word == 0)
		    accumulate(window);
		else
		{
		    cerr << "not accumulating : bad word at " << bad_word;
		    cerr << " window=" << window; // l<< endl;
		    bad_word--;
		}
	    }
	    else
	    {
		// not a sliding window - wait for end of line and accumulate
		if(count < p_order)
		{
		    if(count == p_order-1) // last thing in window (predictee)
			window[count++] = predlist_index(s);
		    else // not last thing (predictor)
			window[count++] = wordlist_index(s);

		    if (window(count-1) < 0)
		    {
			cerr << "EST_Ngrammar::build_ngram " <<
			    " word \"" << s << "\" is not in vocabulary, skipping"
			     << endl;
			bad_word = 1;
		    }
		}
		else
		    cerr << "Too many items on line - ignoring trailing ones !" << endl;
	    }
	}

	// end of sentence ?
	if(ts.eoln())
	{

	    if(!sliding_window)
	    {
		// ngram_per_line: accumulate the completed window
		if((count == p_order) && bad_word == 0)
		    accumulate(window);
		count = 0;
		bad_word = 0;
	    }
	    else if (eoln_is_eos)
	    {
		// have there been any accumulates since the last increment
		if (window(p_order-1) != wordlist_index(last))
		    p_number_of_sentences += 1;

		// close the sentence with the end marker ...
		slide(window,-1);
		window[p_order-1] = wordlist_index(last);

		if(window(p_order-1) == -1)
		{
		    //cerr << "WARNING : skipping over bad word '"
		    //<< last << "'" << endl;
		    bad_word = p_order;
		}

		if (bad_word == 0)
		    accumulate(window);

		// ... and restart it with the start markers
		fill_window_start(window,prev,prev_prev);

		// check for bad tags
		if (window(p_order-1) == -1)
		    bad_word = p_order;
		else if( (p_order>1) && (window(p_order-2) == -1) )
		    bad_word = p_order-1;
		else
		    bad_word=0;
	    }
	}
    }

    // if last accumulate was at end of sentence
    // we don't need to do the extra accumulate
    if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
    {

	// and finish with the ngram [.....last few words,last]
	slide(window,-1);
	window[p_order-1] = wordlist_index(last);

	if (window(p_order-1) == -1)
	    bad_word=p_order;

	if (bad_word == 0)
	{
	    accumulate(window);
	    p_number_of_sentences += 1;
	}
    }

    ts.close();

    cerr << "Accumulated " << p_number_of_sentences << " sentences." << endl;
    return true;
}
1363
1364
// Compute and store the backoff weight for the context formed by the
// first n-1 slots of 'ngram' (callback for EST_Ngrammar::iterate).
// The weight makes the predictee probabilities for this context sum to
// one:  (1 - sum of seen ngram probs) / (sum of backed-off probs for
// the unseen predictees).  The final (void*) parameter is unused.
void compute_backoff_weight(EST_Ngrammar *n, EST_StrVector &ngram, void*)
{
    int i,j;
    double sum1=0,sum2=0;


    // the context : everything except the last (predictee) slot
    EST_StrVector new_ngram;
    new_ngram.resize(ngram.n()-1);
    for(i=0;i<new_ngram.n();i++)
	new_ngram[i] = ngram(i);


    cerr << "computing backoff w for ";
    for(i=0;i<new_ngram.n();i++)
	cerr << new_ngram(i) << " ";
    cerr << " \r";

    cerr << endl;

    // only have backoff weights if the associated n-1 gram exists
    if (!n->ngram_exists(new_ngram))
    {
	cerr << " NONE";

	// if ngram really exists, but was below threshold
	// set backoff weight to 1 (always back off)
	if (n->ngram_exists(new_ngram,0))
	{
	    if(!n->set_backoff_weight(new_ngram,1))
		cerr << "WARNING : couldn't set weight !" << endl;
	    cerr << " = 1";
	}
	cerr << endl;
	return;
    }

    // save
    // (the last slot is overwritten in the loop below; restored on exit)
    EST_String tmp = ngram(ngram.n()-1);

    // for each possible word in the last position
    for(j=0;j<n->get_pred_vocab_length();j++)
    {
	ngram[ngram.n()-1] = n->get_pred_vocab_word(j);

	for(i=0;i<ngram.n();i++)
	    cerr << ngram(i) << " ";

	if (n->ngram_exists(ngram))
	{
	    cerr << n->probability(ngram) << " exists " << endl;
	    // if we have the complete ngram, add it to sum1
	    sum1 += n->probability(ngram);
	}
	else
	{

	    // make this faster - take out of loop

	    // or add the n-1 gram, excluding the first word to sum2
	    EST_StrVector tmp_ngram;
	    tmp_ngram.resize(ngram.n()-1);
	    for(i=0;i<tmp_ngram.n();i++)
		tmp_ngram[i] = ngram(i+1);

	    cerr << " unseen, P(";
	    for(i=0;i<tmp_ngram.n();i++)
		cerr << tmp_ngram(i) << " ";

	    cerr << ") = " << n->probability(tmp_ngram) << " " << endl;
	    sum2 += n->probability(tmp_ngram);
	}
    }

    // and fill in the backoff weight

    if (sum2 == 0) // no unseen ngrams, no backing off
    {
	if(!n->set_backoff_weight(new_ngram,1))
	    cerr << "WARNING : couldn't set weight !" << endl;
	// NOTE(review): the weight stored above is 1 but the trace
	// prints 0 -- the trace output looks inconsistent; confirm
	cerr << 0 << endl;
    }
    else
    {
	// sum1 > 1 would give a negative weight : dump diagnostics and
	// force a weight of 0 instead
	if (sum1 > 1)
	{
	    cerr << "NEGATIVE WEIGHT for ";
	    for(i=0;i<new_ngram.n();i++)
		cerr << new_ngram(i) << " ";
	    cerr << endl;

	    cerr << "sum1=" << sum1 << " sum2=" << sum2;
	    cerr << " " << (1 - sum1) / sum2 << endl;

	    for(j=0;j<n->get_pred_vocab_length();j++)
	    {
		ngram[ngram.n()-1] = n->get_pred_vocab_word(j);


		if (n->ngram_exists(ngram))
		{

		    for(i=0;i<ngram.n();i++)
			cerr << ngram(i) << " ";
		    cerr << " exists, prob = ";
		    cerr << n->probability(ngram,false,true) << endl;
		}

	    }
	    sum1 = 1;
	    sum2 = 1; // to get a weight of 0
	}

	if(!n->set_backoff_weight(new_ngram, (1 - sum1) / sum2 ))
	    cerr << "WARNING : couldn't set weight !" << endl;

	cerr << "sum1=" << sum1 << " sum2=" << sum2;
	cerr << " weight=" << (1 - sum1) / sum2 << endl;
    }

    // restore
    ngram[ngram.n()-1] = tmp;

}
1488
1489bool EST_Ngrammar::compute_backoff_weights(const int mincount,
1490 const int maxcount)
1491{
1492
1493
1494 backoff_threshold = mincount;
1495 backoff_discount = new EST_DVector[p_order];
1496
1497 int i,o;
1498
1499 // but we must have children from the root node
1500 // for every unigram, since these can not be backed off
1501 // (must therefore be present, even if zero)
1502 // smoothing will fill in a floor for any zero frequencies
1503
1504 backoff_restore_unigram_states();
1505
1506 Good_Turing_discount(*this,maxcount,0.5);
1507
1508 // and since some frequencies will have been set to zero
1509 // we have to prune away those branches of the tree
1510 //prune_backoff_representation();
1511
1512
1513 // now compute backoff weights, to satisfy the
1514 // 'probs sum to 1' condition
1515
1516 // condition is (trigram case):
1517 // sum( p(w3|w1,w2) ) = 1, over all w1,w2
1518
1519 // p(wd3|wd1,wd2)=
1520 // if(trigram exists) p_3(wd1,wd2,wd3)
1521 // else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
1522 // else p(wd3|w2)
1523 // and for a given wd3 they all sum to 1
1524
1525 // so compute three sums :
1526 // sum(p_3(wd1,wd2,wd3)), for all w1,w2 where we have the trigram
1527 // sum(p(wd3|wd2)), for all w1,w2 where we have the bigram(w1,w2)
1528 // but not the trigram
1529 // sum(p(wd3|wd2)), for all w1,w2 where we don't have either
1530
1531 // could probably do this recursively and more elegantly
1532 // but it makes my head hurt
1533
1534 for (o=2;o<=order();o++) // for (o=1 .. .????
1535 {
1536
1537 cerr << "Backing off order " << o << endl;
1538 //cerr << "=======================" << endl;
1539
1540 EST_StrVector words;
1541 words.resize(o);
1542
1543 // for each possible history (ngram, excluding last word)
1544 // compute the backoff weight
1545 for(i=0;i<o-1;i++)
1546 words[i]="";
1547 words[o-1] = "!FILLED!";
1548 iterate(words,&compute_backoff_weight,NULL);
1549
1550 //cerr << "=========done==========" << endl;
1551
1552 }
1553
1554 return true;
1555}
1556
1557
1558void EST_Ngrammar::backoff_restore_unigram_states()
1559{
1560 // for every item in the root pdf, look for a child
1561 // and make it if not present
1562
1563 // short cut is to cumulate some zero freq bigrams
1564 // to force construction of states where necessary
1565 EST_StrVector words;
1566 int j;
1567 words.resize(2);
1568 words[0] = "wibble"; // doesn't matter what this is, count is 0
1569 for(j=0;j<get_pred_vocab_length();j++)
1570 {
1571 words[1] = get_pred_vocab_word(j);
1572 backoff_representation->accumulate(words,0);
1573 }
1574
1575}
1576
1577
void EST_Ngrammar::prune_backoff_representation(EST_BackoffNgrammarState *start_state)
{
    // Recursively remove branches of the backoff tree whose frequency
    // count is (near) zero.  A NULL start_state means start at the root.

    if (start_state == NULL)
	start_state = backoff_representation;

    //cerr << "pruning state " << start_state << endl << *start_state << endl;

    // remove any branches with zero frequency count

    // find children of this state with zero freq and zap them
    EST_Litem *k;
    double freq;
    EST_String name;
    for (k=start_state->pdf_const().item_start();
	 !start_state->pdf_const().item_end(k);
	 k = start_state->pdf_const().item_next(k))
    {
	start_state->pdf_const().item_freq(k,name,freq);
	if (freq < TINY_FREQ)
	{
	    EST_BackoffNgrammarState *child = start_state->get_child(name);

	    // only the child state is removed; the pdf entry itself is
	    // untouched, so the iteration over k remains valid
	    if (child!=NULL)
	    {
		//cerr << "Zapping " << name << " : " << child->level()
		//<< " " << child<< endl;
		start_state->remove_child(child,name);
	    }
	}

    }

    // then recurse through remaining children
    for (k=start_state->pdf_const().item_start();
	 !start_state->pdf_const().item_end(k);
	 k = start_state->pdf_const().item_next(k))
    {
	start_state->pdf_const().item_freq(k,name,freq);
	EST_BackoffNgrammarState *child = start_state->get_child(name);
	if (child!=NULL)
	{
	    //cerr << "recursing to " << name << " : " << child->level() << endl;
	    //if((child!=NULL) && (child->level() == 3))
	    //cerr << *child << endl;
	    prune_backoff_representation(child);
	}
    }
}
1627
1628
1629ostream& operator<<(ostream& s, EST_Ngrammar &n)
1630{
1631 switch(n.p_representation)
1632 {
1633 case EST_Ngrammar::sparse:
1634 n.sparse_representation.print_freqs(s);
1635 break;
1636
1637 case EST_Ngrammar::dense:
1638 s << "Dense" << endl;
1639 // s << n.dense_representation << endl;
1640 break;
1641
1642 case EST_Ngrammar::backoff:
1643 s << "Backoff" << endl;
1644 s << *(n.backoff_representation) << endl;
1645 break;
1646
1647 default:
1648 cerr << "Unknown internal representation of EST_Ngrammar : can't print"
1649 << endl;
1650 break;
1651 }
1652
1653 return s;
1654}
1655
1656bool
1657EST_Ngrammar::set_entry_type(EST_Ngrammar::entry_t new_type)
1658{
1659 if (new_type == p_entry_type)
1660 return true;
1661
1662 // entry type conversion --- hmmmmm
1663
1664 cerr << "Couldn't do entry type conversion !" << endl;
1665 return false;
1666}
1667
1668bool EST_Ngrammar::sparse_to_dense()
1669{
1670 cerr << "EST_Ngrammar::sparse_to_dense() "
1671 <<" not implemented" << endl;
1672 return false;
1673}
1674
1675bool EST_Ngrammar::dense_to_sparse()
1676{
1677 cerr << "EST_Ngrammar::dense_to_sparse()"
1678 <<" not implemented" << endl;
1679 return false;
1680}
1681
1682int EST_Ngrammar::find_dense_state_index(const EST_IVector &words,
1683 int index) const
1684{
1685 int i,ind=0;
1686 int vl,wa;
1687 for(i=0;i<p_order-1;i++)
1688 {
1689 vl = vocab->length();
1690 wa = words.a_no_check(i+index);
1691 if (wa < 0) wa = 0;
1692 // printf("awb_debug ngrammar i %d ind %d v.length() %d words.a_no_check() %d\n",i,ind,vl,wa);
1693 ind = ind*vl + wa;
1694 }
1695
1696 return ind;
1697}
1698
1699int EST_Ngrammar::find_next_state_id(int state, int word) const
1700{
1701 // Find a new state index from the current after moving with word
1702 int i,f;
1703
1704 if (p_order == 1)
1705 return 0;
1706 for (f=1,i=0; i<p_order-2; i++)
1707 f*=vocab->length();
1708 return ((state%f)*vocab->length())+word;
1709}
1710
EST_NgrammarState &EST_Ngrammar::find_state(const EST_StrVector &words)
{
    // Map an ngram of word strings to its state (dense representation
    // only).  Any lookup failure falls back to p_states[0].
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
	//	return p_states[sparse_representation.find_state(words)];
	return p_states[0];
	break;

    case EST_Ngrammar::dense:
	{
	    EST_IVector tmp(words.n());
	    int i;
	    // context words use the full wordlist ...
	    for(i=0;i<p_order-1;i++)
	    {
		tmp[i] = wordlist_index(words(i));
		if (tmp(i) == -1) break;  // word not in vocabulary
	    }
	    // ... the predictee uses the prediction vocabulary.
	    // NOTE(review): if the loop above exited early, 'i' is the
	    // failed context slot, which is overwritten here -- confirm
	    // this is the intended behaviour
	    tmp[i] = pred_vocab->index(words(i));
	    if (tmp(i) == -1) break;      // exits the switch -> p_states[0]
	    return p_states[find_dense_state_index(tmp)];
	}
	break;

    case EST_Ngrammar::backoff:
	cerr << "find_state: not valid in backoff mode !" << endl;
	break;

    default:
	cerr << "find_state: unknown ngrammar representation" << endl;
	break;
    }

    return p_states[0];
}
1746
1747
// Const variant of find_state(const EST_StrVector&); see notes there.
const EST_NgrammarState &
EST_Ngrammar::find_state_const(const EST_StrVector &words) const
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
	//  return p_states[sparse_representation.find_state(words)];
	return p_states[0];
	break;

    case EST_Ngrammar::dense:
	{
	    EST_IVector tmp(words.n());
	    int i;
	    // context words use the full wordlist ...
	    for(i=0;i<p_order-1;i++)
	    {
		tmp[i] = wordlist_index(words(i));
		if (tmp(i) == -1) break;  // word not in vocabulary
	    }
	    // ... the predictee uses the prediction vocabulary.
	    // NOTE(review): if the loop above exited early, 'i' is the
	    // failed context slot, which is overwritten here -- confirm
	    tmp[i] = pred_vocab->index(words(i));
	    if (tmp(i) == -1) break;      // exits the switch -> p_states[0]
	    return p_states[find_dense_state_index(tmp)];
	}
	break;

    case EST_Ngrammar::backoff:
	cerr << "find_state_const: not valid in backoff mode !" << endl;
	break;

    default:
	cerr << "find_state: unknown ngrammar representation" << endl;
	break;
    }
    return p_states[0];
}
1783
1784
1785EST_NgrammarState &EST_Ngrammar::find_state(const EST_IVector &words)
1786{
1787 switch(p_representation)
1788 {
1789 case EST_Ngrammar::sparse:
1790 // return p_states[sparse_representation.find_state(words)];
1791 return p_states[0];
1792 break;
1793
1794 case EST_Ngrammar::dense:
1795 return p_states[find_dense_state_index(words)];
1796 break;
1797
1798 case EST_Ngrammar::backoff:
1799 cerr << "find_state: not valid in backoff mode !" << endl;
1800 break;
1801
1802 default:
1803 cerr << "find_state: unknown ngrammar representation" << endl;
1804 break;
1805 }
1806 return p_states[0];
1807}
1808
1809
1810const EST_NgrammarState &
1811EST_Ngrammar::find_state_const(const EST_IVector &words) const
1812{
1813 switch(p_representation)
1814 {
1815 case EST_Ngrammar::sparse:
1816 // return p_states[sparse_representation.find_state(words)];
1817 return p_states[0];
1818 break;
1819 case EST_Ngrammar::dense:
1820 return p_states[find_dense_state_index(words)];
1821 break;
1822
1823 case EST_Ngrammar::backoff:
1824 cerr << "find_state_const: not valid in backoff mode !" << endl;
1825 break;
1826
1827 default:
1828 cerr << "find_state: unknown ngrammar representation" << endl;
1829 break;
1830 }
1831
1832 return p_states[0];
1833}
1834
1835
1836bool EST_Ngrammar::set_representation(EST_Ngrammar::representation_t new_representation)
1837{
1838
1839 if (new_representation == p_representation)
1840 return true;
1841
1842 if (new_representation == EST_Ngrammar::sparse)
1843 return sparse_to_dense();
1844 else if (new_representation == EST_Ngrammar::dense)
1845 return dense_to_sparse();
1846 else
1847 {
1848 cerr << "set_representation: unknown ngrammar representation" << endl;
1849 return FALSE;
1850 }
1851}
1852
1853double EST_Ngrammar::probability(const EST_StrVector &words, bool force, const bool trace) const
1854{
1855 // Probability of this ngram
1856 (void)force;
1857
1858 switch(p_representation)
1859 {
1860 case EST_Ngrammar::sparse:
1861 case EST_Ngrammar::dense:
1862 return find_state_const(words).probability(lastword(words));
1863 break;
1864
1865 case EST_Ngrammar::backoff:
1866 return backoff_probability(words,trace);
1867 break;
1868
1869 default:
1870 cerr << "probability: unknown ngrammar representation" << endl;
1871 return -1;
1872 break;
1873 }
1874}
1875
1876double EST_Ngrammar::frequency(const EST_StrVector &words, bool force, const bool trace) const
1877{
1878 // Frequency of this ngram
1879 (void)force;
1880
1881 switch(p_representation)
1882 {
1883 case EST_Ngrammar::sparse:
1884 case EST_Ngrammar::dense:
1885 return find_state_const(words).frequency(lastword(words));
1886 break;
1887
1888 case EST_Ngrammar::backoff:
1889 return backoff_probability(words,trace); // can't do freqs
1890 break;
1891
1892 default:
1893 cerr << "probability: unknown ngrammar representation" << endl;
1894 return -1;
1895 break;
1896 }
1897}
1898
const EST_String &EST_Ngrammar::predict(const EST_StrVector &words,
					double *prob,int *state) const
{
    // What's the most probable word given this list of words
    // Fills in its probability through 'prob' and (sparse/dense only)
    // the reached state id through 'state'.

    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
    case EST_Ngrammar::dense:
	{
	    const EST_NgrammarState &s = find_state_const(words);
	    *state = s.id();
	    return s.most_probable(prob);
	}
	break;

    case EST_Ngrammar::backoff:
	// NOTE(review): this nulls the local pointer only -- the
	// caller's state variable is left untouched in backoff mode
	state=NULL; // there are no states per se
	return backoff_most_probable(words,prob);
	break;

    default:
	cerr << "probability: unknown ngrammar representation" << endl;
	return EST_String::Empty;
	break;
    }
}
1926
1927const EST_String &EST_Ngrammar::predict(const EST_IVector &words,
1928 double *prob,int *state) const
1929{
1930 // What's the most probable word given this list of words
1931
1932 switch(p_representation)
1933 {
1934 case EST_Ngrammar::sparse:
1935 case EST_Ngrammar::dense:
1936 {
1937 const EST_NgrammarState &s = find_state_const(words);
1938 *state = s.id();
1939 return s.most_probable(prob);
1940 }
1941 break;
1942
1943 case EST_Ngrammar::backoff:
1944 cerr << "probability: IVector access to backoff not supported" << endl;
1945 return EST_String::Empty;
1946 break;
1947
1948 default:
1949 cerr << "probability: unknown ngrammar representation" << endl;
1950 return EST_String::Empty;
1951 break;
1952 }
1953}
1954
1955int EST_Ngrammar::find_state_id(const EST_StrVector &words) const
1956{
1957 switch(p_representation)
1958 {
1959 case EST_Ngrammar::sparse:
1960 case EST_Ngrammar::dense:
1961 {
1962 const EST_NgrammarState &s = find_state_const(words);
1963 return s.id();
1964 }
1965 default:
1966 cerr << "Ngrammar: representation doesn't support states" << endl;
1967 return 0;
1968 break;
1969 }
1970}
1971
1972int EST_Ngrammar::find_state_id(const EST_IVector &words) const
1973{
1974 switch(p_representation)
1975 {
1976 case EST_Ngrammar::sparse:
1977 case EST_Ngrammar::dense:
1978 {
1979 const EST_NgrammarState &s = find_state_const(words);
1980 return s.id();
1981 }
1982 default:
1983 cerr << "Ngrammar: representation doesn't support states" << endl;
1984 return 0;
1985 break;
1986 }
1987}
1988
1989EST_String EST_Ngrammar::get_vocab_word(int i) const
1990{
1991 if (vocab)
1992 return vocab->name(i);
1993 else
1994 return NOVOCAB;
1995}
1996
1997int EST_Ngrammar::get_vocab_word(const EST_String &s) const
1998{
1999 int index = vocab->name(s);
2000 return index;
2001}
2002
2003double EST_Ngrammar::reverse_probability(const EST_StrVector &words,
2004 bool force) const
2005{
2006 // Whats the probability of this ngram-1 given last word in ngram
2007 (void)force;
2008
2009 switch(p_representation)
2010 {
2011 case EST_Ngrammar::sparse:
2012 case EST_Ngrammar::dense:
2013 {
2014 const EST_NgrammarState &s = find_state_const(words);
2015 // need number of occurrences of words[p_order-1]
2016 return s.frequency(lastword(words))/
2017 vocab_pdf.frequency(lastword(words));
2018 }
2019 break;
2020
2021 case EST_Ngrammar::backoff:
2022 return backoff_reverse_probability(words);
2023 break;
2024
2025 default:
2026 cerr << "probability: unknown ngrammar representation" << endl;
2027 return -1;
2028 break;
2029 }
2030}
2031
2032double EST_Ngrammar::reverse_probability(const EST_IVector &words,
2033 bool force) const
2034{
2035 // Whats the probability of this ngram-1 given last word in ngram
2036 (void)force;
2037
2038 switch(p_representation)
2039 {
2040 case EST_Ngrammar::sparse:
2041 case EST_Ngrammar::dense:
2042 {
2043 const EST_NgrammarState &s = find_state_const(words);
2044 // need number of occurrences of words[p_order-1]
2045 return s.frequency(lastword(words))/
2046 vocab_pdf.frequency(lastword(words));
2047 }
2048 break;
2049
2050 case EST_Ngrammar::backoff:
2051 cerr << "probability: reverse prob unavailable for backoff ngram"
2052 << endl;
2053 return -1;
2054 break;
2055
2056 default:
2057 cerr << "probability: unknown ngrammar representation" << endl;
2058 return -1;
2059 break;
2060 }
2061}
2062
EST_Ngrammar::prob_dist(int state) const
{
    // Predictee distribution of a state, addressed by state index.
    return p_states[state].pdf_const();
}
2068
EST_Ngrammar::prob_dist(const EST_StrVector &words) const
{
    // Predictee distribution for a word-string context.

    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
    case EST_Ngrammar::dense:
	{
	    const EST_NgrammarState &s = find_state_const(words);
	    return s.pdf_const();
	}
	break;

    case EST_Ngrammar::backoff:
	// constructed on the fly from the backoff tree
	return backoff_prob_dist(words);
	break;

    default:
	cerr << "probability: unknown ngrammar representation" << endl;
	return PSTnullProbDistribution;
	break;
    }
}
2093
EST_Ngrammar::prob_dist(const EST_IVector &words) const
{
    // Predictee distribution for a context of word indices
    // (index-based access is not available for backoff grammars).

    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
    case EST_Ngrammar::dense:
	{
	    const EST_NgrammarState &s = find_state_const(words);
	    return s.pdf_const();
	}
	break;

    case EST_Ngrammar::backoff:
	cerr << "probability: unsupport IVector access of backoff ngram" << endl;
	return PSTnullProbDistribution;
	break;

    default:
	cerr << "probability: unknown ngrammar representation" << endl;
	return PSTnullProbDistribution;
	break;
    }
}
2119
2120EST_read_status
2121EST_Ngrammar::load(const EST_String &filename)
2122{
2123
2124 EST_read_status r_val;
2125
2126 //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2127 //return r_val;
2128 //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2129 //return r_val;
2130 if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2131 return r_val;
2132 if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2133 return r_val;
2134
2135 // maybe the file is compressed ?
2136 EST_Pathname fname(filename);
2137 EST_String tmp_fname("");
2138
2139 // crude but effective
2140 if (fname.extension() == GZIP_FILENAME_EXTENSION)
2141 tmp_fname = uncompress_file_to_temporary(filename,
2142 "gzip --decompress --stdout");
2143 else if (fname.extension() == COMPRESS_FILENAME_EXTENSION)
2144 tmp_fname = uncompress_file_to_temporary(filename,"uncompress -c");
2145
2146 if(tmp_fname != "")
2147 {
2148 r_val = load(tmp_fname);
2149 delete_file(tmp_fname);
2150 return r_val;
2151 }
2152 else
2153 return misc_read_error;
2154
2155 cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2156 return wrong_format;
2157}
2158
2159EST_read_status
2160EST_Ngrammar::load(const EST_String &filename, const EST_StrList &wordlist)
2161{
2162
2163 // for backed off grammars
2164 // need a wordlist to load ARPA format
2165
2166 EST_read_status r_val;
2167
2168 if ((r_val = load_ngram_arpa(filename, *this, wordlist)) != wrong_format)
2169 return r_val;
2170
2171 // try other types, then check wordlist fits
2172 //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2173 //return r_val;
2174 //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2175 //return r_val;
2176 if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2177 {
2178 if ((r_val == format_ok) && check_vocab(wordlist))
2179 return r_val;
2180 else
2181 {
2182 cerr << "Wordlist file does not match grammar wordlist !" << endl;
2183 return misc_read_error;
2184 }
2185 }
2186
2187 if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2188 {
2189 if ((r_val == format_ok) && check_vocab(wordlist))
2190 return r_val;
2191 else
2192 {
2193 cerr << "Wordlist does not match grammar !" << endl;
2194 return misc_read_error;
2195 }
2196 }
2197
2198
2199 cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2200 return wrong_format;
2201}
2202
2203
2204void
2205EST_Ngrammar::make_htk_compatible()
2206{
2207
2208 cerr << "EST_Ngrammar::make_htk_compatible() not written yet." << endl;
2209 return;
2210}
2211
2212EST_write_status
2213EST_Ngrammar::save(const EST_String &filename, const EST_String type,
2214 const bool trace,double floor)
2215{
2216
2217 if (type == "")
2218 return save(filename,"cstr_ascii",false,floor); // choose default type
2219 if (type == "htk_ascii")
2220 return save_ngram_htk_ascii(filename, *this, floor);
2221 //if (type == "htk_binary")
2222 //return save_htk_binary(filename, *this);
2223 if (type == "arpa")
2224 return save_ngram_arpa(filename, *this);
2225 if (type == "cstr_ascii")
2226 return save_ngram_cstr_ascii(filename, *this,trace,floor);
2227 if (type == "cstr_bin")
2228 return save_ngram_cstr_bin(filename, *this, trace,floor);
2229 if (type == "wfst")
2230 return save_ngram_wfst(filename, *this);
2231
2232 cerr << "EST_Ngrammar::save unknown output file type " << type << endl;
2233 return write_fail;
2234}
2235
2236void EST_Ngrammar::iterate(EST_StrVector &words,
2237 void (*function)(EST_Ngrammar *n,
2238 EST_StrVector &words,
2239 void *params),
2240 void *params)
2241{
2242 int i,j=-1;
2243 EST_String tmp;
2244
2245 // find next item in ngram to fill in
2246 for(i=0;i<words.n();i++)
2247 if (words[i] == "")
2248 {
2249 j=i;
2250 break;
2251 }
2252
2253 if (j==-1)
2254 {
2255 // all filled in, so do the function
2256 (*function)(this,words,params);
2257
2258 }
2259 else
2260 {
2261
2262 tmp = words(j);
2263 if (j==p_order-1) // final position - use pred vocab
2264 {
2265 for(i=0;i<pred_vocab->length();i++){
2266 words[j] = pred_vocab->name(i);
2267 iterate(words,function,params);
2268 }
2269
2270 }
2271 else
2272 {
2273 for(i=0;i<vocab->length();i++){
2274 words[j] = vocab->name(i);
2275 iterate(words,function,params);
2276 }
2277 }
2278 words[j] = tmp;
2279 }
2280}
2281
2282void EST_Ngrammar::const_iterate(EST_StrVector &words,
2283 void (*function)(const EST_Ngrammar *const n,
2284 EST_StrVector &words,
2285 void *params),
2286 void *params) const
2287{
2288 int i,j=-1;
2289 EST_String tmp;
2290
2291 // find next item in ngram to fill in
2292 for(i=0;i<words.n();i++)
2293 if (words[i] == "")
2294 {
2295 j=i;
2296 break;
2297 }
2298
2299 if (j==-1)
2300 {
2301 // all filled in, so do the function
2302 (*function)(this,words,params);
2303
2304 }
2305 else
2306 {
2307
2308 tmp = words(j);
2309 if (j==p_order-1) // final position - use pred vocab
2310 {
2311 for(i=0;i<pred_vocab->length();i++){
2312 words[j] = pred_vocab->name(i);
2313 const_iterate(words,function,params);
2314 }
2315
2316 }
2317 else
2318 {
2319 for(i=0;i<vocab->length();i++){
2320 words[j] = vocab->name(i);
2321 const_iterate(words,function,params);
2322 }
2323 }
2324 words[j] = tmp;
2325 }
2326}
2327
void EST_Ngrammar::print_freqs(ostream &os,double floor)
{
    // Write every ngram and its frequency, one per line, in the form
    // "w1 w2 ... : freq".  Zero frequencies are raised to 'floor';
    // entries still not positive after flooring are suppressed.

    if (p_representation == EST_Ngrammar::backoff)
	backoff_representation->print_freqs(os,p_order);
    else
    {
	int i,j;
	EST_Litem *k;
	EST_IVector window(p_order-1);

	for (i=0; i < p_num_states; i++)
	{
	    // print out each ngram : freq
	    for (k=p_states[i].pdf().item_start();
		 !p_states[i].pdf().item_end(k);
		 k = p_states[i].pdf().item_next(k))
	    {
		double freq;
		EST_String name;
		int ind = i;
		p_states[i].pdf().item_freq(k,name,freq);
		if (freq == 0)
		    freq = floor;
		if (freq > 0)
		{
		    // decode the dense state index back into context
		    // word indices (base-|vocab| digits, last first)
		    for (j = p_order-2; j >= 0; j--)
		    {
			window[j] = ind%vocab->length();
			ind /= vocab->length();
		    }
		    for (j = 0; j < p_order-1; j++)
			os << wordlist_index(window(j)) << " ";
		    os << name << " : " << freq << endl;
		}
	    }
	}
    }
}
2367
2369EST_Ngrammar::backoff_prob_dist(const EST_StrVector &words) const
2370{
2371 // need to make this on the fly
2372 // by looking at all possible words in the final
2373 // position
2374
2375 int i,j;
2376 EST_StrVector ngram;
2377 ngram.resize(words.n()+1);
2378 for(i=0;i<words.n();i++)
2379 ngram[i] = words(i);
2380
2382
2383 for(j=0;j<get_pred_vocab_length();j++)
2384 {
2385 ngram[ngram.n()-1] = get_pred_vocab_word(j);
2386 double tmp = backoff_probability(ngram,false);
2387 p->set_frequency(j,tmp); // actually probability
2388 }
2389 p->set_num_samples(1.0); // we can't do it in frequencies
2390
2391 return *p;
2392}
2393
2394const double EST_Ngrammar::get_backoff_discount(const int order, const double freq) const
2395{
2396 if(order > p_order)
2397 {
2398 cerr << "order too great in EST_Ngrammar::get_backoff_discount" << endl;
2399 return 0;
2400 }
2401
2402 else if( (int)freq < backoff_discount[order-1].n())
2403 return backoff_discount[order-1]((int)freq);
2404
2405 else
2406 return 0;
2407}
2408
// Katz-style backoff probability of the last word of 'words' given the
// preceding words: where the full ngram was seen more often than
// backoff_threshold, use the discounted count ratio; otherwise back
// off to the shorter context scaled by the stored backoff weight.
const double EST_Ngrammar::backoff_probability(const EST_StrVector &words,
					       const bool trace) const
{
    const EST_BackoffNgrammarState *state;
    int i;
    EST_StrVector new_ngram;
    double f=0,f2=0;


    if (trace)
    {
	cerr << "backoff_probability( ";
	for(i=0;i<words.n();i++)
	    cerr << words(i) << " ";
	cerr << ") ";
    }

    // are we down to the unigram ?
    if (words.n() == 1)
    {
	if (trace)
	    cerr << "unigram " << backoff_representation->probability(words(0))
		 << endl;

	f=backoff_representation->frequency(words(0));

	// it seems outrageous, but use a floor because unigram
	// probs of zero mess up backing off
	if(f>0)
	    return f / backoff_representation->pdf_const().samples();
	else
	    return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
    }

    // the first n-1 words in the ngram -- deeply inefficient at the moment !
    // should pass separately
    new_ngram.resize(words.n()-1);
    for(i=0;i<new_ngram.n();i++)
	new_ngram[i] = words(i);

    // see if we have the ngram
    state=backoff_representation->get_state(words);

    if( (state != NULL) &&
	((f=state->frequency(words(0))) > backoff_threshold) )
    {

	//if (trace)
	//cerr << "in state " << state << " = " << *state << endl << endl;


	// because f>0, the freq of new_ngram must be non-zero too

	// special case - we don't have a freq for !ENTER (or whatever it's called)
	// so use the no. of sentences used to build this grammar
	if((new_ngram(0) == p_sentence_start_marker) ||
	   (new_ngram(0) == p_sentence_end_marker) )
	{
	    f2 = p_number_of_sentences;
	    if (trace)
		cerr << "special freq used : " << f2 << endl;
	}
	else
	{
	    state=backoff_representation->get_state(new_ngram);
	    if (state == NULL)
	    {
		cerr << "Something went horribly wrong !" << endl;
		return -1;
	    }
	    // state->frequency(new_ngram(0)) is the number of times
	    // we saw new_ngram

	    f2=state->frequency(new_ngram(0));

	    if (trace)
	    {
		//cerr << "n-1 state is " << *state << endl;
		cerr << " using freq for " << new_ngram(0) << " of " << f2 << endl;
	    }
	}

	if (trace)
	{

	    cerr << " ..... got (" << f << " - "
		 << get_backoff_discount(state->level()+1,f)
		 << ")/" << f2 << " = "
		 << (f - get_backoff_discount(state->level()+1,f) ) / f2
		 << endl;
	}
	// discounted relative frequency of the full ngram
	return (f - get_backoff_discount(state->level()+1,f) ) / f2;
    }

    else // back off
    {

	double bo_wt = get_backoff_weight(new_ngram);

	// drop the oldest context word and recurse on the shorter ngram
	for(i=0;i<new_ngram.n();i++)
	    new_ngram[i] = words(i+1);

	if (trace)
	{
	    cerr << "backed off(" << bo_wt << ") to (";
	    for(i=0;i<new_ngram.n();i++)
		cerr << new_ngram(i) << " ";
	    cerr << ") ";
	}

	return bo_wt * backoff_probability(new_ngram,trace);
    }

    // should never reach here !
    // but gcc thinks it does
    cerr << "oops !";
    return -1;

}
2528
2529
// Backoff probability computed relative to an arbitrary subtree 'root'
// rather than the whole grammar; used by backoff_reverse_probability,
// which passes the child state for the conditioning word.
const double
EST_Ngrammar::backoff_reverse_probability_sub(const EST_StrVector &words,
					      const EST_BackoffNgrammarState *root) const
{

    // so similar to backoff prob - should combine
    // to do ......

    const EST_BackoffNgrammarState *state;
    int i;
    EST_StrVector new_ngram;
    double f=0;

    /*
    cerr << "backoff_rev_probability_sub( ";
    for(i=0;i<words.n();i++)
	cerr << words(i) << " ";
    cerr << ") ";
    */
    // are we down to the unigram ?
    if (words.n() == 1)
    {
	//cerr << "unigram " << root->probability(words(0))
	//<< endl;
	return root->probability(words(0));
    }

    // the first n-1 words in the ngram -- deeply inefficient at the moment !
    // should pass separately
    new_ngram.resize(words.n()-1);
    for(i=0;i<new_ngram.n();i++)
	new_ngram[i] = words(i);

    // see if we have the ngram
    state=root->get_state(words);


    if( (state != NULL) &&
	((f=state->frequency(words(0))) > 0) )
    {
	// because f>0, the freq of new_ngram must be non-zero too
	state=root->get_state(new_ngram);
	if (state == NULL)
	{
	    cerr << "Something went horribly wrong !" << endl;
	    return -1;
	}
	// state->frequency(new_ngram(0)) is the number of times
	// we saw new_ngram
	//cerr << "got " << f << "/" << state->frequency(new_ngram(0)) << endl;
	return f / state->frequency(new_ngram(0));
    }

    else // back off
    {

	double bo_wt = root->get_backoff_weight(new_ngram);
	//double bo_wt = root->get_backoff_weight(words);

	// drop the oldest context word and recurse on the shorter ngram
	for(i=0;i<new_ngram.n();i++)
	    new_ngram[i] = words(i+1);

	//cerr << "backed off(" << bo_wt << ") ";
	return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
    }

    // should never reach here !
    // but gcc thinks it does
    cerr << "oops !";
    return -1;

}
2602
2603const double
2604EST_Ngrammar::backoff_reverse_probability(const EST_StrVector &words) const
2605{
2606
2607 // probability of words 1...n-1 , given the last word
2608 // easier to compute from the backoff tree structure than
2609 // 'normal' probability
2610
2611 // find level 1 child corresponding to history before last word
2613 state = backoff_representation->get_child(words(words.n()-1));
2614
2615 if(state == NULL)
2616 {
2617 // predictee isn't there so ......... ???
2618 return 0;
2619 }
2620
2621 // now find backoff probability of words 0...n-2
2622 // starting from this state
2623 return backoff_reverse_probability_sub(words,state);
2624
2625}
2626
const EST_String &
EST_Ngrammar::backoff_most_probable(const EST_StrVector &words,
				    double *prob) const
{
    // Most likely predictee for this context under the backoff model;
    // its probability is returned through 'prob' when non-NULL.
    return backoff_prob_dist(words).most_probable(prob);
}
2633
2634void slide(EST_IVector &v, const int l)
2635{
2636 int i;
2637
2638 // slide elements by 'l' without wraparound
2639
2640 if (l==0)
2641 return;
2642
2643 else if (l<0)
2644 {
2645 // slide left
2646
2647 for(i=0;i<v.n()+l;i++)
2648 v[i] = v(i-l);
2649
2650 for(;i<v.n();i++)
2651 v[i] = 0;
2652
2653 }
2654 else
2655 {
2656 // slide right
2657
2658 for(i=v.n()-1;i>=l;i--)
2659 v[i] = v(i-l);
2660
2661 for(;i>=0;i--)
2662 v[i] = 0;
2663 }
2664}
2665
2666void
2667EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
2668 void (*function)(EST_BackoffNgrammarState *s,
2669 void *params),
2670 void *params)
2671{
2672
2673 // apply the function to this node
2674 function(start_state,params);
2675
2676 // and recurse down the tree
2677 EST_Litem *k;
2678 double freq;
2679 EST_String name;
2680 for (k=start_state->pdf_const().item_start();
2681 !start_state->pdf_const().item_end(k);
2682 k = start_state->pdf_const().item_next(k))
2683 {
2684 start_state->pdf_const().item_freq(k,name,freq);
2685 EST_BackoffNgrammarState *child = start_state->get_child(name);
2686 if (child!=NULL)
2687 backoff_traverse(child,function,params);
2688
2689 }
2690}
2691
2692void
2693EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
2694 void (*function)(EST_BackoffNgrammarState *s,
2695 void *params),
2696 void *params,
2697 const int level)
2698{
2699 // apply the function to this node, if level is correct
2700 if (start_state->level() == level)
2701 {
2702 function(start_state,params);
2703 }
2704 else if (start_state->level() < level)
2705 {
2706 // and recurse down the tree if we haven't
2707 // reached the level yet
2708 EST_Litem *k;
2709 double freq;
2710 EST_String name;
2711
2712 for (k=start_state->pdf_const().item_start();
2713 !start_state->pdf_const().item_end(k);
2714 k = start_state->pdf_const().item_next(k))
2715 {
2716 start_state->pdf_const().item_freq(k,name,freq);
2717 EST_BackoffNgrammarState *child = start_state->get_child(name);
2718 if (child!=NULL)
2719 backoff_traverse(child,function,params,level);
2720
2721 }
2722
2723
2724 }
2725}
2726void
2727merge_other_grammar(EST_Ngrammar *n, EST_StrVector &ngram, void *params)
2728{
2729
2730 EST_Ngrammar *other_n = (EST_Ngrammar*)((void**)params)[0];
2731 float *weight = (float*)((void**)params)[1];
2732
2733 if(other_n->ngram_exists(ngram))
2734 n->accumulate(ngram,*weight * other_n->frequency(ngram));
2735
2736}
2737
2738bool
2739EST_Ngrammar::merge(EST_Ngrammar &n,float weight)
2740{
2741 EST_StrVector words;
2742 words.resize(p_order);
2743
2744 void **params = new void*[2];
2745 params[0] = (void*)&n;
2746 params[1] = (void*)&weight;
2747
2748 iterate(words,&merge_other_grammar,(void*)params);
2749
2750 delete [] params;
2751 return true;
2752}
2753
2754
2755void slide(EST_StrVector &v, const int l)
2756{
2757 // slide elements by 'l' without wraparound
2758 int i;
2759
2760 if (l==0)
2761 return;
2762
2763 else if (l<0)
2764 {
2765 // slide left
2766
2767 for(i=0;i<v.n()+l;i++)
2768 v[i] = v(i-l);
2769
2770 for(;i<v.n();i++)
2771 v[i] = "";
2772
2773 }
2774 else
2775 {
2776 // slide right
2777
2778 for(i=v.n()-1;i>=l;i--)
2779 v[i] = v(i-l);
2780
2781 for(;i>=0;i--)
2782 v[i] = "";
2783
2784 }
2785}
2786
const EST_Discrete *const get_discrete() const
Returns discrete vocabulary of distribution.
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
bool init(const EST_StrList &vocab)
Initialise using given vocabulary.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
void set_num_samples(const double c)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
double samples(void) const
Total number of examples found.
void clear(void)
Reset, clearing all counts and vocabulary.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
bool init(const EST_StrList &vocab)
(re-)initialise
Definition: EST_Discrete.cc:80
const int index(const EST_String &n) const
const EST_String & name(const int n) const
The name given the index.
const int length(void) const
The number of members in the discrete.
void add(const EST_String &key, void *item)
Add {\tt item} indexed by {\tt key}, overwriting previous contents.
void * lookup(const EST_String &key) const
Find contents indexed by {\tt key}; 0 if there are no contents.
void clear(void)
Delete the tree.
static const EST_String Empty
Constant empty string.
Definition: EST_String.h:111
void resize(int n, int set=1)
Definition: EST_TVector.cc:196
INLINE int n() const
number of items in vector.
Definition: EST_TVector.h:254
INLINE const T & a_no_check(int n) const
read-only const access operator: without bounds checking
Definition: EST_TVector.h:257
int eof()
end of file
Definition: EST_Token.h:356
int eoln()
end of line
Definition: EST_Token.cc:818
void close(void)
Close stream.
Definition: EST_Token.cc:406
int open(const EST_String &filename)
open a \Ref{EST_TokenStream} for a file.
Definition: EST_Token.cc:200
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:486