/* Edinburgh Speech Tools 2.4-release */
/* EST_Wagon.h */
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : May 1996 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* Public declarations for Wagon (CART builder) */
38/* */
39/*=======================================================================*/
40#ifndef __WAGON_H__
41#define __WAGON_H__
42
43#include "EST_String.h"
44#include "EST_Val.h"
45#include "EST_TVector.h"
46#include "EST_TList.h"
47#include "EST_simplestats.h" /* For EST_SuffStats class */
48#include "EST_Track.h"
49#include "siod.h"
50
// When set to 1, wagon supports using multiple threads if
// --omp_nthreads X is used (works for most gccs)
53// #define OMP_WAGON 1
54#ifdef OMP_WAGON
55#include "omp.h"
56#endif
57
// Fatal-error helper: writes WMESS and a newline to cerr, then calls
// exit(-1).  WMESS is pasted into the stream expression as-is, so it may
// itself be a chain of << insertions.
#define wagon_error(WMESS) (cerr << WMESS << endl, exit(-1))
59
// Comparisons against HUGE_VAL or FLT_MAX raised floating point exceptions
// on Alphas, so Wagon defines its own large value instead.
#define WGN_HUGE_VAL 1.0e20
63
64class WVector : public EST_FVector
65{
66 public:
67 WVector(int n) : EST_FVector(n) {}
68 int get_int_val(int n) const { return (int)a_no_check(n); }
69 float get_flt_val(int n) const { return a_no_check(n); }
70 void set_int_val(int n,int i) { a_check(n) = (int)i; }
71 void set_flt_val(int n,float f) { a_check(n) = f; }
72};
73
76
77/* Different types of feature */
enum wn_dtype {
    /* for predictees and predictors */
    wndt_binary,
    wndt_float,
    wndt_class,
    /* for predictees only */
    wndt_cluster,
    wndt_vector,
    wndt_matrix,
    wndt_trajectory,
    wndt_ols,
    /* for ignored features */
    wndt_ignore
};
85
86class WDataSet : public WVectorList {
87 private:
88 int dlength;
89 EST_IVector p_type;
90 EST_IVector p_ignore;
91 EST_StrVector p_name;
92 public:
93 void load_description(const EST_String& descfname,LISP ignores);
94 void ignore_non_numbers();
95
96 int ftype(const int &i) const {return p_type(i);}
97 int ignore(int i) const {return p_ignore(i); }
98 void set_ignore(int i,int value) { p_ignore[i] = value; }
99 const EST_String &feat_name(const int &i) const {return p_name(i);}
100 int samples(void) const {return length();}
101 int width(void) const {return dlength;}
102};
/* Operator codes that a WQuestion can carry. */
enum wn_oper {
    wnop_equal,
    wnop_binary,
    wnop_greaterthan,
    wnop_lessthan,
    wnop_is,
    wnop_in,
    wnop_matches
};
105
107 private:
108 int feature_pos;
109 wn_oper op;
110 int yes;
111 int no;
112 EST_Val operand1;
113 EST_IList operandl;
114 float score;
115 public:
116 WQuestion() {;}
117 WQuestion(const WQuestion &s)
118 { feature_pos=s.feature_pos;
119 op=s.op; yes=s.yes; no=s.no; operand1=s.operand1;
120 operandl = s.operandl; score=s.score;}
121 ~WQuestion() {;}
122 WQuestion(int fp, wn_oper o,EST_Val a)
123 { feature_pos=fp; op=o; operand1=a; }
124 void set_fp(const int &fp) {feature_pos=fp;}
125 void set_oper(const wn_oper &o) {op=o;}
126 void set_operand1(const EST_Val &a) {operand1 = a;}
127 void set_yes(const int &y) {yes=y;}
128 void set_no(const int &n) {no=n;}
129 int get_yes(void) const {return yes;}
130 int get_no(void) const {return no;}
131 const int get_fp(void) const {return feature_pos;}
132 const wn_oper get_op(void) const {return op;}
133 const EST_Val get_operand1(void) const {return operand1;}
134 const EST_IList &get_operandl(void) const {return operandl;}
135 const float get_score(void) const {return score;}
136 void set_score(const float &f) {score=f;}
137 const int ask(const WVector &w) const;
138 friend ostream& operator<<(ostream& s, const WQuestion &q);
139};
140
/* Type tag held by a WImpurity; wnim_unset is the value a
   default-constructed WImpurity starts with. */
enum wnim_type {
    wnim_unset,
    wnim_float,
    wnim_class,
    wnim_cluster,
    wnim_vector,
    wnim_matrix,
    wnim_ols,
    wnim_trajectory
};
144
145// Impurity measure for cumulating impurities from set of data
147 private:
148 wnim_type t;
151
152 float cluster_impurity();
153 float cluster_member_mean(int i);
154 float vector_impurity();
155 float trajectory_impurity();
156 float ols_impurity();
157 public:
158 EST_IList members; // Maybe there should be a cluster class
159 EST_FList member_counts; // AUP: Implement counts for vectors
160 EST_SuffStats **trajectory;
161 const WVectorVector *data; // Needed for ols
162 float score;
163 int l,width;
164
165 WImpurity() { t=wnim_unset; a.reset(); trajectory=0; l=0; width=0; data=0;}
166 ~WImpurity();
167 WImpurity(const WVectorVector &ds);
168 void copy(const WImpurity &s)
169 {
170 int i,j;
171 t=s.t; a=s.a; p=s.p; members=s.members; member_counts = s.member_counts; l=s.l; width=s.width;
172 score = s.score;
173 data = s.data;
174 if (s.trajectory)
175 {
176 trajectory = new EST_SuffStats *[l];
177 for (i=0; i<l; i++)
178 {
179 trajectory[i] = new EST_SuffStats[width];
180 for (j=0; j<width; j++)
181 trajectory[i][j] = s.trajectory[i][j];
182 }
183 }
184 }
185 WImpurity &operator = (const WImpurity &a) { copy(a); return *this; }
186
187 float measure(void);
188 double samples(void);
189 wnim_type type(void) const { return t;}
190 void cumulate(const float pv,double count=1.0);
191 EST_Val value(void);
192 EST_DiscreteProbDistribution &pd() { return p; }
193 float cluster_distance(int i); // distance i from centre in sds
194 int in_cluster(int i); // distance i from centre < most remote member
195 float cluster_ranking(int i); // position in closeness to centre
196 friend ostream& operator<<(ostream &s, WImpurity &imp);
197};
198
199class WDlist {
200 private:
201 float p_score;
202 WQuestion p_question;
203 EST_String p_token;
204 int p_freq;
205 int p_samples;
206 WDlist *next;
207 public:
208 WDlist() { next=0; }
209 ~WDlist() { if (next != 0) delete next; }
210 void set_score(float s) { p_score = s; }
211 void set_question(const WQuestion &q) { p_question = q; }
212 void set_best(const EST_String &t,int freq, int samples)
213 { p_token = t; p_freq = freq; p_samples = samples;}
214 float score() const {return p_score;}
215 const EST_String &token(void) const {return p_token;}
216 const WQuestion &question() const {return p_question;}
217 EST_Val predict(const WVector &w);
218 friend WDlist *add_to_dlist(WDlist *l,WDlist *a);
219 friend ostream &operator<<(ostream &s, WDlist &d);
220};
221
222class WNode {
223 private:
224 WVectorVector data;
225 WQuestion question;
226 WImpurity impurity;
227 WNode *left;
228 WNode *right;
229 void print_out(ostream &s, int margin);
230 int leaf(void) const { return ((left == 0) || (right == 0)); }
231 int pure(void);
232 public:
233 WNode() { left = right = 0; }
234 ~WNode() { if (left != 0) {delete left; left=0;}
235 if (right != 0) {delete right; right=0;} }
236 WVectorVector &get_data(void) { return data; }
237 void set_subnodes(WNode *l,WNode *r) { left=l; right=r; }
238 void set_impurity(const WImpurity &imp) {impurity=imp;}
239 void set_question(const WQuestion &q) {question=q;}
240 void prune(void);
241 void held_out_prune(void);
242 WImpurity &get_impurity(void) {return impurity;}
243 WQuestion &get_question(void) {return question;}
244 EST_Val predict(const WVector &w);
245 WNode *predict_node(const WVector &d);
246 int samples(void) const { return data.n(); }
247 friend ostream& operator<<(ostream &s, WNode &n);
248};
249
250extern Discretes wgn_discretes;
251extern WDataSet wgn_dataset;
252extern WDataSet wgn_test_dataset;
253extern EST_FMatrix wgn_DistMatrix;
254extern EST_Track wgn_VertexTrack;
255extern EST_Track wgn_UnitTrack;
256extern EST_Track wgn_VertexFeats;
257
258void wgn_load_datadescription(EST_String fname,LISP ignores);
259void wgn_load_dataset(WDataSet &ds,EST_String fname);
260WNode *wgn_build_tree(float &score);
261WNode *wgn_build_dlist(float &score,ostream *output);
262WNode *wagon_stepwise(float limit);
263float wgn_score_question(WQuestion &q, WVectorVector &ds);
264void wgn_find_split(WQuestion &q,WVectorVector &ds,
266float summary_results(WNode &tree,ostream *output);
267
268extern int wgn_min_cluster_size;
269extern int wgn_max_questions;
270extern int wgn_held_out;
271extern float wgn_dropout_feats;
272extern float wgn_dropout_samples;
273extern int wgn_cos;
274extern int wgn_prune;
275extern int wgn_quiet;
276extern int wgn_verbose;
277extern int wgn_predictee;
278extern int wgn_count_field;
279extern EST_String wgn_count_field_name;
280extern EST_String wgn_predictee_name;
281extern float wgn_float_range_split;
282extern float wgn_balance;
283extern EST_String wgn_opt_param;
284extern EST_String wgn_vertex_output;
285
286#define wgn_ques_feature(X) (get_c_string(car(X)))
287#define wgn_ques_oper_str(X) (get_c_string(car(cdr(X))))
288#define wgn_ques_operand(X) (car(cdr(cdr(X))))
289
290int wagon_ask_question(LISP question, LISP value);
291
292int stepwise_ols(const EST_FMatrix &X,
293 const EST_FMatrix &Y,
294 const EST_StrList &feat_names,
295 float limit,
296 EST_FMatrix &coeffs,
297 const EST_FMatrix &Xtest,
298 const EST_FMatrix &Ytest,
299 EST_IVector &included,
300 float &best_score);
301int robust_ols(const EST_FMatrix &X,
302 const EST_FMatrix &Y,
303 EST_IVector &included,
304 EST_FMatrix &coeffs);
305int ols_apply(const EST_FMatrix &samples,
306 const EST_FMatrix &coeffs,
307 EST_FMatrix &res);
308int ols_test(const EST_FMatrix &real,
309 const EST_FMatrix &predicted,
310 float &correlation,
311 float &rmse);
312
313#endif /* __WAGON_H__ */
/*
 * Cross-referenced declarations:
 *
 * EST_FVector()
 *     Default constructor.
 *     Definition: EST_FMatrix.h:121
 *
 * void reset(void)
 *     Reset internal values.
 *
 * const float & a_check(int n) const
 *     Read-only const access operator: with bounds checking.
 *     Definition: EST_TVector.cc:249
 *
 * INLINE int n() const
 *     Number of items in vector.
 *     Definition: EST_TVector.h:254
 *
 * INLINE const float & a_no_check(int n) const
 *     Read-only const access operator: without bounds checking.
 *     Definition: EST_TVector.h:257
 */