SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
GaussianNaiveBayes.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Sergey Lisitsyn
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
12 #include <shogun/machine/Machine.h>
14 #include <shogun/features/Labels.h>
16 #include <shogun/lib/Signal.h>
17 
18 using namespace shogun;
19 
// Member-initializer list and empty body of a constructor whose declarator
// line is not visible in this listing (presumably the default constructor;
// confirm against the header).
// Initial state: m_features is NULL, m_min_label, m_num_classes and m_dim
// are 0, and m_means, m_variances, m_label_prob and m_rates are
// initialized with no arguments.
21 CMachine(), m_features(NULL), m_min_label(0),
22 m_num_classes(0), m_dim(0), m_means(),
23 m_variances(), m_label_prob(), m_rates()
24 {
25 
26 };
27 
// Member-initializer list and body of a second constructor; its declarator
// line is not visible in this listing. The body uses train_examples and
// train_labels, which are presumably its parameters (confirm against the
// header). Members start in the same empty state as in the other
// constructor, then the labels and features are installed.
29 CMachine(), m_features(NULL), m_min_label(0),
30 m_num_classes(0), m_dim(0), m_means(),
31 m_variances(), m_label_prob(), m_rates()
32 {
    // The number of feature vectors must equal the number of labels.
    // NOTE(review): train_examples and train_labels are dereferenced here
    // with no null check in the visible lines.
33  ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels());
34  set_labels(train_labels);
    // Only features reporting the FP_DOT property are accepted; the cast to
    // CDotFeatures* below relies on this check (presumably SG_ERROR does not
    // return; confirm).
35  if (!train_examples->has_property(FP_DOT))
36  SG_ERROR("Specified features are not of type CDotFeatures\n");
37  set_features((CDotFeatures*)train_examples);
38 };
39 
// Body of a definition whose signature line and statements are not visible
// in this listing (the listing numbers jump from 41 to 43 to 48).
// NOTE(review): presumably the destructor; restore the missing lines from
// the original source before relying on this block.
41 {
43 
48 };
49 
// Body of the training routine; the signature line is not visible in this
// listing (presumably train(CFeatures* data); confirm).
// Visible steps: optionally install data as the features, shift the integer
// labels so that the smallest one becomes 0, then accumulate per-class
// example counts, feature means and feature variances. Returns true.
// NOTE(review): several statements are missing from this listing, as the
// gaps in the listing numbers show (assignment of m_dim, the allocations,
// retrieval of feature_matrix, and the body of the last loop).
51 {
52  // if data was passed, check it has the FP_DOT property and install it as the features
53  if (data)
54  {
55  if (!data->has_property(FP_DOT))
56  SG_ERROR("Specified features are not of type CDotFeatures\n");
57  set_features((CDotFeatures*) data);
58  }
59  // fetch the labels as integers and require one label per feature vector
    // NOTE(review): m_features is dereferenced below without a null check;
    // the constructors set it to NULL, so features must have been set
    // before this point.
60  ASSERT(labels);
61  SGVector<int32_t> train_labels = labels->get_int_labels();
62  ASSERT(m_features->get_num_vectors()==train_labels.vlen);
63 
64  // start the min/max label search from the first label; declare loop variables
    // NOTE(review): element 0 is read without checking that
    // train_labels.vlen is greater than 0.
65  int32_t min_label = train_labels.vector[0];
66  int32_t max_label = train_labels.vector[0];
67  int i,j;
68 
69  // find minimal and maximal label
70  for (i=1; i<train_labels.vlen; i++)
71  {
72  min_label = CMath::min(min_label, train_labels.vector[i]);
73  max_label = CMath::max(max_label, train_labels.vector[i]);
74  }
75 
76  // shift labels so the smallest becomes 0; they are used as array indices below
77  for (i=0; i<train_labels.vlen; i++)
78  train_labels.vector[i]-= min_label;
79 
80  // number of classes is the width of the label range, so a label value
    // inside the range that has no examples still counts as a class;
    // m_min_label is kept so predictions can be shifted back
    // (the statement on listing line 83 is missing from this listing)
81  m_num_classes = max_label-min_label+1;
82  m_min_label = min_label;
84 
85  // allocate memory for the distribution parameters and a priori probabilities
    // (the allocation statements are missing from this listing)
88 
91 
94 
95  // allocate memory for label rates
98 
99  // assure that memory is allocated
104 
105  // zero the accumulators before summing into them
106  for (i=0;i<m_num_classes*m_dim;i++)
107  {
108  m_means.vector[i] = 0.0;
109  m_variances.vector[i] = 0.0;
110  }
111  for (i=0;i<m_num_classes;i++)
112  {
113  m_label_prob.vector[i] = 0.0;
114  m_rates.vector[i] = 0.0;
115  }
116 
118 
119  // accumulate per-class feature sums and count the examples of each class;
    // m_means holds m_dim consecutive entries per class, and feature_matrix
    // is read as m_dim consecutive entries per example
    // (the declaration of feature_matrix is missing from this listing)
120  for (i=0; i<train_labels.vlen; i++)
121  {
122  for (j=0; j<m_dim; j++)
123  m_means.vector[m_dim*train_labels.vector[i]+j]+=feature_matrix.matrix[i*m_dim+j];
124 
125  m_label_prob.vector[train_labels.vector[i]]+=1.0;
126  }
127 
128  // turn the sums into means by dividing by the example count of the class
    // NOTE(review): for a class with no examples this divides 0.0 by 0.0.
129  for (i=0; i<m_num_classes; i++)
130  {
131  for (j=0; j<m_dim; j++)
132  m_means.vector[m_dim*i+j] /= m_label_prob.vector[i];
133  }
134 
135  // accumulate squared deviations of each feature from its class mean
136  for (i=0; i<train_labels.vlen; i++)
137  {
138  for (j=0; j<m_dim; j++)
139  m_variances.vector[m_dim*train_labels.vector[i]+j]+=
140  CMath::sq(feature_matrix.matrix[i*m_dim+j]-m_means.vector[m_dim*train_labels.vector[i]+j]);
141  }
142 
143  // turn the squared deviations into variances: divide by (count-1) when
    // the class has more than one example, otherwise by 1
144  for (i=0; i<m_num_classes; i++)
145  {
146  for (j=0; j<m_dim; j++)
147  m_variances.vector[m_dim*i+j] /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
148  }
149 
150  // get a priori probabilities of labels
    // (the statement inside this loop is missing from this listing)
151  for (i=0; i<m_num_classes; i++)
152  {
154  }
155 
    // release the integer label vector obtained above
156  train_labels.free_vector();
157 
158  return true;
159 }
160 
// Body of the classification routine that takes no data argument (its
// signature line is not visible in this listing).
// Classifies every vector of m_features through apply(i) and returns the
// predictions in a CLabels object allocated here with new.
// NOTE(review): m_features is dereferenced without a null check; ownership
// of the returned object presumably passes to the caller (confirm).
162 {
163  // number of vectors to classify
164  int32_t n = m_features->get_num_vectors();
165 
166  // result labels, one per vector
167  CLabels* result = new CLabels(n);
168 
169  // classify each example of data
170  for (int i=0; i<n; i++)
171  result->set_label(i,apply(i));
172 
173  return result;
174 };
175 
// Body of the classification routine that takes features (its signature line
// is not visible in this listing; the body uses a pointer named data).
// Checks data, installs it through set_features, then delegates to apply().
// Side effect: the features held by this object are replaced by data.
177 {
178  // data must be non-null and report the FP_DOT property
    // (presumably SG_ERROR does not return, otherwise the second check
    // would dereference a null pointer; confirm)
179  if (!data)
180  SG_ERROR("No features specified\n");
181  if (!data->has_property(FP_DOT))
182  SG_ERROR("Specified features are not of type CDotFeatures\n");
183 
184  // set features to classify
185  set_features((CDotFeatures*)data);
186 
187  // classify using features
188  return apply();
189 };
190 
// Body of the single-vector classification routine (its signature line is not
// visible in this listing; callers in this file invoke it as apply(i)).
// Computes one rate per class into m_rates and returns the label of the class
// with the largest rate.
192 {
193  // get [idx] feature vector
    // (the declaration of feature_vector is missing from this listing)
195 
196  // loop variables: i runs over classes, k over feature dimensions
197  int i,k;
198 
199  // rate all labels
200  for (i=0; i<m_num_classes; i++)
201  {
202  // a class with a priori probability 0.0 gets rate 0.0 and is skipped
203  if (m_label_prob.vector[i]==0.0)
204  {
205  m_rates.vector[i] = 0.0;
206  continue;
207  }
208  else
    // NOTE(review): the statement belonging to this else (listing line 209)
    // is missing from this listing; do not read the loop below as the else
    // branch without checking the original source.
210 
211  // multiply in the conditional gaussian factor of every feature
    // NOTE(review): the divisor is the square root of a variance that the
    // training code can leave at 0 (for example when all examples of a
    // class share the same value for a feature), which gives a division by zero.
212  for (k=0; k<m_dim; k++)
213  m_rates.vector[i]*= normal_exp(feature_vector.vector[k],i,k)/CMath::sqrt(m_variances.vector[i*m_dim+k]);
214  }
215 
216  // find label with maximum rate; the strict comparison keeps the lowest
    // class index when rates are tied
217  int32_t max_label_idx = 0;
218 
219  for (i=0; i<m_num_classes; i++)
220  {
221  if (m_rates.vector[i]>m_rates.vector[max_label_idx])
222  max_label_idx = i;
223  }
224 
    // add m_min_label back to turn the class index into a label value
225  return max_label_idx+m_min_label;
226 };
bool has_property(EFeatureProperty p)
Definition: Features.cpp:337
SGVector< float64_t > m_label_prob
a priori probabilities of labels
bool set_label(int32_t idx, float64_t label)
Definition: Labels.cpp:199
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:35
static T sq(T x)
x^2
Definition: Math.h:277
virtual int32_t get_num_vectors() const =0
int32_t get_num_labels()
Definition: Labels.cpp:240
#define SG_ERROR(...)
Definition: SGIO.h:75
Features that support dot products among other operations.
Definition: DotFeatures.h:41
A generic learning machine interface.
Definition: Machine.h:96
SGVector< float64_t > m_rates
label rates
virtual int32_t get_dim_feature_space() const =0
CLabels * labels
Definition: Machine.h:251
virtual void free_vector()
Definition: DataType.h:212
#define ASSERT(x)
Definition: SGIO.h:102
SGVector< int32_t > get_int_labels()
Definition: Labels.cpp:152
int32_t m_num_classes
number of different classes (labels)
SGVector< float64_t > m_means
means for normal distributions of features
virtual void destroy_vector()
Definition: DataType.h:223
double float64_t
Definition: common.h:56
int32_t m_min_label
minimal label
static T max(T a, T b)
return the maximum of two integers
Definition: Math.h:162
int32_t m_dim
dimensionality of feature space
float64_t normal_exp(float64_t x, int32_t l_idx, int32_t f_idx)
virtual void set_features(CDotFeatures *features)
#define SG_UNREF(x)
Definition: SGObject.h:45
virtual bool train(CFeatures *data=NULL)
The class Features is the base class of all feature objects.
Definition: Features.h:56
static T min(T a, T b)
return the minimum of two integers
Definition: Math.h:155
SGVector< float64_t > m_variances
variances for normal distributions of features
SGVector< float64_t > get_computed_dot_feature_vector(int32_t num)
SGMatrix< float64_t > get_computed_dot_feature_matrix()
static float32_t sqrt(float32_t x)
x^0.5
Definition: Math.h:283
virtual void set_labels(CLabels *lab)
Definition: Machine.cpp:63
#define SG_MALLOC(type, len)
Definition: memory.h:36
CDotFeatures * m_features
features for training or classifying
index_t vlen
Definition: DataType.h:248

SHOGUN Machine Learning Toolbox - Documentation