SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
PositionalPWM.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Soeren Sonnenburg
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
12 #include <shogun/base/Parameter.h>
15 
16 using namespace shogun;
17 
19  m_pwm_rows(0), m_pwm_cols(0), m_pwm(NULL),
20  m_sigma(0), m_mean(0),
21  m_w_rows(0), m_w_cols(0), m_w(NULL), m_poim(NULL)
22 
23 {
24  register_params();
25 }
26 
28 {
29  SG_FREE(m_pwm);
30  SG_FREE(m_w);
31 }
32 
34 {
36  return true;
37 }
38 
40 {
41  return m_pwm_rows*m_pwm_cols+2;
42 }
43 
45 {
46  ASSERT(num_param>0 && num_param<=m_pwm_rows*m_pwm_cols+2);
47 
48  if (num_param<m_pwm_rows*m_pwm_cols)
49  {
50  ASSERT(m_pwm);
51  return m_pwm[num_param];
52  }
53  else if (num_param<m_pwm_rows*m_pwm_cols+1)
54  return CMath::log(m_sigma);
55  else
56  return CMath::log(m_mean);
57 }
58 
59 float64_t CPositionalPWM::get_log_derivative(int32_t num_param, int32_t num_example)
60 {
62  return 0;
63 }
64 
66 {
70 
72 
73  float64_t lik=0;
74  int32_t len=0;
75  bool do_free=false;
76 
77  uint8_t* str = strs->get_feature_vector(num_example, len, do_free);
78 
79  if (!(m_w && m_w_cols==len))
80  return 0; //TODO
81 
82  for (int32_t i=0; i<len; i++)
83  lik+=m_w[4*i+str[i]];
84 
85  strs->free_feature_vector(str, num_example, do_free);
86  return lik;
87 }
88 
90 {
91  ASSERT(m_pwm_cols == len);
92  float64_t score = CMath::log(1/(m_sigma*CMath::sqrt(2*M_PI))) -
94 
95  for (int32_t i=0; i<m_pwm_cols; i++)
96  score+=m_pwm[m_pwm_rows*i+window[i]];
97 
98  return score;
99 }
100 
101 void CPositionalPWM::compute_w(int32_t num_pos)
102 {
104 
106  m_w_cols=num_pos;
107 
108  SG_FREE(m_w);
110 
111  uint8_t* window=SG_MALLOC(uint8_t, m_pwm_cols);
112  CMath::fill_vector(window, m_pwm_cols, (uint8_t) 0);
113 
114  const int32_t last_idx=m_pwm_cols-1;
115  for (int32_t i=0; i<m_w_rows; i++)
116  {
117  for (int32_t j=0; j<m_w_cols; j++)
118  m_w[j*m_w_rows+i]=get_log_likelihood_window(window, m_pwm_cols, j);
119 
120  window[last_idx]++;
121  int32_t window_ptr=last_idx;
122  while (window[window_ptr]==m_pwm_rows && window_ptr>0)
123  {
124  window[window_ptr]=0;
125  window_ptr--;
126  window[window_ptr]++;
127  }
128 
129  }
130 }
131 
132 void CPositionalPWM::register_params()
133 {
134  m_parameters->add_vector(&m_poim, &m_poim_len, "poim", "POIM Scoring Matrix");
135  m_parameters->add_matrix(&m_w, &m_w_rows, &m_w_cols, "w", "Scoring Matrix");
136  m_parameters->add_matrix(&m_pwm, &m_pwm_rows, &m_pwm_cols, "pwm", "Positional Weight Matrix.");
137  m_parameters->add(&m_sigma, "sigma", "Standard Deviation.");
138  m_parameters->add(&m_mean, "mean", "Mean.");
139 }
140 
141 void CPositionalPWM::compute_scoring(int32_t max_degree)
142 {
143  ASSERT(m_w);
144 
145  int32_t num_feat=m_w_cols;
146  int32_t num_sym=0;
147  int32_t order=m_pwm_cols;
148  int32_t num_words=m_pwm_cols;
149 
150  CAlphabet* alpha=new CAlphabet(DNA);
152  int32_t num_bits=alpha->get_num_bits();
153  str->compute_symbol_mask_table(num_bits);
154 
155  for (int32_t i=0; i<order; i++)
156  num_sym+=CMath::pow((int32_t) num_words,i+1);
157 
158  SG_FREE(m_poim);
159  m_poim_len=num_feat*num_sym;
160  m_poim=SG_MALLOC(float64_t, num_feat*num_sym);
161  memset(m_poim,0, size_t(num_feat)*size_t(num_sym));
162 
163  uint32_t kmer_mask=0;
164  uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order);
165  int32_t offset=0;
166 
167  for (int32_t o=0; o<max_degree; o++)
168  {
169  float64_t* contrib=&m_poim[offset];
170  offset+=CMath::pow((int32_t) num_words,(int32_t) o+1);
171 
172  kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
173 
174  for (int32_t p=-o; p<order; p++)
175  {
176  int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0;
177  uint32_t imer_mask=kmer_mask;
178  uint32_t jmer_mask=kmer_mask;
179 
180  if (p<0)
181  {
182  il=-p;
183  m_sym=order-o-p-1;
184  o_sym=-p;
185  }
186  else if (p<order-o)
187  {
188  ir=p;
189  m_sym=order-o-1;
190  }
191  else
192  {
193  ir=p;
194  m_sym=p;
195  o_sym=p-order+o+1;
196  jl=order-ir;
197  imer_mask=(kmer_mask>>(num_bits*o_sym));
198  jmer_mask=(kmer_mask>>(num_bits*jl));
199  }
200 
201  float64_t marginalizer=
202  1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym);
203 
204  for (uint32_t i=0; i<words; i++)
205  {
206  uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
207 
208  if (p>=0 && p<order-o)
209  {
210  contrib[x]+=m_w[m_w_cols*ir+i]*marginalizer;
211  }
212  else
213  {
214  for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++)
215  {
216  uint32_t c=x | ((j & jmer_mask) << (num_bits*jl));
217  contrib[c]+=m_w[m_w_cols*il+i]*marginalizer;
218  }
219  }
220  }
221  }
222  }
223 }
224 
226 {
227  int32_t offs=0;
228  for (int32_t i=0; i<d-1; i++)
229  offs+=CMath::pow((int32_t) m_w_rows,i+1);
230  int32_t rows=CMath::pow((int32_t) m_w_rows,d);
231  int32_t cols=m_w_cols;
232  return SGMatrix<float64_t>(&m_poim[offs],rows,cols);
233 }
DNA - letters A,C,G,T.
Definition: Alphabet.h:23
virtual EFeatureType get_feature_type()=0
virtual float64_t get_log_derivative(int32_t num_param, int32_t num_example)
static T sq(T x)
x^2
Definition: Math.h:277
#define SG_NOTIMPLEMENTED
Definition: SGIO.h:82
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:88
ST get_masked_symbols(ST symbol, uint8_t mask)
Parameter * m_parameters
Definition: SGObject.h:297
void compute_symbol_mask_table(int64_t max_val)
Base class Distribution from which all methods implementing a distribution are derived.
Definition: Distribution.h:41
virtual float64_t get_log_model_parameter(int32_t num_param)
void add(bool *param, const char *name, const char *description="")
Definition: Parameter.cpp:23
virtual int32_t get_num_model_parameters()
#define ASSERT(x)
Definition: SGIO.h:102
virtual EFeatureClass get_feature_class()=0
static int32_t pow(int32_t x, int32_t n)
Definition: Math.h:339
double float64_t
Definition: common.h:56
int32_t get_num_bits() const
Definition: Alphabet.h:146
#define SG_FREE(ptr)
Definition: memory.h:39
virtual float64_t get_log_likelihood_example(int32_t num_example)
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
Definition: Parameter.cpp:306
virtual SGMatrix< float64_t > get_scoring(int32_t d)
The class Features is the base class of all feature objects.
Definition: Features.h:56
static void fill_vector(T *vec, int32_t len, T value)
Definition: Math.h:616
void compute_w(int32_t num_pos)
static float64_t log(float64_t v)
Definition: Math.h:414
float64_t get_log_likelihood_window(uint8_t *window, int32_t len, float64_t pos)
static float32_t sqrt(float32_t x)
x^0.5
Definition: Math.h:283
void add_matrix(bool **param, index_t *length_y, index_t *length_x, const char *name, const char *description="")
Definition: Parameter.cpp:886
#define SG_MALLOC(type, len)
Definition: memory.h:36
void compute_scoring(int32_t max_degree)
virtual bool train(CFeatures *data=NULL)

SHOGUN Machine Learning Toolbox - Documentation