22 using namespace shogun;
63 getExpFunctionCache(max_len);
68 const std::string& sequence, uint32_t k_mer_length,
69 const std::string& allowed_characters,
70 std::vector< std::pair<int32_t, float64_t> >& values)
74 std::map<std::string::value_type, uint32_t> residue_values;
76 uint32_t number_of_residues = allowed_characters.size();
77 uint32_t sequence_length = sequence.size();
78 bool sequence_ok =
true;
81 for (uint32_t i = 0; i < sequence.size(); ++i)
83 if (allowed_characters.find(sequence.at(i)) == std::string::npos)
87 if (sequence_ok && k_mer_length <= sequence_length)
89 values.resize(sequence_length - k_mer_length + 1,
90 std::pair<int32_t, float64_t>());
91 for (uint32_t i = 0; i < number_of_residues; ++i)
93 residue_values.insert(std::make_pair(allowed_characters[i], counter));
96 for (int32_t
k = k_mer_length - 1;
k >= 0;
k--)
98 oligo_value += factor * residue_values[sequence[
k]];
99 factor *= number_of_residues;
101 factor /= number_of_residues;
103 values[counter].first = 1;
104 values[counter].second = oligo_value;
107 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
109 oligo_value -= factor * residue_values[sequence[j - 1]];
110 oligo_value = oligo_value * number_of_residues +
111 residue_values[sequence[j + k_mer_length - 1]];
113 values[counter].first = j + 1;
114 values[counter].second = oligo_value ;
117 stable_sort(values.begin(), values.end(), cmpOligos_);
126 const std::vector<std::string>& sequences, uint32_t k_mer_length,
127 const std::string& allowed_characters,
128 std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
130 std::vector< std::pair<int32_t, float64_t> > temp_vector;
131 encoded_sequences.resize(sequences.size(),
132 std::vector< std::pair<int32_t, float64_t> >());
134 for (uint32_t i = 0; i < sequences.size(); ++i)
136 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
137 encoded_sequences[i] = temp_vector;
141 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
147 for (uint32_t i = 1; i < sequence_length; i++)
154 const std::vector< std::pair<int32_t, float64_t> >& x,
155 const std::vector< std::pair<int32_t, float64_t> >& y,
156 int32_t max_distance)
162 uint32_t x_size = x.size();
163 uint32_t y_size = y.size();
165 while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
167 if (x[i1].second == y[i2].second)
170 || (abs(x[i1].first - y[i2].first)) <= max_distance)
172 result +=
gauss_table[abs((x[i1].first - y[i2].first))];
173 if (x[i1].second == x[i1 + 1].second)
178 else if (y[i2].second == y[i2 + 1].second)
192 if (x[i1].first < y[i2].first)
194 if (x[i1].second == x[i1 + 1].second)
198 else if (y[i2].second == y[i2 + 1].second)
200 while (y[i2].second == y[i2+1].second)
222 if (x[i1].second < y[i2].second)
239 std::vector< std::pair<int32_t, float64_t> > aenc;
240 std::vector< std::pair<int32_t, float64_t> > benc;
249 void COligoStringKernel::init()
virtual bool init(CFeatures *l, CFeatures *r)
virtual float64_t compute(int32_t x, int32_t y)
virtual bool set_normalizer(CKernelNormalizer *normalizer)
static void encodeOligo(const std::string &sequence, uint32_t k_mer_length, const std::string &allowed_characters, std::vector< std::pair< int32_t, float64_t > > &values)
encodes the signals of the sequence
void add(bool *param, const char *name, const char *description="")
virtual ~COligoStringKernel()
static T max(T a, T b)
Returns the maximum of the two arguments a and b.
static void getSequences(const std::vector< std::string > &sequences, uint32_t k_mer_length, const std::string &allowed_characters, std::vector< std::vector< std::pair< int32_t, float64_t > > > &encoded_sequences)
encodes all sequences with the encodeOligo function and stores them in 'encoded_sequences' ...
virtual bool init_normalizer()
CFeatures * rhs
Feature vectors that occur on the right-hand side.
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
CFeatures * lhs
Feature vectors that occur on the left-hand side.
The class Features is the base class of all feature objects.
friend class CSqrtDiagKernelNormalizer
Template class StringKernel is the base class of all string kernels.
#define SG_MALLOC(type, len)
float64_t kernelOligoFast(const std::vector< std::pair< int32_t, float64_t > > &x, const std::vector< std::pair< int32_t, float64_t > > &y, int32_t max_distance=-1)
returns the value of the oligo kernel for sequences 'x' and 'y'