15 sparse_feature_matrix(NULL), feature_cache(NULL)
21 int32_t num_feat, int32_t num_vec,
bool copy)
23 sparse_feature_matrix(NULL), feature_cache(NULL)
33 for (int32_t i=0; i< num_vec; i++)
44 sparse_feature_matrix(NULL), feature_cache(NULL)
53 sparse_feature_matrix(NULL), feature_cache(NULL)
62 num_features(orig.num_features),
63 sparse_feature_matrix(orig.sparse_feature_matrix),
64 feature_cache(orig.feature_cache)
85 sparse_feature_matrix(NULL), feature_cache(NULL)
94 free_sparse_features();
98 clean_tsparse(sparse_feature_matrix, num_vectors);
99 sparse_feature_matrix = NULL;
106 free_sparse_feature_matrix();
107 delete feature_cache;
108 feature_cache = NULL;
117 ASSERT(index>=0 && index<num_features) ;
118 ASSERT(num>=0 && num<get_num_vectors()) ;
127 if (sv.
features[i].feat_index==index)
131 free_sparse_feature_vector(sv, num);
148 for (i=0; i<num_features; i++)
155 free_sparse_feature_vector(sv, num);
162 if (num>=num_vectors)
164 SG_ERROR(
"Index out of bounds (number of vectors %d, you "
165 "requested %d)\n", num_vectors, num);
175 dense.
vlen=num_features;
177 memset(dense.
vector, 0,
sizeof(ST)*num_features);
185 free_sparse_feature_vector(sv, num);
194 free_sparse_feature_vector(sv, num);
200 ASSERT(num<get_num_vectors());
202 index_t real_num=subset_idx_conversion(num);
206 if (sparse_feature_matrix)
208 result=sparse_feature_matrix[real_num];
218 result.
features=feature_cache->lock_entry(num);
224 result.
features=feature_cache->set_entry(num);
231 result.
features=compute_sparse_feature_vector(num,
235 if (get_num_preprocessors())
241 for (int32_t i=0; i<get_num_preprocessors(); i++)
247 tmp_feat_before=tmp_feat_after;
250 memcpy(result.
features, tmp_feat_after,
272 for (int32_t i=0; i<alen; i++)
276 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
279 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
289 for (int32_t i=0; i<blen; i++)
293 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
296 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
313 ASSERT(dim==num_features);
322 result+=alpha*vec[sv.
features[i].feat_index]
327 free_sparse_feature_vector(sv, num);
334 if (dim!=num_features)
336 SG_ERROR(
"dimension of vec (=%d) does not match number of features (=%d)\n",
348 vec[sv.
features[i].feat_index]+=alpha
356 vec[sv.
features[i].feat_index]+=alpha
362 free_sparse_feature_vector(sv, num);
368 feature_cache->unlock_entry(subset_idx_conversion(num));
376 SG_ERROR(
"get_sparse_feature_matrix() not allowed with subset\n");
378 num_feat=num_features;
381 return sparse_feature_matrix;
387 SG_ERROR(
"get_sparse_feature_matrix() not allowed with subset\n");
398 for (int32_t i=0; i<num_vec; i++)
415 num_feat=get_num_vectors();
416 num_vec=num_features;
418 int32_t* hist=
SG_MALLOC(int32_t, num_features);
419 memset(hist, 0,
sizeof(int32_t)*num_features);
422 for (int32_t v=0; v<num_feat; v++)
429 free_sparse_feature_vector(sv, v);
434 for (int32_t v=0; v<num_vec; v++)
442 memset(hist,0,
sizeof(int32_t)*num_features);
443 for (int32_t v=0; v<num_feat; v++)
449 int32_t vidx=sv.
features[i].feat_index;
451 sfm[vidx].
features[hist[vidx]].feat_index=fidx;
456 free_sparse_feature_vector(sv, v);
466 SG_ERROR(
"set_sparse_feature_matrix() not allowed with subset\n");
469 free_sparse_feature_matrix();
481 SG_INFO(
"converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
487 memset(full.
matrix, 0,
size_t(num_features)*
size_t(get_num_vectors())*
sizeof(ST));
489 for (int32_t v=0; v<full.
num_cols; v++)
492 sparse_feature_matrix[subset_idx_conversion(v)];
496 int64_t offs=(current.
vec_index*num_features)
514 free_sparse_feature_matrix();
516 num_features=num_feat;
519 SG_INFO(
"converting dense feature matrix to sparse one\n");
520 int32_t* num_feat_entries=
SG_MALLOC(
int, num_vectors);
522 if (num_feat_entries)
524 int64_t num_total_entries=0;
527 for (int32_t i=0; i< num_vec; i++)
529 num_feat_entries[i]=0;
530 for (int32_t j=0; j< num_feat; j++)
532 if (src[i*((int64_t) num_feat) + j] != 0)
533 num_feat_entries[i]++;
541 if (sparse_feature_matrix)
543 for (int32_t i=0; i< num_vec; i++)
545 sparse_feature_matrix[i].vec_index=i;
546 sparse_feature_matrix[i].num_feat_entries=0;
547 sparse_feature_matrix[i].features= NULL;
549 if (num_feat_entries[i]>0)
553 if (!sparse_feature_matrix[i].features)
555 SG_INFO(
"allocation of features failed\n");
559 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
560 int32_t sparse_feat_idx=0;
562 for (int32_t j=0; j< num_feat; j++)
564 int64_t pos= i*num_feat + j;
568 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
569 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
579 SG_ERROR(
"allocation of sparse feature matrix failed\n");
583 SG_INFO(
"sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
584 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
588 SG_ERROR(
"huh ? zero size matrix given ?\n");
598 SG_INFO(
"force: %d\n", force_preprocessing);
600 if ( sparse_feature_matrix && get_num_preprocessors() )
602 for (int32_t i=0; i<get_num_preprocessors(); i++)
604 if ( (!is_preprocessed(i) || force_preprocessing) )
607 SG_INFO(
"preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
617 SG_WARNING(
"no sparse feature matrix available or features already preprocessed - skipping.\n");
632 return set_full_feature_matrix(fm);
637 return m_subset ? m_subset->get_size() : num_vectors;
647 int32_t n=num_features;
661 feature_cache->unlock_entry(subset_idx_conversion(num));
669 index_t num_vec=get_num_vectors();
670 for (int32_t i=0; i<num_vec; i++)
671 num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
680 index_t num_vec=get_num_vectors();
681 for (int32_t i=0; i<num_vec; i++)
689 free_feature_vector(vec, i);
708 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
715 int32_t a_feat_idx=avec.
features[i].feat_index;
718 &&(bvec.
features[j].feat_index<a_feat_idx))
722 &&(bvec.
features[j].feat_index==a_feat_idx))
734 int32_t b_feat_idx=bvec.
features[i].feat_index;
737 &&(avec.
features[j].feat_index<b_feat_idx))
741 &&(avec.
features[j].feat_index==b_feat_idx))
756 bool do_sort_features)
762 size_t blocksize=1024*1024;
763 size_t required_blocksize=blocksize;
764 uint8_t* dummy=
SG_MALLOC(uint8_t, blocksize);
765 FILE* f=fopen(fname,
"ro");
769 free_sparse_feature_matrix();
773 SG_INFO(
"counting line numbers in file %s\n", fname);
776 size_t old_block_offs=0;
777 fseek(f, 0, SEEK_END);
778 size_t fsize=ftell(f);
781 while (sz == blocksize)
783 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
784 for (
size_t i=0; i<sz; i++)
787 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
790 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs+1);
791 old_block_offs=block_offs;
794 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
797 SG_INFO(
"found %d feature vectors\n", num_vectors);
799 blocksize=required_blocksize;
808 while (sz == blocksize)
810 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
813 for (
size_t i=0; i<sz; i++)
815 if (i==sz-1 && dummy[i]!=
'\n' && sz==blocksize)
817 size_t len=i-old_sz+1;
818 uint8_t* data=&dummy[old_sz];
820 for (
size_t j=0; j<len; j++)
823 sz=fread(dummy+len,
sizeof(uint8_t), blocksize-len, f);
829 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
833 uint8_t* data=&dummy[old_sz];
836 for (
size_t j=0; j<len; j++)
844 SG_ERROR(
"Error in line %d - number of"
845 " dimensions is %d line is %d characters"
846 " long\n line_content:'%.*s'\n", lines,
847 dims, len, len, (
const char*) data);
858 lab->
set_label(lines, atof((
const char*) data));
865 uint8_t* start=&data[j];
872 feat[d].
feat_index=(int32_t) atoi((
const char*) start)-1;
873 num_features=
CMath::max(num_features, feat[d].feat_index+1);
879 if (data[j]==
' ' || data[j]==
'\n')
882 feat[d].
entry=(ST) atof((
const char*) start);
891 feat[dims-1].
entry=(ST) atof((
const char*) start);
899 sparse_feature_matrix[lines].vec_index=lines;
900 sparse_feature_matrix[lines].num_feat_entries=dims;
901 sparse_feature_matrix[lines].features=feat;
905 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t");
909 SG_INFO(
"file successfully read\n");
915 if (do_sort_features)
924 SG_ERROR(
"sort_features() not allowed with subset\n");
926 ASSERT(get_num_preprocessors()==0);
928 if (!sparse_feature_matrix)
929 SG_ERROR(
"Requires sparse feature matrix to be available in-memory\n");
931 for (int32_t i=0; i<num_vectors; i++)
933 int32_t len=sparse_feature_matrix[i].num_feat_entries;
939 int32_t* feat_idx=
SG_MALLOC(int32_t, len);
940 int32_t* orig_idx=
SG_MALLOC(int32_t, len);
942 for (
int j=0; j<len; j++)
951 for (
int j=0; j<len; j++)
952 sf_new[j]=sf_orig[orig_idx[j]];
954 sparse_feature_matrix[i].features=sf_new;
957 for (
int j=0; j<len-1; j++)
958 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
970 SG_ERROR(
"write_svmlight_file() not allowed with subset\n");
977 FILE* f=fopen(fname,
"wb");
981 for (int32_t i=0; i<num; i++)
986 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
988 for (int32_t j=0; j<num_feat; j++)
991 fprintf(f,
"%d:%f ", (int32_t) vec[j].feat_index+1, (
double) vec[j].entry);
993 fprintf(f,
"%d:%f\n", (int32_t) vec[j].feat_index+1, (
double) vec[j].entry);
1005 return num_features;
1022 free_sparse_feature_vector(avec, vec_idx1);
1023 sf->free_sparse_feature_vector(bvec, vec_idx2);
1030 if (vec2_len!=num_features)
1032 SG_ERROR(
"dimension of vec2 (=%d) does not match number of features (=%d)\n",
1033 vec2_len, num_features);
1045 free_sparse_feature_vector(sv, vec_idx1);
1052 if (vector_index>=get_num_vectors())
1054 SG_ERROR(
"Index out of bounds (number of vectors %d, you "
1055 "requested %d)\n", get_num_vectors(), vector_index);
1058 if (!sparse_feature_matrix)
1059 SG_ERROR(
"Requires a in-memory feature matrix\n");
1061 sparse_feature_iterator* it=
SG_MALLOC(sparse_feature_iterator, 1);
1062 it->sv=get_sparse_feature_vector(vector_index);
1070 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1071 if (!it || it->index>=it->sv.num_feat_entries)
1074 int32_t i=it->index++;
1076 index=it->sv.features[i].feat_index;
1077 value=(
float64_t) it->sv.features[i].entry;
1087 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1088 free_sparse_feature_vector(it->sv, it->sv.vec_index);
1095 get_dim_feature_space());
1111 free_sparse_feature_vector(current, index);
1130 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
1131 "sparse_feature_matrix",
1132 "Array of sparse vectors.");
1133 m_parameters->add(&num_features,
"num_features",
1134 "Total number of features.");
1137 #define GET_FEATURE_TYPE(sg_type, f_type) \
1138 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
1155 #undef GET_FEATURE_TYPE
1157 #define LOAD(fname, sg_type) \
1158 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
1163 SGSparseVector<sg_type>* matrix=NULL; \
1164 int32_t num_feat=0; \
1165 int32_t num_vec=0; \
1166 loader->fname(matrix, num_feat, num_vec); \
1167 set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
1170 LOAD(get_sparse_matrix,
bool)
1171 LOAD(get_sparse_matrix,
char)
1172 LOAD(get_sparse_matrix, uint8_t)
1173 LOAD(get_int8_sparsematrix, int8_t)
1174 LOAD(get_sparse_matrix, int16_t)
1175 LOAD(get_sparse_matrix, uint16_t)
1176 LOAD(get_sparse_matrix, int32_t)
1177 LOAD(get_uint_sparsematrix, uint32_t)
1178 LOAD(get_long_sparsematrix, int64_t)
1179 LOAD(get_ulong_sparsematrix, uint64_t)
1180 LOAD(get_sparse_matrix, float32_t)
1181 LOAD(get_sparse_matrix, float64_t)
1182 LOAD(get_longreal_sparsematrix, floatmax_t)
1185 #define WRITE(fname, sg_type) \
1186 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
1189 SG_ERROR("save() not allowed with subset\n"); \
1192 writer->fname(sparse_feature_matrix, num_features, num_vectors); \
1195 WRITE(set_sparse_matrix,
bool)
1196 WRITE(set_sparse_matrix,
char)
1197 WRITE(set_sparse_matrix, uint8_t)
1198 WRITE(set_int8_sparsematrix, int8_t)
1199 WRITE(set_sparse_matrix, int16_t)
1200 WRITE(set_sparse_matrix, uint16_t)
1201 WRITE(set_sparse_matrix, int32_t)
1202 WRITE(set_uint_sparsematrix, uint32_t)
1203 WRITE(set_long_sparsematrix, int64_t)
1204 WRITE(set_ulong_sparsematrix, uint64_t)
1205 WRITE(set_sparse_matrix, float32_t)
1206 WRITE(set_sparse_matrix, float64_t)
1207 WRITE(set_longreal_sparsematrix, floatmax_t)
1210 template class CSparseFeatures<bool>;
1211 template class CSparseFeatures<char>;
1212 template class CSparseFeatures<int8_t>;
1213 template class CSparseFeatures<uint8_t>;
1214 template class CSparseFeatures<int16_t>;
1215 template class CSparseFeatures<uint16_t>;
1216 template class CSparseFeatures<int32_t>;
1217 template class CSparseFeatures<uint32_t>;
1218 template class CSparseFeatures<int64_t>;
1219 template class CSparseFeatures<uint64_t>;
1220 template class CSparseFeatures<float32_t>;
1221 template class CSparseFeatures<float64_t>;
1222 template class CSparseFeatures<floatmax_t>;
int32_t get_num_features()
template class SGSparseMatrix
virtual bool get_next_feature(int32_t &index, float64_t &value, void *iterator)
#define LOAD(fname, sg_type)
virtual EFeatureType get_feature_type()=0
bool set_label(int32_t idx, float64_t label)
virtual CFeatures * duplicate() const
The class Labels models labels, i.e. class assignments of objects.
void free_feature_vector(SGSparseVector< ST > vec, int32_t num)
static ST sparse_dot(ST alpha, SGSparseVectorEntry< ST > *avec, int32_t alen, SGSparseVectorEntry< ST > *bvec, int32_t blen)
CLabels * load_svmlight_file(char *fname, bool do_sort_features=true)
Template class SparsePreprocessor, base class for preprocessors (cf. CPreprocessor) that apply to CSp...
static void qsort_index(T1 *output, T2 *index, uint32_t size)
virtual ~CSparseFeatures()
#define WRITE(fname, sg_type)
int32_t num_vectors
total number of vectors
void get_feature_matrix(ST **dst, int32_t *num_feat, int32_t *num_vec)
Template class SparseFeatures implements sparse matrices.
virtual float64_t dot(int32_t vec_idx1, CDotFeatures *df, int32_t vec_idx2)
void set_sparse_feature_matrix(SGSparseMatrix< ST > sm)
int64_t get_num_nonzero_entries()
SGSparseVector< T > * sparse_matrix
array of sparse vectors of size num_vectors
ST * get_full_feature_vector(int32_t num, int32_t &len)
#define SG_NOTIMPLEMENTED
#define GET_FEATURE_TYPE(sg_type, f_type)
float64_t compute_squared_norm(CSparseFeatures< float64_t > *lhs, float64_t *sq_lhs, int32_t idx_a, CSparseFeatures< float64_t > *rhs, float64_t *sq_rhs, int32_t idx_b)
virtual void free_feature_iterator(void *iterator)
index_t num_vectors
total number of vectors
virtual int32_t get_nnz_features_for_vector(int32_t num)
virtual int32_t get_dim_feature_space() const
ST dense_dot(ST alpha, int32_t num, ST *vec, int32_t dim, ST b)
Features that support dot products among other operations.
bool obtain_from_simple(CSimpleFeatures< ST > *sf)
EFeatureClass
shogun feature class
void free_sparse_feature_matrix()
template class SGSparseVector
SGSparseMatrix< ST > get_sparse_feature_matrix()
int32_t get_int_label(int32_t idx)
virtual int32_t get_num_vectors() const
virtual EFeatureClass get_feature_class()=0
void free_sparse_features()
virtual void * get_feature_iterator(int32_t vector_index)
CSparseFeatures< ST > * get_transposed()
virtual SGSparseVectorEntry< ST > * compute_sparse_feature_vector(int32_t num, int32_t &len, SGSparseVectorEntry< ST > *target=NULL)
SGMatrix< ST > get_full_feature_matrix()
A File access base class.
virtual bool set_full_feature_matrix(SGMatrix< ST > full)
index_t num_features
total number of features
static T max(T a, T b)
return the maximum of two integers
SGSparseVector< ST > * sparse_feature_matrix
array of sparse vectors of size num_vectors
virtual EFeatureClass get_feature_class()
bool write_svmlight_file(char *fname, CLabels *label)
void add_to_dense_vec(float64_t alpha, int32_t num, float64_t *vec, int32_t dim, bool abs_val=false)
virtual bool apply_preprocessor(bool force_preprocessing=false)
float64_t * compute_squared(float64_t *sq)
The class Features is the base class of all feature objects.
ST get_feature(int32_t num, int32_t index)
The class SimpleFeatures implements dense feature matrices.
int32_t set_num_features(int32_t num)
virtual CFeatures * copy_subset(SGVector< index_t > indices)
void free_sparse_feature_vector(SGSparseVector< ST > vec, int32_t num)
SGSparseVector< ST > get_sparse_feature_vector(int32_t num)
CSparseFeatures(int32_t size=0)
#define SG_MALLOC(type, len)
virtual int32_t get_size()
static void clean_tsparse(SGSparseVector< ST > *sfm, int32_t num_vec)
static T abs(T a)
return the absolute value of a number
SGSparseVectorEntry< T > * features