// SHOGUN v1.1.0
00001 #include <shogun/features/StreamingSparseFeatures.h> 00002 namespace shogun 00003 { 00004 00005 template <class T> 00006 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures() 00007 { 00008 set_read_functions(); 00009 init(); 00010 } 00011 00012 template <class T> 00013 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file, 00014 bool is_labelled, 00015 int32_t size) 00016 : CStreamingDotFeatures() 00017 { 00018 set_read_functions(); 00019 init(file, is_labelled, size); 00020 } 00021 00022 template <class T> 00023 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures() 00024 { 00025 parser.end_parser(); 00026 } 00027 00028 template <class T> 00029 T CStreamingSparseFeatures<T>::get_feature(int32_t index) 00030 { 00031 ASSERT(index>=0 && index<current_num_features); 00032 00033 T ret=0; 00034 00035 if (current_vector) 00036 { 00037 for (int32_t i=0; i<current_length; i++) 00038 if (current_vector[i].feat_index==index) 00039 ret += current_vector[i].entry; 00040 } 00041 00042 return ret; 00043 } 00044 00045 template <class T> 00046 void CStreamingSparseFeatures<T>::reset_stream() 00047 { 00048 } 00049 00050 template <class T> 00051 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num) 00052 { 00053 int32_t n=current_num_features; 00054 ASSERT(n<=num); 00055 current_num_features=num; 00056 return n; 00057 } 00058 00059 template <class T> 00060 void CStreamingSparseFeatures<T>::expand_if_required(float32_t*& vec, int32_t &len) 00061 { 00062 int32_t dim = get_dim_feature_space(); 00063 if (dim > len) 00064 { 00065 vec = SG_REALLOC(float32_t, vec, dim); 00066 memset(&vec[len], 0, (dim-len) * sizeof(float32_t)); 00067 len = dim; 00068 } 00069 } 00070 00071 template <class T> 00072 void CStreamingSparseFeatures<T>::expand_if_required(float64_t*& vec, int32_t &len) 00073 { 00074 int32_t dim = get_dim_feature_space(); 00075 if (dim > len) 00076 { 00077 vec = SG_REALLOC(float64_t, vec, dim); 00078 
memset(&vec[len], 0, (dim-len) * sizeof(float64_t)); 00079 len = dim; 00080 } 00081 } 00082 00083 template <class T> 00084 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen) 00085 { 00086 T result=0; 00087 00088 //result remains zero when one of the vectors is non existent 00089 if (avec && bvec) 00090 { 00091 if (alen<=blen) 00092 { 00093 int32_t j=0; 00094 for (int32_t i=0; i<alen; i++) 00095 { 00096 int32_t a_feat_idx=avec[i].feat_index; 00097 00098 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) ) 00099 j++; 00100 00101 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) ) 00102 { 00103 result+= avec[i].entry * bvec[j].entry; 00104 j++; 00105 } 00106 } 00107 } 00108 else 00109 { 00110 int32_t j=0; 00111 for (int32_t i=0; i<blen; i++) 00112 { 00113 int32_t b_feat_idx=bvec[i].feat_index; 00114 00115 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) ) 00116 j++; 00117 00118 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) ) 00119 { 00120 result+= bvec[i].entry * avec[j].entry; 00121 j++; 00122 } 00123 } 00124 } 00125 00126 result*=alpha; 00127 } 00128 00129 return result; 00130 } 00131 00132 template <class T> 00133 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b) 00134 { 00135 ASSERT(vec); 00136 ASSERT(dim>=current_num_features); 00137 T result=b; 00138 00139 int32_t num_feat=current_length; 00140 SGSparseVectorEntry<T>* sv=current_vector; 00141 00142 if (sv) 00143 { 00144 for (int32_t i=0; i<num_feat; i++) 00145 result+=alpha*vec[sv[i].feat_index]*sv[i].entry; 00146 } 00147 00148 return result; 00149 } 00150 00151 template <class T> 00152 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len) 00153 { 00154 ASSERT(vec2); 00155 if (vec2_len < current_num_features) 00156 { 00157 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 00158 vec2_len, current_num_features); 00159 } 
00160 00161 float64_t result=0; 00162 if (current_vector) 00163 { 00164 for (int32_t i=0; i<current_length; i++) 00165 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry; 00166 } 00167 00168 return result; 00169 } 00170 00171 template <class T> 00172 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len) 00173 { 00174 ASSERT(vec2); 00175 if (vec2_len < current_num_features) 00176 { 00177 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 00178 vec2_len, current_num_features); 00179 } 00180 00181 float32_t result=0; 00182 if (current_vector) 00183 { 00184 for (int32_t i=0; i<current_length; i++) 00185 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry; 00186 } 00187 00188 return result; 00189 } 00190 00191 template <class T> 00192 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val) 00193 { 00194 ASSERT(vec2); 00195 if (vec2_len < current_num_features) 00196 { 00197 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00198 vec2_len, current_num_features); 00199 } 00200 00201 SGSparseVectorEntry<T>* sv=current_vector; 00202 int32_t num_feat=current_length; 00203 00204 if (sv) 00205 { 00206 if (abs_val) 00207 { 00208 for (int32_t i=0; i<num_feat; i++) 00209 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00210 } 00211 else 00212 { 00213 for (int32_t i=0; i<num_feat; i++) 00214 vec2[sv[i].feat_index]+= alpha*sv[i].entry; 00215 } 00216 } 00217 } 00218 00219 template <class T> 00220 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val) 00221 { 00222 ASSERT(vec2); 00223 if (vec2_len < current_num_features) 00224 { 00225 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00226 vec2_len, current_num_features); 00227 } 00228 00229 SGSparseVectorEntry<T>* sv=current_vector; 00230 int32_t 
num_feat=current_length; 00231 00232 if (sv) 00233 { 00234 if (abs_val) 00235 { 00236 for (int32_t i=0; i<num_feat; i++) 00237 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00238 } 00239 else 00240 { 00241 for (int32_t i=0; i<num_feat; i++) 00242 vec2[sv[i].feat_index]+= alpha*sv[i].entry; 00243 } 00244 } 00245 } 00246 00247 template <class T> 00248 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries() 00249 { 00250 return current_length; 00251 } 00252 00253 template <class T> 00254 float32_t CStreamingSparseFeatures<T>::compute_squared() 00255 { 00256 ASSERT(current_vector); 00257 00258 float32_t sq=0; 00259 00260 for (int32_t i=0; i<current_length; i++) 00261 sq += current_vector[i].entry * current_vector[i].entry; 00262 00263 return sq; 00264 } 00265 00266 template <class T> 00267 void CStreamingSparseFeatures<T>::sort_features() 00268 { 00269 ASSERT(current_vector); 00270 00271 SGSparseVectorEntry<T>* sf_orig=current_vector; 00272 int32_t len=current_length; 00273 00274 int32_t* feat_idx=SG_MALLOC(int32_t, len); 00275 int32_t* orig_idx=SG_MALLOC(int32_t, len); 00276 00277 for (int32_t i=0; i<len; i++) 00278 { 00279 feat_idx[i]=sf_orig[i].feat_index; 00280 orig_idx[i]=i; 00281 } 00282 00283 CMath::qsort_index(feat_idx, orig_idx, len); 00284 00285 SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len); 00286 00287 for (int32_t i=0; i<len; i++) 00288 sf_new[i]=sf_orig[orig_idx[i]]; 00289 00290 // sanity check 00291 for (int32_t i=0; i<len-1; i++) 00292 ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index); 00293 00294 // Copy new vector back to original 00295 for (int32_t i=0; i<len; i++) 00296 sf_orig[i]=sf_new[i]; 00297 00298 SG_FREE(orig_idx); 00299 SG_FREE(feat_idx); 00300 SG_FREE(sf_new); 00301 } 00302 00303 template <class T> 00304 CFeatures* CStreamingSparseFeatures<T>::duplicate() const 00305 { 00306 return new CStreamingSparseFeatures<T>(*this); 00307 } 00308 00309 template <class T> 00310 int32_t 
CStreamingSparseFeatures<T>::get_num_vectors() const 00311 { 00312 if (current_vector) 00313 return 1; 00314 return 0; 00315 } 00316 00317 template <class T> 00318 int32_t CStreamingSparseFeatures<T>::get_size() 00319 { 00320 return sizeof(T); 00321 } 00322 00323 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader() 00324 { 00325 parser.set_read_vector(&CStreamingFile::get_sparse_vector); 00326 } 00327 00328 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader() 00329 { 00330 parser.set_read_vector_and_label 00331 (&CStreamingFile::get_sparse_vector_and_label); 00332 } 00333 00334 #define GET_FEATURE_TYPE(f_type, sg_type) \ 00335 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() \ 00336 { \ 00337 return f_type; \ 00338 } 00339 00340 GET_FEATURE_TYPE(F_BOOL, bool) 00341 GET_FEATURE_TYPE(F_CHAR, char) 00342 GET_FEATURE_TYPE(F_BYTE, uint8_t) 00343 GET_FEATURE_TYPE(F_BYTE, int8_t) 00344 GET_FEATURE_TYPE(F_SHORT, int16_t) 00345 GET_FEATURE_TYPE(F_WORD, uint16_t) 00346 GET_FEATURE_TYPE(F_INT, int32_t) 00347 GET_FEATURE_TYPE(F_UINT, uint32_t) 00348 GET_FEATURE_TYPE(F_LONG, int64_t) 00349 GET_FEATURE_TYPE(F_ULONG, uint64_t) 00350 GET_FEATURE_TYPE(F_SHORTREAL, float32_t) 00351 GET_FEATURE_TYPE(F_DREAL, float64_t) 00352 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t) 00353 #undef GET_FEATURE_TYPE 00354 00355 00356 template <class T> 00357 void CStreamingSparseFeatures<T>::init() 00358 { 00359 working_file=NULL; 00360 current_vector=NULL; 00361 current_length=-1; 00362 current_vec_index=0; 00363 current_num_features=-1; 00364 } 00365 00366 template <class T> 00367 void CStreamingSparseFeatures<T>::init(CStreamingFile* file, 00368 bool is_labelled, 00369 int32_t size) 00370 { 00371 init(); 00372 has_labels = is_labelled; 00373 working_file = file; 00374 parser.init(file, is_labelled, size); 00375 } 00376 00377 template <class T> 00378 void CStreamingSparseFeatures<T>::start_parser() 00379 { 00380 if 
(!parser.is_running()) 00381 parser.start_parser(); 00382 } 00383 00384 template <class T> 00385 void CStreamingSparseFeatures<T>::end_parser() 00386 { 00387 parser.end_parser(); 00388 } 00389 00390 template <class T> 00391 bool CStreamingSparseFeatures<T>::get_next_example() 00392 { 00393 bool ret_value; 00394 ret_value = (bool) parser.get_next_example(current_vector, 00395 current_length, 00396 current_label); 00397 00398 if (!ret_value) 00399 return false; 00400 00401 // Update number of features based on highest index 00402 for (int32_t i=0; i<current_length; i++) 00403 { 00404 if (current_vector[i].feat_index > current_num_features) 00405 current_num_features = current_vector[i].feat_index+1; 00406 } 00407 current_vec_index++; 00408 00409 return true; 00410 } 00411 00412 template <class T> 00413 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector() 00414 { 00415 current_sgvector.features=current_vector; 00416 current_sgvector.num_feat_entries=current_length; 00417 current_sgvector.vec_index=current_vec_index; 00418 00419 return current_sgvector; 00420 } 00421 00422 template <class T> 00423 float64_t CStreamingSparseFeatures<T>::get_label() 00424 { 00425 ASSERT(has_labels); 00426 00427 return current_label; 00428 } 00429 00430 template <class T> 00431 void CStreamingSparseFeatures<T>::release_example() 00432 { 00433 parser.finalize_example(); 00434 } 00435 00436 template <class T> 00437 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const 00438 { 00439 return current_num_features; 00440 } 00441 00442 template <class T> 00443 float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df) 00444 { 00445 SG_NOTIMPLEMENTED; 00446 return -1; 00447 } 00448 00449 template <class T> 00450 int32_t CStreamingSparseFeatures<T>::get_num_features() 00451 { 00452 return current_num_features; 00453 } 00454 00455 template <class T> 00456 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector() 00457 { 00458 return current_length; 00459 } 
00460 00461 template <class T> 00462 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class() 00463 { 00464 return C_STREAMING_SPARSE; 00465 } 00466 00467 template class CStreamingSparseFeatures<bool>; 00468 template class CStreamingSparseFeatures<char>; 00469 template class CStreamingSparseFeatures<int8_t>; 00470 template class CStreamingSparseFeatures<uint8_t>; 00471 template class CStreamingSparseFeatures<int16_t>; 00472 template class CStreamingSparseFeatures<uint16_t>; 00473 template class CStreamingSparseFeatures<int32_t>; 00474 template class CStreamingSparseFeatures<uint32_t>; 00475 template class CStreamingSparseFeatures<int64_t>; 00476 template class CStreamingSparseFeatures<uint64_t>; 00477 template class CStreamingSparseFeatures<float32_t>; 00478 template class CStreamingSparseFeatures<float64_t>; 00479 template class CStreamingSparseFeatures<floatmax_t>; 00480 }