SHOGUN
v1.1.0
|
00001 /* 00002 * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights 00003 * embodied in the content of this file are licensed under the BSD 00004 * (revised) open source license. 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * Written (W) 2011 Shashwat Lal Das 00012 * Adaptation of Vowpal Wabbit v5.1. 00013 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society. 00014 */ 00015 00016 #include <shogun/features/StreamingVwFeatures.h> 00017 00018 using namespace shogun; 00019 00020 CStreamingVwFeatures::CStreamingVwFeatures() : CStreamingDotFeatures() 00021 { 00022 init(); 00023 set_read_functions(); 00024 } 00025 00026 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwFile* file, 00027 bool is_labelled, int32_t size) 00028 : CStreamingDotFeatures() 00029 { 00030 init(file, is_labelled, size); 00031 set_read_functions(); 00032 } 00033 00034 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwCacheFile* file, 00035 bool is_labelled, int32_t size) 00036 : CStreamingDotFeatures() 00037 { 00038 init(file, is_labelled, size); 00039 set_read_functions(); 00040 } 00041 00042 CStreamingVwFeatures::~CStreamingVwFeatures() 00043 { 00044 parser.end_parser(); 00045 SG_UNREF(env); 00046 } 00047 00048 CFeatures* CStreamingVwFeatures::duplicate() const 00049 { 00050 return new CStreamingVwFeatures(*this); 00051 } 00052 00053 void CStreamingVwFeatures::set_vector_reader() 00054 { 00055 parser.set_read_vector(&CStreamingFile::get_vector); 00056 } 00057 00058 void CStreamingVwFeatures::set_vector_and_label_reader() 00059 { 00060 parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label); 00061 } 00062 00063 void CStreamingVwFeatures::reset_stream() 00064 { 00065 if (working_file->is_seekable()) 00066 { 00067 working_file->reset_stream(); 00068 parser.exit_parser(); 00069 parser.init(working_file, has_labels, parser.get_ring_size()); 00070 parser.set_free_vector_after_release(false); 00071 parser.start_parser(); 00072 } 00073 else 00074 SG_ERROR("The input cannot be reset! Please use 1 pass.\n"); 00075 } 00076 00077 CVwEnvironment* CStreamingVwFeatures::get_env() 00078 { 00079 SG_REF(env); 00080 return env; 00081 } 00082 00083 void CStreamingVwFeatures::set_env(CVwEnvironment* vw_env) 00084 { 00085 env = vw_env; 00086 SG_REF(env); 00087 } 00088 00089 void CStreamingVwFeatures::expand_if_required(float32_t*& vec, int32_t& len) 00090 { 00091 int32_t dim = 1 << env->num_bits; 00092 if (dim > len) 00093 { 00094 vec = SG_REALLOC(float32_t, vec, dim); 00095 memset(&vec[len], 0, (dim-len) * sizeof(float32_t)); 00096 len = dim; 00097 } 00098 } 00099 00100 void CStreamingVwFeatures::expand_if_required(float64_t*& vec, int32_t& len) 00101 { 00102 int32_t dim = 1 << env->num_bits; 00103 if (dim > len) 00104 { 00105 vec = SG_REALLOC(float64_t, vec, dim); 00106 memset(&vec[len], 0, (dim-len) * sizeof(float64_t)); 00107 len = dim; 00108 } 00109 } 00110 00111 float32_t CStreamingVwFeatures::real_weight(float32_t w, float32_t gravity) 00112 { 00113 float32_t wprime = 0; 00114 if (gravity < fabsf(w)) 00115 wprime = CMath::sign(w)*(fabsf(w) - gravity); 00116 return wprime; 00117 } 00118 00119 int32_t CStreamingVwFeatures::get_nnz_features_for_vector() 00120 { 00121 return current_length; 00122 } 00123 00124 int32_t CStreamingVwFeatures::get_num_vectors() const 00125 { 00126 if (current_example) 00127 return 1; 00128 else 00129 return 0; 00130 } 00131 00132 int32_t CStreamingVwFeatures::get_size() 00133 { 00134 return sizeof(VwExample); 00135 } 00136 00137 EFeatureType CStreamingVwFeatures::get_feature_type() 00138 { 00139 return F_DREAL; 00140 } 00141 00142 void CStreamingVwFeatures::init() 00143 { 00144 working_file=NULL; 00145 seekable=false; 00146 current_length=-1; 00147 current_example=NULL; 00148 00149 example_count = 0; 00150 } 00151 00152 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size) 00153 { 00154 init(); 00155 has_labels = is_labelled; 00156 working_file = file; 00157 parser.init(file, is_labelled, size); 00158 parser.set_free_vector_after_release(false); 00159 seekable=false; 00160 00161 // Get environment from the StreamingVwFile 00162 env = ((CStreamingVwFile*) file)->get_env(); 00163 SG_REF(env); 00164 } 00165 00166 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size) 00167 { 00168 init(); 00169 has_labels = is_labelled; 00170 working_file = file; 00171 parser.init(file, is_labelled, size); 00172 parser.set_free_vector_after_release(false); 00173 seekable=true; 00174 00175 // Get environment from the StreamingVwFile 00176 env = ((CStreamingVwCacheFile*) file)->get_env(); 00177 SG_REF(env); 00178 } 00179 00180 void CStreamingVwFeatures::setup_example(VwExample* ae) 00181 { 00182 ae->pass = env->passes_complete; 00183 ae->num_features = 0; 00184 ae->total_sum_feat_sq = 1; 00185 ae->example_counter = ++example_count; 00186 ae->global_weight = ae->ld->weight; 00187 env->t += ae->global_weight; 00188 ae->example_t = env->t; 00189 00190 // If some namespaces should be ignored, remove them 00191 if (env->ignore_some) 00192 { 00193 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++) 00194 if (env->ignore[*i]) 00195 { 00196 ae->atomics[*i].erase(); 00197 memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t)); 00198 ae->indices.end--; 00199 i--; 00200 } 00201 } 00202 00203 // Add constant feature 00204 vw_size_t constant_namespace = 128; 00205 VwFeature temp = {1,constant_hash & env->mask}; 00206 ae->indices.push(constant_namespace); 00207 ae->atomics[constant_namespace].push(temp); 00208 ae->sum_feat_sq[constant_namespace] = 0; 00209 00210 if(env->stride != 1) 00211 { 00212 // Make room for per-feature information. 00213 vw_size_t stride = env->stride; 00214 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++) 00215 for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++) 00216 j->weight_index = j->weight_index*stride; 00217 } 00218 00219 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++) 00220 { 00221 ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin; 00222 ae->total_sum_feat_sq += ae->sum_feat_sq[*i]; 00223 } 00224 00225 // For quadratic features 00226 for (int32_t k = 0; k < env->pairs.get_num_elements(); k++) 00227 { 00228 char* i = env->pairs.get_element(k); 00229 00230 ae->num_features 00231 += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin) 00232 *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin); 00233 00234 ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])]; 00235 } 00236 } 00237 00238 void CStreamingVwFeatures::start_parser() 00239 { 00240 if (!parser.is_running()) 00241 parser.start_parser(); 00242 } 00243 00244 void CStreamingVwFeatures::end_parser() 00245 { 00246 parser.end_parser(); 00247 } 00248 00249 bool CStreamingVwFeatures::get_next_example() 00250 { 00251 bool ret_value; 00252 ret_value = (bool) parser.get_next_example(current_example, 00253 current_length, 00254 current_label); 00255 if (current_length < 1) 00256 return false; 00257 00258 if (ret_value) 00259 setup_example(current_example); 00260 else 00261 return false; 00262 00263 current_label = current_example->ld->label; 00264 current_length = current_example->num_features; 00265 00266 return ret_value; 00267 } 00268 00269 VwExample* CStreamingVwFeatures::get_example() 00270 { 00271 return current_example; 00272 } 00273 00274 float64_t CStreamingVwFeatures::get_label() 00275 { 00276 ASSERT(has_labels); 00277 00278 return current_label; 00279 } 00280 00281 void CStreamingVwFeatures::release_example() 00282 { 00283 env->example_number++; 00284 env->weighted_examples += current_example->ld->weight; 00285 00286 if (current_example->ld->label == FLT_MAX) 00287 env->weighted_labels += 0; 00288 else 00289 env->weighted_labels += current_example->ld->label * current_example->ld->weight; 00290 00291 env->total_features += current_example->num_features; 00292 env->sum_loss += current_example->loss; 00293 00294 current_example->reset_members(); 00295 parser.finalize_example(); 00296 } 00297 00298 int32_t CStreamingVwFeatures::get_dim_feature_space() const 00299 { 00300 return current_length; 00301 } 00302 00303 float32_t CStreamingVwFeatures::dot(CStreamingDotFeatures* df) 00304 { 00305 SG_NOTIMPLEMENTED; 00306 return CMath::INFTY; 00307 } 00308 00309 float32_t CStreamingVwFeatures::dense_dot(VwExample* &ex, const float32_t* vec2) 00310 { 00311 float32_t ret = 0.; 00312 for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++) 00313 { 00314 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++) 00315 ret += vec2[f->weight_index & env->thread_mask] * f->x; 00316 } 00317 return ret; 00318 } 00319 00320 float32_t CStreamingVwFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len) 00321 { 00322 return dense_dot(current_example, vec2); 00323 } 00324 00325 float32_t CStreamingVwFeatures::dense_dot(SGSparseVector<float32_t>* vec1, const float32_t* vec2) 00326 { 00327 float32_t ret = 0.; 00328 for (int32_t i = 0; i < vec1->num_feat_entries; i++) 00329 ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask]; 00330 00331 return ret; 00332 } 00333 00334 float32_t CStreamingVwFeatures::dense_dot_truncated(const float32_t* vec2, VwExample* &ex, float32_t gravity) 00335 { 00336 float32_t ret = 0.; 00337 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++) 00338 { 00339 for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++) 00340 { 00341 float32_t w = vec2[f->weight_index & env->thread_mask]; 00342 float32_t wprime = real_weight(w,gravity); 00343 ret += wprime*f->x; 00344 } 00345 } 00346 00347 return ret; 00348 } 00349 00350 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val) 00351 { 00352 if (abs_val) 00353 { 00354 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++) 00355 { 00356 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++) 00357 vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x); 00358 } 00359 } 00360 else 00361 { 00362 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++) 00363 { 00364 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++) 00365 vec2[f->weight_index & env->thread_mask] += alpha * f->x; 00366 } 00367 } 00368 } 00369 00370 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val) 00371 { 00372 add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val); 00373 } 00374 00375 int32_t CStreamingVwFeatures::get_num_features() 00376 { 00377 return current_length; 00378 } 00379 00380 EFeatureClass CStreamingVwFeatures::get_feature_class() 00381 { 00382 return C_STREAMING_VW; 00383 }