SHOGUN
v1.1.0
|
00001 /* 00002 * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights 00003 * embodied in the content of this file are licensed under the BSD 00004 * (revised) open source license. 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * Written (W) 2011 Shashwat Lal Das 00012 * Adaptation of Vowpal Wabbit v5.1. 00013 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society. 00014 */ 00015 00016 #include <shogun/classifier/vw/VwParser.h> 00017 #include <shogun/classifier/vw/cache/VwNativeCacheWriter.h> 00018 00019 using namespace shogun; 00020 00021 CVwParser::CVwParser() 00022 : CSGObject() 00023 { 00024 env = new CVwEnvironment(); 00025 hasher = CHash::MurmurHashString; 00026 write_cache = false; 00027 cache_writer = NULL; 00028 } 00029 00030 CVwParser::CVwParser(CVwEnvironment* env_to_use) 00031 : CSGObject() 00032 { 00033 ASSERT(env_to_use); 00034 00035 env = env_to_use; 00036 hasher = CHash::MurmurHashString; 00037 write_cache = false; 00038 cache_writer = NULL; 00039 SG_REF(env); 00040 } 00041 00042 CVwParser::~CVwParser() 00043 { 00044 SG_FREE(channels.begin); 00045 channels.begin = channels.end = channels.end_array = NULL; 00046 SG_FREE(words.begin); 00047 words.begin = words.end = words.end_array = NULL; 00048 SG_FREE(name.begin); 00049 name.begin = name.end = name.end_array = NULL; 00050 00051 SG_UNREF(env); 00052 SG_UNREF(cache_writer); 00053 } 00054 00055 int32_t CVwParser::read_features(CIOBuffer* buf, VwExample*& ae) 00056 { 00057 char *line=NULL; 00058 int32_t num_chars = buf->read_line(line); 00059 if (num_chars == 0) 00060 return num_chars; 00061 00062 /* Mark begin and end of example in the buffer */ 00063 substring example_string = {line, line + num_chars}; 00064 00065 /* Channels containing separate namespaces/label information*/ 00066 channels.erase(); 00067 00068 /* Split at '|' character */ 00069 tokenize('|', example_string, channels); 00070 00071 /* If first char is not '|', then the first channel contains label data */ 00072 substring* feature_start = &channels[1]; 00073 00074 if (*line == '|') 00075 feature_start = &channels[0]; /* Unlabelled data */ 00076 else 00077 { 00078 /* First channel has label info */ 00079 substring label_space = channels[0]; 00080 char* tab_location = safe_index(label_space.start, '\t', label_space.end); 00081 if (tab_location != label_space.end) 00082 label_space.start = tab_location+1; 00083 00084 /* Split the label space on spaces */ 00085 tokenize(' ',label_space,words); 00086 if (words.index() > 0 && words.last().end == label_space.end) //The last field is a tag, so record and strip it off 00087 { 00088 substring tag = words.pop(); 00089 ae->tag.push_many(tag.start, tag.end - tag.start); 00090 } 00091 00092 ae->ld->parse_label(words); 00093 set_minmax(ae->ld->label); 00094 } 00095 00096 vw_size_t mask = env->mask; 00097 00098 /* Now parse the individual channels, i.e., namespaces */ 00099 for (substring* i = feature_start; i != channels.end; i++) 00100 { 00101 substring channel = *i; 00102 00103 tokenize(' ',channel, words); 00104 if (words.begin == words.end) 00105 continue; 00106 00107 /* Set default scale value for channel */ 00108 float32_t channel_v = 1.; 00109 vw_size_t channel_hash; 00110 00111 /* Index by which to refer to the namespace */ 00112 vw_size_t index = 0; 00113 bool new_index = false; 00114 vw_size_t feature_offset = 0; 00115 00116 if (channel.start[0] != ' ') 00117 { 00118 /* Nonanonymous namespace specified */ 00119 feature_offset++; 00120 feature_value(words[0], name, channel_v); 00121 00122 if (name.index() > 0) 00123 { 00124 index = (unsigned char)(*name[0].start); 00125 if (ae->atomics[index].begin == ae->atomics[index].end) 00126 { 00127 ae->sum_feat_sq[index] = 0; 00128 new_index = true; 00129 } 00130 } 00131 channel_hash = hasher(name[0], hash_base); 00132 } 00133 else 00134 { 00135 /* Use default namespace with index below */ 00136 index = (unsigned char)' '; 00137 if (ae->atomics[index].begin == ae->atomics[index].end) 00138 { 00139 ae->sum_feat_sq[index] = 0; 00140 new_index = true; 00141 } 00142 channel_hash = 0; 00143 } 00144 00145 for (substring* j = words.begin+feature_offset; j != words.end; j++) 00146 { 00147 /* Get individual features and multiply by scale value */ 00148 float32_t v; 00149 feature_value(*j, name, v); 00150 v *= channel_v; 00151 00152 /* Hash feature */ 00153 vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask; 00154 VwFeature f = {v,word_hash}; 00155 ae->sum_feat_sq[index] += v*v; 00156 ae->atomics[index].push(f); 00157 } 00158 00159 /* Add index to list of indices if required */ 00160 if (new_index && ae->atomics[index].begin != ae->atomics[index].end) 00161 ae->indices.push(index); 00162 00163 } 00164 00165 if (write_cache) 00166 cache_writer->cache_example(ae); 00167 00168 return num_chars; 00169 } 00170 00171 int32_t CVwParser::read_svmlight_features(CIOBuffer* buf, VwExample*& ae) 00172 { 00173 char *line=NULL; 00174 int32_t num_chars = buf->read_line(line); 00175 if (num_chars == 0) 00176 return num_chars; 00177 00178 /* Mark begin and end of example in the buffer */ 00179 substring example_string = {line, line + num_chars}; 00180 00181 vw_size_t mask = env->mask; 00182 tokenize(' ', example_string, words); 00183 00184 ae->ld->label = float_of_substring(words[0]); 00185 ae->ld->weight = 1.; 00186 ae->ld->initial = 0.; 00187 set_minmax(ae->ld->label); 00188 00189 substring* feature_start = &words[1]; 00190 00191 vw_size_t index = (unsigned char)' '; // Any default namespace is ok 00192 vw_size_t channel_hash = 0; 00193 ae->sum_feat_sq[index] = 0; 00194 ae->indices.push(index); 00195 /* Now parse the individual features */ 00196 for (substring* i = feature_start; i != words.end; i++) 00197 { 00198 float32_t v; 00199 feature_value(*i, name, v); 00200 00201 vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask; 00202 VwFeature f = {v,word_hash}; 00203 ae->sum_feat_sq[index] += v*v; 00204 ae->atomics[index].push(f); 00205 } 00206 00207 if (write_cache) 00208 cache_writer->cache_example(ae); 00209 00210 return num_chars; 00211 } 00212 00213 int32_t CVwParser::read_dense_features(CIOBuffer* buf, VwExample*& ae) 00214 { 00215 char *line=NULL; 00216 int32_t num_chars = buf->read_line(line); 00217 if (num_chars == 0) 00218 return num_chars; 00219 00220 // Mark begin and end of example in the buffer 00221 substring example_string = {line, line + num_chars}; 00222 00223 vw_size_t mask = env->mask; 00224 tokenize(' ', example_string, words); 00225 00226 ae->ld->label = float_of_substring(words[0]); 00227 ae->ld->weight = 1.; 00228 ae->ld->initial = 0.; 00229 set_minmax(ae->ld->label); 00230 00231 substring* feature_start = &words[1]; 00232 00233 vw_size_t index = (unsigned char)' '; 00234 00235 ae->sum_feat_sq[index] = 0; 00236 ae->indices.push(index); 00237 // Now parse individual features 00238 int32_t j=0; 00239 for (substring* i = feature_start; i != words.end; i++) 00240 { 00241 float32_t v = float_of_substring(*i); 00242 vw_size_t word_hash = j & mask; 00243 VwFeature f = {v,word_hash}; 00244 ae->sum_feat_sq[index] += v*v; 00245 ae->atomics[index].push(f); 00246 j++; 00247 } 00248 00249 if (write_cache) 00250 cache_writer->cache_example(ae); 00251 00252 return num_chars; 00253 } 00254 00255 void CVwParser::init_cache(char * fname, EVwCacheType type) 00256 { 00257 char* file_name = fname; 00258 char default_cache_name[] = "vw_cache.dat.cache"; 00259 00260 if (!fname) 00261 file_name = default_cache_name; 00262 00263 write_cache = true; 00264 cache_type = type; 00265 00266 switch (type) 00267 { 00268 case C_NATIVE: 00269 cache_writer = new CVwNativeCacheWriter(file_name, env); 00270 return; 00271 case C_PROTOBUF: 00272 SG_ERROR("Protocol buffers cache support is not implemented yet.\n"); 00273 } 00274 00275 SG_ERROR("Unexpected cache type specified!\n"); 00276 } 00277 00278 void CVwParser::feature_value(substring &s, v_array<substring>& feat_name, float32_t &v) 00279 { 00280 // Get the value of the feature in the substring 00281 tokenize(':', s, feat_name); 00282 00283 switch (feat_name.index()) 00284 { 00285 // If feature value is not specified, assume 1.0 00286 case 0: 00287 case 1: 00288 v = 1.; 00289 break; 00290 case 2: 00291 v = float_of_substring(feat_name[1]); 00292 if (isnan(v)) 00293 SG_SERROR("error NaN value for feature %s! Terminating!\n", 00294 c_string_of_substring(feat_name[0])); 00295 break; 00296 default: 00297 SG_SERROR("Examples with a weird name, i.e., '%s'\n", 00298 c_string_of_substring(s)); 00299 } 00300 } 00301 00302 void CVwParser::tokenize(char delim, substring s, v_array<substring>& ret) 00303 { 00304 ret.erase(); 00305 char *last = s.start; 00306 for (; s.start != s.end; s.start++) 00307 { 00308 if (*s.start == delim) 00309 { 00310 if (s.start != last) 00311 { 00312 substring temp = {last,s.start}; 00313 ret.push(temp); 00314 } 00315 last = s.start+1; 00316 } 00317 } 00318 if (s.start != last) 00319 { 00320 substring final = {last, s.start}; 00321 ret.push(final); 00322 } 00323 }