SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/features/WDFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 00014 using namespace shogun; 00015 00016 CWDFeatures::CWDFeatures() :CDotFeatures() 00017 { 00018 SG_UNSTABLE("CWDFeatures::CWDFeatures() :CDotFeatures()", 00019 "\n"); 00020 00021 strings = NULL; 00022 00023 degree = 0; 00024 from_degree = 0; 00025 string_length = 0; 00026 num_strings = 0; 00027 alphabet_size = 0; 00028 w_dim = 0; 00029 wd_weights = NULL; 00030 normalization_const = 0.0; 00031 } 00032 00033 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str, 00034 int32_t order, int32_t from_order) : CDotFeatures() 00035 { 00036 ASSERT(str); 00037 ASSERT(str->have_same_length()); 00038 SG_REF(str); 00039 00040 strings=str; 00041 string_length=str->get_max_vector_length(); 00042 num_strings=str->get_num_vectors(); 00043 CAlphabet* alpha=str->get_alphabet(); 00044 alphabet_size=alpha->get_num_symbols(); 00045 SG_UNREF(alpha); 00046 00047 degree=order; 00048 from_degree=from_order; 00049 wd_weights=NULL; 00050 set_wd_weights(); 00051 set_normalization_const(); 00052 00053 } 00054 00055 CWDFeatures::CWDFeatures(const CWDFeatures& orig) 00056 : CDotFeatures(orig), strings(orig.strings), 00057 degree(orig.degree), from_degree(orig.from_degree), 00058 normalization_const(orig.normalization_const) 00059 { 00060 SG_REF(strings); 00061 string_length=strings->get_max_vector_length(); 00062 num_strings=strings->get_num_vectors(); 00063 CAlphabet* alpha=strings->get_alphabet(); 00064 alphabet_size=alpha->get_num_symbols(); 00065 SG_UNREF(alpha); 00066 00067 wd_weights=NULL; 00068 set_wd_weights(); 00069 } 00070 00071 CWDFeatures::~CWDFeatures() 00072 { 00073 SG_UNREF(strings); 00074 SG_FREE(wd_weights); 00075 } 00076 00077 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00078 { 00079 ASSERT(df); 00080 ASSERT(df->get_feature_type() == get_feature_type()); 00081 ASSERT(df->get_feature_class() == get_feature_class()); 00082 CWDFeatures* wdf = (CWDFeatures*) df; 00083 00084 int32_t len1, len2; 00085 bool free_vec1, free_vec2; 00086 00087 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00088 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00089 00090 ASSERT(len1==len2); 00091 00092 float64_t sum=0.0; 00093 00094 for (int32_t i=0; i<len1; i++) 00095 { 00096 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00097 { 00098 if (vec1[i+j]!=vec2[i+j]) 00099 break ; 00100 sum += wd_weights[j]*wd_weights[j]; 00101 } 00102 } 00103 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00104 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00105 return sum/CMath::sq(normalization_const); 00106 } 00107 00108 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 00109 { 00110 if (vec2_len != w_dim) 00111 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00112 00113 float64_t sum=0; 00114 int32_t lim=CMath::min(degree, string_length); 00115 int32_t len; 00116 bool free_vec1; 00117 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00118 int32_t* val=SG_MALLOC(int32_t, len); 00119 CMath::fill_vector(val, len, 0); 00120 00121 int32_t asize=alphabet_size; 00122 int32_t asizem1=1; 00123 int32_t offs=0; 00124 00125 for (int32_t k=0; k<lim; k++) 00126 { 00127 float64_t wd = wd_weights[k]; 00128 00129 int32_t o=offs; 00130 for (int32_t i=0; i+k < len; i++) 00131 { 00132 val[i]+=asizem1*vec[i+k]; 00133 sum+=vec2[val[i]+o]*wd; 00134 o+=asize; 00135 } 00136 offs+=asize*len; 00137 asize*=alphabet_size; 00138 asizem1*=alphabet_size; 00139 } 00140 SG_FREE(val); 00141 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00142 00143 return sum/normalization_const; 00144 } 00145 00146 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00147 { 00148 if (vec2_len != w_dim) 00149 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00150 00151 int32_t lim=CMath::min(degree, string_length); 00152 int32_t len; 00153 bool free_vec1; 00154 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00155 int32_t* val=SG_MALLOC(int32_t, len); 00156 CMath::fill_vector(val, len, 0); 00157 00158 int32_t asize=alphabet_size; 00159 int32_t asizem1=1; 00160 int32_t offs=0; 00161 00162 for (int32_t k=0; k<lim; k++) 00163 { 00164 float64_t wd = alpha*wd_weights[k]/normalization_const; 00165 00166 if (abs_val) 00167 wd=CMath::abs(wd); 00168 00169 int32_t o=offs; 00170 for (int32_t i=0; i+k < len; i++) 00171 { 00172 val[i]+=asizem1*vec[i+k]; 00173 vec2[val[i]+o]+=wd; 00174 o+=asize; 00175 } 00176 offs+=asize*len; 00177 asize*=alphabet_size; 00178 asizem1*=alphabet_size; 00179 } 00180 SG_FREE(val); 00181 00182 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00183 } 00184 00185 void CWDFeatures::set_wd_weights() 00186 { 00187 ASSERT(degree>0 && degree<=8); 00188 SG_FREE(wd_weights); 00189 wd_weights=SG_MALLOC(float64_t, degree); 00190 w_dim=0; 00191 00192 for (int32_t i=0; i<degree; i++) 00193 { 00194 w_dim+=CMath::pow(alphabet_size, i+1)*string_length; 00195 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00196 } 00197 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length); 00198 } 00199 00200 00201 void CWDFeatures::set_normalization_const(float64_t n) 00202 { 00203 if (n==0) 00204 { 00205 normalization_const=0; 00206 for (int32_t i=0; i<degree; i++) 00207 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00208 00209 normalization_const=CMath::sqrt(normalization_const); 00210 } 00211 else 00212 normalization_const=n; 00213 00214 SG_DEBUG("normalization_const:%f\n", normalization_const); 00215 } 00216 00217 void* CWDFeatures::get_feature_iterator(int32_t vector_index) 00218 { 00219 if (vector_index>=num_strings) 00220 { 00221 SG_ERROR("Index out of bounds (number of strings %d, you " 00222 "requested %d)\n", num_strings, vector_index); 00223 } 00224 00225 wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1); 00226 00227 it->lim=CMath::min(degree, string_length); 00228 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00229 it->vidx=vector_index; 00230 00231 it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00232 it->val=SG_MALLOC(int32_t, it->vlen); 00233 CMath::fill_vector(it->val, it->vlen, 0); 00234 00235 it->asize=alphabet_size; 00236 it->asizem1=1; 00237 it->offs=0; 00238 it->k=0; 00239 it->i=0; 00240 it->o=0; 00241 00242 return it; 00243 } 00244 00245 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00246 { 00247 wd_feature_iterator* it=(wd_feature_iterator*) iterator; 00248 00249 if (it->i + it->k >= it->vlen) 00250 { 00251 if (it->k < it->lim-1) 00252 { 00253 it->offs+=it->asize*it->vlen; 00254 it->asize*=alphabet_size; 00255 it->asizem1*=alphabet_size; 00256 it->k++; 00257 it->i=0; 00258 it->o=it->offs; 00259 } 00260 else 00261 return false; 00262 } 00263 00264 int32_t i=it->i; 00265 int32_t k=it->k; 00266 #ifdef DEBUG_WDFEATURES 00267 SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1); 00268 #endif 00269 00270 it->val[i]+=it->asizem1*it->vec[i+k]; 00271 value=wd_weights[k]/normalization_const; 00272 index=it->val[i]+it->o; 00273 #ifdef DEBUG_WDFEATURES 00274 SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen); 00275 #endif 00276 00277 it->o+=it->asize; 00278 it->i=i+1; 00279 00280 return true; 00281 } 00282 00283 void CWDFeatures::free_feature_iterator(void* iterator) 00284 { 00285 ASSERT(iterator); 00286 wd_feature_iterator* it=(wd_feature_iterator*) iterator; 00287 strings->free_feature_vector(it->vec, it->vidx, it->vfree); 00288 SG_FREE(it->val); 00289 SG_FREE(it); 00290 } 00291 00292 CFeatures* CWDFeatures::duplicate() const 00293 { 00294 return new CWDFeatures(*this); 00295 } 00296 00297 int32_t CWDFeatures::get_dim_feature_space() const 00298 { 00299 return w_dim; 00300 } 00301 00302 int32_t CWDFeatures::get_nnz_features_for_vector(int32_t num) 00303 { 00304 int32_t vlen=-1; 00305 bool free_vec; 00306 uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec); 00307 strings->free_feature_vector(vec, num, free_vec); 00308 return degree*vlen; 00309 } 00310 00311 EFeatureType CWDFeatures::get_feature_type() 00312 { 00313 return F_UNKNOWN; 00314 } 00315 00316 EFeatureClass CWDFeatures::get_feature_class() 00317 { 00318 return C_WD; 00319 } 00320 00321 int32_t CWDFeatures::get_num_vectors() const 00322 { 00323 return num_strings; 00324 } 00325 00326 int32_t CWDFeatures::get_size() 00327 { 00328 return sizeof(float64_t); 00329 } 00330 00331 float64_t CWDFeatures::get_normalization_const() 00332 { 00333 return normalization_const; 00334 } 00335 00336 void CWDFeatures::set_wd_weights(SGVector<float64_t> weights) 00337 { 00338 ASSERT(weights.vlen==degree); 00339 00340 for (int32_t i=0; i<degree; i++) 00341 wd_weights[i]=weights.vector[i]; 00342 } 00343