SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/features/ImplicitWeightedSpecFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 00014 using namespace shogun; 00015 00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures() 00017 :CDotFeatures() 00018 { 00019 SG_UNSTABLE("CImplicitWeightedSpecFeatures::" 00020 "CImplicitWeightedSpecFeatures()", "\n"); 00021 00022 strings = NULL; 00023 normalization_factors = NULL; 00024 num_strings = 0; 00025 alphabet_size = 0; 00026 00027 degree = 0; 00028 spec_size = 0; 00029 spec_weights = 0; 00030 } 00031 00032 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures() 00033 { 00034 ASSERT(str); 00035 strings=str; 00036 SG_REF(strings) 00037 normalization_factors=NULL; 00038 spec_weights=NULL; 00039 num_strings = str->get_num_vectors(); 00040 alphabet_size = str->get_original_num_symbols(); 00041 degree=str->get_order(); 00042 set_wd_weights(); 00043 00044 SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size, 00045 spec_size, num_strings); 00046 00047 if (normalize) 00048 compute_normalization_const(); 00049 } 00050 00051 void CImplicitWeightedSpecFeatures::compute_normalization_const() 00052 { 00053 float64_t* factors=SG_MALLOC(float64_t, num_strings); 00054 00055 for (int32_t i=0; i<num_strings; i++) 00056 factors[i]=1.0/CMath::sqrt(dot(i, this, i)); 00057 00058 normalization_factors=factors; 00059 //CMath::display_vector(normalization_factors, num_strings, "n"); 00060 } 00061 00062 bool CImplicitWeightedSpecFeatures::set_wd_weights() 00063 { 00064 SG_FREE(spec_weights); 00065 spec_weights=SG_MALLOC(float64_t, degree); 00066 00067 int32_t i; 00068 float64_t sum=0; 00069 spec_size=0; 00070 00071 for (i=0; i<degree; i++) 00072 { 00073 spec_size+=CMath::pow(alphabet_size, i+1); 00074 spec_weights[i]=degree-i; 00075 sum+=spec_weights[i]; 00076 } 00077 for (i=0; i<degree; i++) 00078 spec_weights[i]=CMath::sqrt(spec_weights[i]/sum); 00079 00080 return spec_weights!=NULL; 00081 } 00082 00083 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d) 00084 { 00085 ASSERT(d==degree); 00086 00087 SG_FREE(spec_weights); 00088 spec_weights=SG_MALLOC(float64_t, degree); 00089 for (int32_t i=0; i<degree; i++) 00090 spec_weights[i]=CMath::sqrt(w[i]); 00091 return true; 00092 } 00093 00094 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig), 00095 num_strings(orig.num_strings), 00096 alphabet_size(orig.alphabet_size), spec_size(orig.spec_size) 00097 { 00098 SG_NOTIMPLEMENTED; 00099 SG_REF(strings); 00100 } 00101 00102 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures() 00103 { 00104 SG_UNREF(strings); 00105 SG_FREE(spec_weights); 00106 SG_FREE(normalization_factors); 00107 } 00108 00109 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00110 { 00111 ASSERT(df); 00112 ASSERT(df->get_feature_type() == get_feature_type()); 00113 ASSERT(df->get_feature_class() == get_feature_class()); 00114 CImplicitWeightedSpecFeatures* sf = (CImplicitWeightedSpecFeatures*) df; 00115 00116 ASSERT(vec_idx1 < num_strings); 00117 ASSERT(vec_idx2 < sf->get_num_vectors()); 00118 00119 int32_t len1=-1; 00120 int32_t len2=-1; 00121 bool free_vec1; 00122 bool free_vec2; 00123 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00124 uint16_t* vec2=sf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00125 00126 float64_t result=0; 00127 uint8_t mask=0; 00128 00129 for (int32_t d=0; d<degree; d++) 00130 { 00131 mask = mask | (1 << (degree-d-1)); 00132 uint16_t masked=strings->get_masked_symbols(0xffff, mask); 00133 00134 int32_t left_idx=0; 00135 int32_t right_idx=0; 00136 float64_t weight=spec_weights[d]*spec_weights[d]; 00137 00138 while (left_idx < len1 && right_idx < len2) 00139 { 00140 uint16_t lsym=vec1[left_idx] & masked; 00141 uint16_t rsym=vec2[right_idx] & masked; 00142 00143 if (lsym == rsym) 00144 { 00145 int32_t old_left_idx=left_idx; 00146 int32_t old_right_idx=right_idx; 00147 00148 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym) 00149 left_idx++; 00150 00151 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym) 00152 right_idx++; 00153 00154 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx); 00155 } 00156 else if (lsym<rsym) 00157 left_idx++; 00158 else 00159 right_idx++; 00160 } 00161 } 00162 00163 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00164 sf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00165 00166 if (normalization_factors) 00167 return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2]; 00168 else 00169 return result; 00170 } 00171 00172 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 00173 { 00174 ASSERT(vec2_len == spec_size); 00175 ASSERT(vec_idx1 < num_strings); 00176 00177 float64_t result=0; 00178 int32_t len1=-1; 00179 bool free_vec1; 00180 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00181 00182 if (vec1 && len1>0) 00183 { 00184 for (int32_t j=0; j<len1; j++) 00185 { 00186 uint8_t mask=0; 00187 int32_t offs=0; 00188 uint16_t v=*vec1++; 00189 00190 for (int32_t d=0; d<degree; d++) 00191 { 00192 mask = mask | (1 << (degree-d-1)); 00193 int32_t idx=strings->get_masked_symbols(v, mask); 00194 idx=strings->shift_symbol(idx, degree-d-1); 00195 result += vec2[offs + idx]*spec_weights[d]; 00196 offs+=strings->shift_offset(1,d+1); 00197 } 00198 } 00199 00200 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00201 00202 if (normalization_factors) 00203 result*=normalization_factors[vec_idx1]; 00204 } 00205 else 00206 SG_ERROR("huh?\n"); 00207 00208 return result; 00209 } 00210 00211 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00212 { 00213 int32_t len1=-1; 00214 bool free_vec1; 00215 uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00216 00217 if (normalization_factors) 00218 alpha*=normalization_factors[vec_idx1]; 00219 00220 if (vec && len1>0) 00221 { 00222 for (int32_t j=0; j<len1; j++) 00223 { 00224 uint8_t mask=0; 00225 int32_t offs=0; 00226 for (int32_t d=0; d<degree; d++) 00227 { 00228 mask = mask | (1 << (degree-d-1)); 00229 int32_t idx=strings->get_masked_symbols(vec[j], mask); 00230 idx=strings->shift_symbol(idx, degree-d-1); 00231 if (abs_val) 00232 vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]); 00233 else 00234 vec2[offs + idx] += alpha*spec_weights[d]; 00235 offs+=strings->shift_offset(1,d+1); 00236 } 00237 } 00238 } 00239 00240 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00241 } 00242 00243 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const 00244 { 00245 return new CImplicitWeightedSpecFeatures(*this); 00246 } 00247 00248 int32_t CImplicitWeightedSpecFeatures::get_dim_feature_space() const 00249 { 00250 return spec_size; 00251 } 00252 00253 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index) 00254 { 00255 if (vector_index>=num_strings) 00256 { 00257 SG_ERROR("Index out of bounds (number of strings %d, you " 00258 "requested %d)\n", num_strings, vector_index); 00259 } 00260 00261 wspec_feature_iterator* it=SG_MALLOC(wspec_feature_iterator, 1); 00262 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00263 it->vidx=vector_index; 00264 00265 it->offs=0; 00266 it->d=0; 00267 it->j=0; 00268 it->mask=0; 00269 it->alpha=normalization_factors[vector_index]; 00270 00271 return it; 00272 } 00273 00274 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00275 { 00276 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator; 00277 00278 if (it->d>=degree) 00279 { 00280 if (it->j < it->vlen-1) 00281 { 00282 it->j++; 00283 it->d=0; 00284 it->mask=0; 00285 it->offs=0; 00286 } 00287 else 00288 return false; 00289 } 00290 00291 int32_t d=it->d; 00292 00293 it->mask = it->mask | (1 << (degree-d-1)); 00294 int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask); 00295 idx=strings->shift_symbol(idx, degree-d-1); 00296 value=it->alpha*spec_weights[d]; 00297 index=it->offs + idx; 00298 it->offs+=strings->shift_offset(1,d+1); 00299 00300 it->d=d+1; 00301 return true; 00302 } 00303 00304 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator) 00305 { 00306 ASSERT(iterator); 00307 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator; 00308 strings->free_feature_vector(it->vec, it->vidx, it->vfree); 00309 SG_FREE(it); 00310 } 00311 00312 00313 int32_t CImplicitWeightedSpecFeatures::get_nnz_features_for_vector(int32_t num) 00314 { 00315 int32_t vlen=-1; 00316 bool free_vec; 00317 uint16_t* vec1=strings->get_feature_vector(num, vlen, free_vec); 00318 strings->free_feature_vector(vec1, num, free_vec); 00319 int32_t nnz=0; 00320 for (int32_t i=1; i<=degree; i++) 00321 nnz+=CMath::min(CMath::pow(alphabet_size,i), vlen); 00322 return nnz; 00323 } 00324 00325 EFeatureType CImplicitWeightedSpecFeatures::get_feature_type() 00326 { 00327 return F_UNKNOWN; 00328 } 00329 00330 EFeatureClass CImplicitWeightedSpecFeatures::get_feature_class() 00331 { 00332 return C_WEIGHTEDSPEC; 00333 } 00334 00335 int32_t CImplicitWeightedSpecFeatures::get_num_vectors() const 00336 { 00337 return num_strings; 00338 } 00339 00340 int32_t CImplicitWeightedSpecFeatures::get_size() 00341 { 00342 return sizeof(float64_t); 00343 }