SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/lib/common.h> 00013 #include <shogun/io/SGIO.h> 00014 #include <shogun/lib/Signal.h> 00015 #include <shogun/lib/Trie.h> 00016 #include <shogun/base/Parameter.h> 00017 #include <shogun/base/Parallel.h> 00018 00019 #include <shogun/kernel/WeightedDegreeStringKernel.h> 00020 #include <shogun/kernel/FirstElementKernelNormalizer.h> 00021 #include <shogun/features/Features.h> 00022 #include <shogun/features/StringFeatures.h> 00023 00024 #ifndef WIN32 00025 #include <pthread.h> 00026 #endif 00027 00028 using namespace shogun; 00029 00030 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00031 struct S_THREAD_PARAM 00032 { 00033 00034 int32_t* vec; 00035 float64_t* result; 00036 float64_t* weights; 00037 CWeightedDegreeStringKernel* kernel; 00038 CTrie<DNATrie>* tries; 00039 float64_t factor; 00040 int32_t j; 00041 int32_t start; 00042 int32_t end; 00043 int32_t length; 00044 int32_t* vec_idx; 00045 }; 00046 #endif // DOXYGEN_SHOULD_SKIP_THIS 00047 00048 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel () 00049 : CStringKernel<char>() 00050 { 00051 init(); 00052 } 00053 00054 00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ( 00056 int32_t d, EWDKernType t) 00057 : CStringKernel<char>() 00058 { 00059 init(); 00060 00061 degree=d; 00062 type=t; 00063 00064 if (type!=E_EXTERNAL) 00065 set_wd_weights_by_type(type); 00066 } 00067 00068 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ( 00069 float64_t *w, int32_t d) 00070 : CStringKernel<char>(10) 00071 { 00072 init(); 00073 00074 type=E_EXTERNAL; 00075 degree=d; 00076 00077 weights=SG_MALLOC(float64_t, degree*(1+max_mismatch)); 00078 weights_degree=degree; 00079 weights_length=(1+max_mismatch); 00080 00081 for (int32_t i=0; i<degree*(1+max_mismatch); i++) 00082 weights[i]=w[i]; 00083 } 00084 00085 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel( 00086 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d) 00087 : CStringKernel<char>(10) 00088 { 00089 init(); 00090 degree=d; 00091 type=E_WD; 00092 set_wd_weights_by_type(type); 00093 set_normalizer(new CFirstElementKernelNormalizer()); 00094 init(l, r); 00095 } 00096 00097 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel() 00098 { 00099 cleanup(); 00100 00101 SG_FREE(weights); 00102 weights=NULL; 00103 weights_degree=0; 00104 weights_length=0; 00105 00106 SG_FREE(block_weights); 00107 block_weights=NULL; 00108 00109 SG_FREE(position_weights); 00110 position_weights=NULL; 00111 00112 SG_FREE(weights_buffer); 00113 weights_buffer=NULL; 00114 } 00115 00116 00117 void CWeightedDegreeStringKernel::remove_lhs() 00118 { 00119 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n"); 00120 delete_optimization(); 00121 00122 if (tries!=NULL) 00123 tries->destroy(); 00124 00125 CKernel::remove_lhs(); 00126 } 00127 00128 void CWeightedDegreeStringKernel::create_empty_tries() 00129 { 00130 ASSERT(lhs); 00131 00132 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length(); 00133 00134 if (tries!=NULL) 00135 { 00136 tries->destroy() ; 00137 tries->create(seq_length, max_mismatch==0) ; 00138 } 00139 } 00140 00141 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r) 00142 { 00143 int32_t lhs_changed=(lhs!=l); 00144 int32_t rhs_changed=(rhs!=r); 00145 00146 CStringKernel<char>::init(l,r); 00147 00148 SG_DEBUG("lhs_changed: %i\n", lhs_changed); 00149 SG_DEBUG("rhs_changed: %i\n", rhs_changed); 00150 00151 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00152 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00153 00154 int32_t len=sf_l->get_max_vector_length(); 00155 if (lhs_changed && !sf_l->have_same_length(len)) 00156 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n"); 00157 00158 if (rhs_changed && !sf_r->have_same_length(len)) 00159 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n"); 00160 00161 SG_UNREF(alphabet); 00162 alphabet=sf_l->get_alphabet(); 00163 CAlphabet* ralphabet=sf_r->get_alphabet(); 00164 00165 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00166 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00167 00168 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()); 00169 SG_UNREF(ralphabet); 00170 00171 if (tries!=NULL) { 00172 tries->delete_trees(max_mismatch==0); 00173 SG_UNREF(tries); 00174 } 00175 tries=new CTrie<DNATrie>(degree, max_mismatch==0); 00176 create_empty_tries(); 00177 00178 init_block_weights(); 00179 00180 return init_normalizer(); 00181 } 00182 00183 void CWeightedDegreeStringKernel::cleanup() 00184 { 00185 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n"); 00186 delete_optimization(); 00187 00188 SG_FREE(block_weights); 00189 block_weights=NULL; 00190 00191 if (tries!=NULL) 00192 { 00193 tries->destroy(); 00194 SG_UNREF(tries); 00195 tries=NULL; 00196 } 00197 00198 seq_length=0; 00199 tree_initialized = false; 00200 00201 SG_UNREF(alphabet); 00202 alphabet=NULL; 00203 00204 CKernel::cleanup(); 00205 } 00206 00207 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num) 00208 { 00209 if (tree_num<0) 00210 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n"); 00211 00212 delete_optimization(); 00213 00214 if (tree_num<0) 00215 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ; 00216 00217 for (int32_t i=0; i<count; i++) 00218 { 00219 if (tree_num<0) 00220 { 00221 if ( (i % (count/10+1)) == 0) 00222 SG_PROGRESS(i, 0, count); 00223 00224 if (max_mismatch==0) 00225 add_example_to_tree(IDX[i], alphas[i]) ; 00226 else 00227 add_example_to_tree_mismatch(IDX[i], alphas[i]) ; 00228 00229 //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ; 00230 } 00231 else 00232 { 00233 if (max_mismatch==0) 00234 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ; 00235 else 00236 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ; 00237 } 00238 } 00239 00240 if (tree_num<0) 00241 SG_DONE(); 00242 00243 //tries.compact_nodes(NO_CHILD, 0, weights) ; 00244 00245 set_is_initialized(true) ; 00246 return true ; 00247 } 00248 00249 bool CWeightedDegreeStringKernel::delete_optimization() 00250 { 00251 if (get_is_initialized()) 00252 { 00253 if (tries!=NULL) 00254 tries->delete_trees(max_mismatch==0); 00255 set_is_initialized(false); 00256 return true; 00257 } 00258 00259 return false; 00260 } 00261 00262 00263 float64_t CWeightedDegreeStringKernel::compute_with_mismatch( 00264 char* avec, int32_t alen, char* bvec, int32_t blen) 00265 { 00266 float64_t sum = 0.0; 00267 00268 for (int32_t i=0; i<alen; i++) 00269 { 00270 float64_t sumi = 0.0; 00271 int32_t mismatches=0; 00272 00273 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00274 { 00275 if (avec[i+j]!=bvec[i+j]) 00276 { 00277 mismatches++ ; 00278 if (mismatches>max_mismatch) 00279 break ; 00280 } ; 00281 sumi += weights[j+degree*mismatches]; 00282 } 00283 if (position_weights!=NULL) 00284 sum+=position_weights[i]*sumi ; 00285 else 00286 sum+=sumi ; 00287 } 00288 return sum ; 00289 } 00290 00291 float64_t CWeightedDegreeStringKernel::compute_using_block( 00292 char* avec, int32_t alen, char* bvec, int32_t blen) 00293 { 00294 ASSERT(alen==blen); 00295 00296 float64_t sum=0; 00297 int32_t match_len=-1; 00298 00299 for (int32_t i=0; i<alen; i++) 00300 { 00301 if (avec[i]==bvec[i]) 00302 match_len++; 00303 else 00304 { 00305 if (match_len>=0) 00306 sum+=block_weights[match_len]; 00307 match_len=-1; 00308 } 00309 } 00310 00311 if (match_len>=0) 00312 sum+=block_weights[match_len]; 00313 00314 return sum; 00315 } 00316 00317 float64_t CWeightedDegreeStringKernel::compute_without_mismatch( 00318 char* avec, int32_t alen, char* bvec, int32_t blen) 00319 { 00320 float64_t sum = 0.0; 00321 00322 for (int32_t i=0; i<alen; i++) 00323 { 00324 float64_t sumi = 0.0; 00325 00326 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00327 { 00328 if (avec[i+j]!=bvec[i+j]) 00329 break ; 00330 sumi += weights[j]; 00331 } 00332 if (position_weights!=NULL) 00333 sum+=position_weights[i]*sumi ; 00334 else 00335 sum+=sumi ; 00336 } 00337 return sum ; 00338 } 00339 00340 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix( 00341 char* avec, int32_t alen, char* bvec, int32_t blen) 00342 { 00343 float64_t sum = 0.0; 00344 00345 for (int32_t i=0; i<alen; i++) 00346 { 00347 float64_t sumi=0.0; 00348 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00349 { 00350 if (avec[i+j]!=bvec[i+j]) 00351 break; 00352 sumi += weights[i*degree+j]; 00353 } 00354 if (position_weights!=NULL) 00355 sum += position_weights[i]*sumi ; 00356 else 00357 sum += sumi ; 00358 } 00359 00360 return sum ; 00361 } 00362 00363 00364 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b) 00365 { 00366 int32_t alen, blen; 00367 bool free_avec, free_bvec; 00368 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00369 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00370 float64_t result=0; 00371 00372 if (max_mismatch==0 && length==0 && block_computation) 00373 result=compute_using_block(avec, alen, bvec, blen); 00374 else 00375 { 00376 if (max_mismatch>0) 00377 result=compute_with_mismatch(avec, alen, bvec, blen); 00378 else if (length==0) 00379 result=compute_without_mismatch(avec, alen, bvec, blen); 00380 else 00381 result=compute_without_mismatch_matrix(avec, alen, bvec, blen); 00382 } 00383 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00384 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00385 00386 return result; 00387 } 00388 00389 00390 void CWeightedDegreeStringKernel::add_example_to_tree( 00391 int32_t idx, float64_t alpha) 00392 { 00393 ASSERT(alphabet); 00394 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00395 00396 int32_t len=0; 00397 bool free_vec; 00398 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00399 ASSERT(max_mismatch==0); 00400 int32_t *vec=SG_MALLOC(int32_t, len); 00401 00402 for (int32_t i=0; i<len; i++) 00403 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00404 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00405 00406 if (length == 0 || max_mismatch > 0) 00407 { 00408 for (int32_t i=0; i<len; i++) 00409 { 00410 float64_t alpha_pw=alpha; 00411 /*if (position_weights!=NULL) 00412 alpha_pw *= position_weights[i] ;*/ 00413 if (alpha_pw==0.0) 00414 continue; 00415 ASSERT(tries); 00416 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0)); 00417 } 00418 } 00419 else 00420 { 00421 for (int32_t i=0; i<len; i++) 00422 { 00423 float64_t alpha_pw=alpha; 00424 /*if (position_weights!=NULL) 00425 alpha_pw = alpha*position_weights[i] ;*/ 00426 if (alpha_pw==0.0) 00427 continue ; 00428 ASSERT(tries); 00429 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0)); 00430 } 00431 } 00432 SG_FREE(vec); 00433 tree_initialized=true ; 00434 } 00435 00436 void CWeightedDegreeStringKernel::add_example_to_single_tree( 00437 int32_t idx, float64_t alpha, int32_t tree_num) 00438 { 00439 ASSERT(alphabet); 00440 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00441 00442 int32_t len; 00443 bool free_vec; 00444 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00445 ASSERT(max_mismatch==0); 00446 int32_t *vec = SG_MALLOC(int32_t, len); 00447 00448 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++) 00449 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00450 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00451 00452 00453 ASSERT(tries); 00454 if (alpha!=0.0) 00455 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0)); 00456 00457 SG_FREE(vec); 00458 tree_initialized=true ; 00459 } 00460 00461 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha) 00462 { 00463 ASSERT(tries); 00464 ASSERT(alphabet); 00465 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00466 00467 int32_t len ; 00468 bool free_vec; 00469 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00470 00471 int32_t *vec = SG_MALLOC(int32_t, len); 00472 00473 for (int32_t i=0; i<len; i++) 00474 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00475 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00476 00477 for (int32_t i=0; i<len; i++) 00478 { 00479 if (alpha!=0.0) 00480 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights); 00481 } 00482 00483 SG_FREE(vec); 00484 tree_initialized=true ; 00485 } 00486 00487 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch( 00488 int32_t idx, float64_t alpha, int32_t tree_num) 00489 { 00490 ASSERT(tries); 00491 ASSERT(alphabet); 00492 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00493 00494 int32_t len=0; 00495 bool free_vec; 00496 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00497 int32_t *vec=SG_MALLOC(int32_t, len); 00498 00499 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++) 00500 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00501 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00502 00503 if (alpha!=0.0) 00504 { 00505 tries->add_example_to_tree_mismatch_recursion( 00506 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num, 00507 0, 0, max_mismatch, weights); 00508 } 00509 00510 SG_FREE(vec); 00511 tree_initialized=true; 00512 } 00513 00514 00515 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx) 00516 { 00517 ASSERT(alphabet); 00518 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00519 00520 int32_t len=0; 00521 bool free_vec; 00522 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec); 00523 ASSERT(char_vec && len>0); 00524 int32_t *vec=SG_MALLOC(int32_t, len); 00525 00526 for (int32_t i=0; i<len; i++) 00527 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00528 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00529 00530 float64_t sum=0; 00531 ASSERT(tries); 00532 for (int32_t i=0; i<len; i++) 00533 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)); 00534 00535 SG_FREE(vec); 00536 return normalizer->normalize_rhs(sum, idx); 00537 } 00538 00539 void CWeightedDegreeStringKernel::compute_by_tree( 00540 int32_t idx, float64_t* LevelContrib) 00541 { 00542 ASSERT(alphabet); 00543 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00544 00545 int32_t len ; 00546 bool free_vec; 00547 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec); 00548 00549 int32_t *vec = SG_MALLOC(int32_t, len); 00550 00551 for (int32_t i=0; i<len; i++) 00552 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00553 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00554 00555 ASSERT(tries); 00556 for (int32_t i=0; i<len; i++) 00557 { 00558 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib, 00559 normalizer->normalize_rhs(1.0, idx), 00560 mkl_stepsize, weights, (length!=0)); 00561 } 00562 00563 SG_FREE(vec); 00564 } 00565 00566 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len) 00567 { 00568 ASSERT(tries); 00569 return tries->compute_abs_weights(len); 00570 } 00571 00572 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type) 00573 { 00574 ASSERT(degree>0); 00575 ASSERT(p_type==E_WD); 00576 00577 SG_FREE(weights); 00578 weights=SG_MALLOC(float64_t, degree); 00579 weights_degree=degree; 00580 weights_length=1; 00581 00582 if (weights) 00583 { 00584 int32_t i; 00585 float64_t sum=0; 00586 for (i=0; i<degree; i++) 00587 { 00588 weights[i]=degree-i; 00589 sum+=weights[i]; 00590 } 00591 for (i=0; i<degree; i++) 00592 weights[i]/=sum; 00593 00594 for (i=0; i<degree; i++) 00595 { 00596 for (int32_t j=1; j<=max_mismatch; j++) 00597 { 00598 if (j<i+1) 00599 { 00600 int32_t nk=CMath::nchoosek(i+1, j); 00601 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j)); 00602 } 00603 else 00604 weights[i+j*degree]= 0; 00605 } 00606 } 00607 00608 if (which_degree>=0) 00609 { 00610 ASSERT(which_degree<degree); 00611 for (i=0; i<degree; i++) 00612 { 00613 if (i!=which_degree) 00614 weights[i]=0; 00615 else 00616 weights[i]=1; 00617 } 00618 } 00619 return true; 00620 } 00621 else 00622 return false; 00623 } 00624 00625 bool CWeightedDegreeStringKernel::set_weights(SGMatrix<float64_t> new_weights) 00626 { 00627 float64_t* ws=new_weights.matrix; 00628 int32_t d=new_weights.num_rows; 00629 int32_t len=new_weights.num_cols; 00630 00631 if (d!=degree || len<0) 00632 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree); 00633 00634 degree=d; 00635 length=len; 00636 00637 if (len <= 0) 00638 len=1; 00639 00640 weights_degree=degree; 00641 weights_length=len+max_mismatch; 00642 00643 00644 SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length); 00645 int32_t num_weights=weights_degree*weights_length; 00646 SG_FREE(weights); 00647 weights=SG_MALLOC(float64_t, num_weights); 00648 00649 for (int32_t i=0; i<degree*len; i++) 00650 weights[i]=ws[i]; 00651 00652 return true; 00653 } 00654 00655 bool CWeightedDegreeStringKernel::set_position_weights( 00656 float64_t* pws, int32_t len) 00657 { 00658 if (len==0) 00659 { 00660 SG_FREE(position_weights); 00661 position_weights=NULL; 00662 ASSERT(tries); 00663 tries->set_position_weights(position_weights); 00664 } 00665 00666 if (seq_length!=len) 00667 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len); 00668 00669 SG_FREE(position_weights); 00670 position_weights=SG_MALLOC(float64_t, len); 00671 position_weights_len=len; 00672 ASSERT(tries); 00673 tries->set_position_weights(position_weights); 00674 00675 if (position_weights) 00676 { 00677 for (int32_t i=0; i<len; i++) 00678 position_weights[i]=pws[i]; 00679 return true; 00680 } 00681 else 00682 return false; 00683 } 00684 00685 bool CWeightedDegreeStringKernel::init_block_weights_from_wd() 00686 { 00687 SG_FREE(block_weights); 00688 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree)); 00689 00690 int32_t k; 00691 float64_t d=degree; // use float to evade rounding errors below 00692 00693 for (k=0; k<degree; k++) 00694 block_weights[k]= 00695 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1)); 00696 for (k=degree; k<seq_length; k++) 00697 block_weights[k]=(-d+3*k+4)/3; 00698 00699 return true; 00700 } 00701 00702 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external() 00703 { 00704 ASSERT(weights); 00705 SG_FREE(block_weights); 00706 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree)); 00707 00708 int32_t i=0; 00709 block_weights[0]=weights[0]; 00710 for (i=1; i<CMath::max(seq_length,degree); i++) 00711 block_weights[i]=0; 00712 00713 for (i=1; i<CMath::max(seq_length,degree); i++) 00714 { 00715 block_weights[i]=block_weights[i-1]; 00716 00717 float64_t contrib=0; 00718 for (int32_t j=0; j<CMath::min(degree,i+1); j++) 00719 contrib+=weights[j]; 00720 00721 block_weights[i]+=contrib; 00722 } 00723 return true; 00724 } 00725 00726 bool CWeightedDegreeStringKernel::init_block_weights_const() 00727 { 00728 SG_FREE(block_weights); 00729 block_weights=SG_MALLOC(float64_t, seq_length); 00730 00731 for (int32_t i=1; i<seq_length+1 ; i++) 00732 block_weights[i-1]=1.0/seq_length; 00733 return true; 00734 } 00735 00736 bool CWeightedDegreeStringKernel::init_block_weights_linear() 00737 { 00738 SG_FREE(block_weights); 00739 block_weights=SG_MALLOC(float64_t, seq_length); 00740 00741 for (int32_t i=1; i<seq_length+1 ; i++) 00742 block_weights[i-1]=degree*i; 00743 00744 return true; 00745 } 00746 00747 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly() 00748 { 00749 SG_FREE(block_weights); 00750 block_weights=SG_MALLOC(float64_t, seq_length); 00751 00752 for (int32_t i=1; i<degree+1 ; i++) 00753 block_weights[i-1]=((float64_t) i)*i; 00754 00755 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00756 block_weights[i-1]=i; 00757 00758 return true; 00759 } 00760 00761 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly() 00762 { 00763 SG_FREE(block_weights); 00764 block_weights=SG_MALLOC(float64_t, seq_length); 00765 00766 for (int32_t i=1; i<degree+1 ; i++) 00767 block_weights[i-1]=((float64_t) i)*i*i; 00768 00769 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00770 block_weights[i-1]=i; 00771 return true; 00772 } 00773 00774 bool CWeightedDegreeStringKernel::init_block_weights_exp() 00775 { 00776 SG_FREE(block_weights); 00777 block_weights=SG_MALLOC(float64_t, seq_length); 00778 00779 for (int32_t i=1; i<degree+1 ; i++) 00780 block_weights[i-1]=exp(((float64_t) i/10.0)); 00781 00782 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00783 block_weights[i-1]=i; 00784 00785 return true; 00786 } 00787 00788 bool CWeightedDegreeStringKernel::init_block_weights_log() 00789 { 00790 SG_FREE(block_weights); 00791 block_weights=SG_MALLOC(float64_t, seq_length); 00792 00793 for (int32_t i=1; i<degree+1 ; i++) 00794 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2); 00795 00796 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00797 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2); 00798 00799 return true; 00800 } 00801 00802 bool CWeightedDegreeStringKernel::init_block_weights() 00803 { 00804 switch (type) 00805 { 00806 case E_WD: 00807 return init_block_weights_from_wd(); 00808 case E_EXTERNAL: 00809 return init_block_weights_from_wd_external(); 00810 case E_BLOCK_CONST: 00811 return init_block_weights_const(); 00812 case E_BLOCK_LINEAR: 00813 return init_block_weights_linear(); 00814 case E_BLOCK_SQPOLY: 00815 return init_block_weights_sqpoly(); 00816 case E_BLOCK_CUBICPOLY: 00817 return init_block_weights_cubicpoly(); 00818 case E_BLOCK_EXP: 00819 return init_block_weights_exp(); 00820 case E_BLOCK_LOG: 00821 return init_block_weights_log(); 00822 }; 00823 return false; 00824 } 00825 00826 00827 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p) 00828 { 00829 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p; 00830 int32_t j=params->j; 00831 CWeightedDegreeStringKernel* wd=params->kernel; 00832 CTrie<DNATrie>* tries=params->tries; 00833 float64_t* weights=params->weights; 00834 int32_t length=params->length; 00835 int32_t* vec=params->vec; 00836 float64_t* result=params->result; 00837 float64_t factor=params->factor; 00838 int32_t* vec_idx=params->vec_idx; 00839 00840 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs()); 00841 CAlphabet* alpha=wd->alphabet; 00842 00843 for (int32_t i=params->start; i<params->end; i++) 00844 { 00845 int32_t len=0; 00846 bool free_vec; 00847 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec); 00848 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++) 00849 vec[k]=alpha->remap_to_bin(char_vec[k]); 00850 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec); 00851 00852 ASSERT(tries); 00853 00854 result[i]+=factor* 00855 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]); 00856 } 00857 00858 SG_UNREF(rhs_feat); 00859 00860 return NULL; 00861 } 00862 00863 void CWeightedDegreeStringKernel::compute_batch( 00864 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec, 00865 int32_t* IDX, float64_t* alphas, float64_t factor) 00866 { 00867 ASSERT(tries); 00868 ASSERT(alphabet); 00869 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00870 ASSERT(rhs); 00871 ASSERT(num_vec<=rhs->get_num_vectors()); 00872 ASSERT(num_vec>0); 00873 ASSERT(vec_idx); 00874 ASSERT(result); 00875 create_empty_tries(); 00876 00877 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length(); 00878 ASSERT(num_feat>0); 00879 int32_t num_threads=parallel->get_num_threads(); 00880 ASSERT(num_threads>0); 00881 int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat); 00882 00883 if (num_threads < 2) 00884 { 00885 #ifdef CYGWIN 00886 for (int32_t j=0; j<num_feat; j++) 00887 #else 00888 CSignal::clear_cancel(); 00889 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++) 00890 #endif 00891 { 00892 init_optimization(num_suppvec, IDX, alphas, j); 00893 S_THREAD_PARAM params; 00894 params.vec=vec; 00895 params.result=result; 00896 params.weights=weights; 00897 params.kernel=this; 00898 params.tries=tries; 00899 params.factor=factor; 00900 params.j=j; 00901 params.start=0; 00902 params.end=num_vec; 00903 params.length=length; 00904 params.vec_idx=vec_idx; 00905 compute_batch_helper((void*) ¶ms); 00906 00907 SG_PROGRESS(j,0,num_feat); 00908 } 00909 } 00910 #ifndef WIN32 00911 else 00912 { 00913 CSignal::clear_cancel(); 00914 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++) 00915 { 00916 init_optimization(num_suppvec, IDX, alphas, j); 00917 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); 00918 S_THREAD_PARAM* params = SG_MALLOC(S_THREAD_PARAM, num_threads); 00919 int32_t step= num_vec/num_threads; 00920 int32_t t; 00921 00922 for (t=0; t<num_threads-1; t++) 00923 { 00924 params[t].vec=&vec[num_feat*t]; 00925 params[t].result=result; 00926 params[t].weights=weights; 00927 params[t].kernel=this; 00928 params[t].tries=tries; 00929 params[t].factor=factor; 00930 params[t].j=j; 00931 params[t].start = t*step; 00932 params[t].end = (t+1)*step; 00933 params[t].length=length; 00934 params[t].vec_idx=vec_idx; 00935 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]); 00936 } 00937 params[t].vec=&vec[num_feat*t]; 00938 params[t].result=result; 00939 params[t].weights=weights; 00940 params[t].kernel=this; 00941 params[t].tries=tries; 00942 params[t].factor=factor; 00943 params[t].j=j; 00944 params[t].start=t*step; 00945 params[t].end=num_vec; 00946 params[t].length=length; 00947 params[t].vec_idx=vec_idx; 00948 compute_batch_helper((void*) ¶ms[t]); 00949 00950 for (t=0; t<num_threads-1; t++) 00951 pthread_join(threads[t], NULL); 00952 SG_PROGRESS(j,0,num_feat); 00953 00954 SG_FREE(params); 00955 SG_FREE(threads); 00956 } 00957 } 00958 #endif 00959 00960 SG_FREE(vec); 00961 00962 //really also free memory as this can be huge on testing especially when 00963 //using the combined kernel 00964 create_empty_tries(); 00965 } 00966 00967 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max) 00968 { 00969 if (type==E_EXTERNAL && max!=0) 00970 return false; 00971 00972 max_mismatch=max; 00973 00974 if (lhs!=NULL && rhs!=NULL) 00975 return init(lhs, rhs); 00976 else 00977 return true; 00978 } 00979 00980 void CWeightedDegreeStringKernel::init() 00981 { 00982 weights=NULL; 00983 weights_degree=0; 00984 weights_length=0; 00985 00986 position_weights=NULL; 00987 position_weights_len=0; 00988 00989 weights_buffer=NULL; 00990 mkl_stepsize=1; 00991 degree=1; 00992 length=0; 00993 00994 max_mismatch=0; 00995 seq_length=0; 00996 00997 block_weights=NULL; 00998 block_computation=true; 00999 type=E_WD; 01000 which_degree=-1; 01001 tries=NULL; 01002 01003 tree_initialized=false; 01004 alphabet=NULL; 01005 01006 lhs=NULL; 01007 rhs=NULL; 01008 01009 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION; 01010 01011 set_normalizer(new CFirstElementKernelNormalizer()); 01012 01013 m_parameters->add_matrix(&weights, &weights_degree, &weights_length, 01014 "weights", "WD Kernel weights."); 01015 m_parameters->add_vector(&position_weights, &position_weights_len, 01016 "position_weights", 01017 "Weights per position."); 01018 m_parameters->add(&mkl_stepsize, "mkl_stepsize", "MKL step size."); 01019 m_parameters->add(°ree, "degree", "Order of WD kernel."); 01020 m_parameters->add(&max_mismatch, "max_mismatch", 01021 "Number of allowed mismatches."); 01022 m_parameters->add(&block_computation, "block_computation", 01023 "If block computation shall be used."); 01024 m_parameters->add((machine_int_t*) &type, "type", 01025 "WeightedDegree kernel type."); 01026 m_parameters->add(&which_degree, "which_degree", 01027 "Unqueal -1 if just a single degree is selected."); 01028 m_parameters->add((CSGObject**) &alphabet, "alphabet", 01029 "Alphabet of Features."); 01030 }