SHOGUN v1.1.0
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2008 Gunnar Raetsch
 * Written (W) 1999-2008,2011 Soeren Sonnenburg
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 * Copyright (C) 2011 Berlin Institute of Technology
 */
#include <shogun/preprocessor/PCA.h>
#ifdef HAVE_LAPACK
#include <shogun/mathematics/lapack.h>
#include <shogun/lib/config.h>
#include <shogun/mathematics/Math.h>
#include <string.h>
#include <stdlib.h>
#include <shogun/lib/common.h>
#include <shogun/preprocessor/SimplePreprocessor.h>
#include <shogun/features/Features.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/io/SGIO.h>

using namespace shogun;

CPCA::CPCA(bool do_whitening_, EPCAMode mode_, float64_t thresh_)
: CDimensionReductionPreprocessor(), num_dim(0), m_initialized(false),
	m_whitening(do_whitening_), m_mode(mode_), thresh(thresh_)
{
	init();
}

void CPCA::init()
{
	m_transformation_matrix = SGMatrix<float64_t>(NULL,0,0,true);
	m_mean_vector = SGVector<float64_t>(NULL,0,true);
	m_eigenvalues_vector = SGVector<float64_t>(NULL,0,true);

	m_parameters->add(&m_transformation_matrix,
		"transformation matrix", "Transformation matrix (Eigenvectors of covariance matrix).");
	m_parameters->add(&m_mean_vector,
		"mean vector", "Mean Vector.");
	m_parameters->add(&m_eigenvalues_vector,
		"eigenvalues vector", "Vector with Eigenvalues.");
	m_parameters->add(&m_initialized,
		"initalized", "True when initialized.");
	m_parameters->add(&m_whitening,
		"whitening", "Whether data shall be whitened.");
	m_parameters->add((machine_int_t*) &m_mode, "mode",
		"PCA Mode.");
	m_parameters->add(&thresh,
		"thresh", "Cutoff threshold.");
}

CPCA::~CPCA()
{
	m_transformation_matrix.destroy_matrix();
	m_mean_vector.destroy_vector();
	m_eigenvalues_vector.destroy_vector();
}

bool CPCA::init(CFeatures* features)
{
	if (!m_initialized)
	{
		// loop variables
		int32_t i,j,k;

		ASSERT(features->get_feature_class()==C_SIMPLE);
		ASSERT(features->get_feature_type()==F_DREAL);

		int32_t num_vectors = ((CSimpleFeatures<float64_t>*)features)->get_num_vectors();
		int32_t num_features = ((CSimpleFeatures<float64_t>*)features)->get_num_features();
		SG_INFO("num_examples: %d num_features: %d\n", num_vectors, num_features);

		m_mean_vector.vlen = num_features;
		m_mean_vector.vector = SG_CALLOC(float64_t, num_features);

		// sum up all feature vectors
		SGMatrix<float64_t> feature_matrix =
			((CSimpleFeatures<float64_t>*)features)->get_feature_matrix();
		for (i=0; i<num_vectors; i++)
		{
			for (j=0; j<num_features; j++)
				m_mean_vector.vector[j] += feature_matrix.matrix[i*num_features+j];
		}

		// divide by the number of vectors to obtain the mean
		for (i=0; i<num_features; i++)
			m_mean_vector.vector[i] /= num_vectors;

		float64_t* cov = SG_CALLOC(float64_t, num_features*num_features);
		float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

		// accumulate the covariance matrix as a sum of rank-1 updates
		// over the mean-subtracted feature vectors
		for (i=0; i<num_vectors; i++)
		{
			for (k=0; k<num_features; k++)
				sub_mean[k] = feature_matrix.matrix[i*num_features+k] - m_mean_vector.vector[k];

			cblas_dger(CblasColMajor,
				num_features, num_features,
				1.0, sub_mean, 1,
				sub_mean, 1,
				cov, num_features);
		}

		SG_FREE(sub_mean);

		for (i=0; i<num_features; i++)
		{
			for (j=0; j<num_features; j++)
				cov[i*num_features+j] /= (num_vectors-1);
		}

		SG_INFO("Computing Eigenvalues ... ");

		// cov is overwritten with the eigenvectors; the eigenvalues are
		// returned in ascending order
		m_eigenvalues_vector.vector = CMath::compute_eigenvectors(cov, num_features, num_features);
		m_eigenvalues_vector.vlen = num_features;
		num_dim = 0;

		if (m_mode == FIXED_NUMBER)
		{
			ASSERT(m_target_dim <= num_features);
			num_dim = m_target_dim;
		}
		if (m_mode == VARIANCE_EXPLAINED)
		{
			float64_t eig_sum = 0;
			for (i=0; i<num_features; i++)
				eig_sum += m_eigenvalues_vector.vector[i];

			float64_t com_sum = 0;
			for (i=num_features-1; i>-1; i--)
			{
				num_dim++;
				com_sum += m_eigenvalues_vector.vector[i];
				if (com_sum/eig_sum >= thresh)
					break;
			}
		}
		if (m_mode == THRESHOLD)
		{
			for (i=num_features-1; i>-1; i--)
			{
				if (m_eigenvalues_vector.vector[i] > thresh)
					num_dim++;
				else
					break;
			}
		}

		SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim);

		m_transformation_matrix = SGMatrix<float64_t>(num_features, num_dim);
		num_old_dim = num_features;

		// copy the eigenvectors belonging to the num_dim largest eigenvalues,
		// optionally scaled by 1/sqrt(eigenvalue) for whitening
		int32_t offs = 0;
		for (i=num_features-num_dim; i<num_features; i++)
		{
			for (k=0; k<num_features; k++)
			{
				if (m_whitening)
					m_transformation_matrix.matrix[offs+k*num_dim] =
						cov[num_features*i+k] / sqrt(m_eigenvalues_vector.vector[i]);
				else
					m_transformation_matrix.matrix[offs+k*num_dim] =
						cov[num_features*i+k];
			}
			offs++;
		}

		SG_FREE(cov);
		m_initialized = true;
		return true;
	}

	return false;
}

void CPCA::cleanup()
{
	m_transformation_matrix.destroy_matrix();
	m_mean_vector.destroy_vector();
	m_eigenvalues_vector.destroy_vector();
}

SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
{
	ASSERT(m_initialized);
	SGMatrix<float64_t> m = ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix();
	int32_t num_vectors = m.num_cols;
	int32_t num_features = m.num_rows;
	SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features);

	if (m.matrix)
	{
		SG_INFO("Preprocessing feature matrix\n");
		float64_t* res = SG_MALLOC(float64_t, num_dim);
		float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

		for (int32_t vec=0; vec<num_vectors; vec++)
		{
			int32_t i;

			// center the current vector
			for (i=0; i<num_features; i++)
				sub_mean[i] = m.matrix[num_features*vec+i] - m_mean_vector.vector[i];

			// project the centered vector onto the principal components
			cblas_dgemv(CblasColMajor, CblasNoTrans,
				num_dim, num_features,
				1.0, m_transformation_matrix.matrix, num_dim,
				sub_mean, 1,
				0.0, res, 1);

			// write the projected vector back into the feature matrix
			float64_t* m_transformed = &m.matrix[num_dim*vec];
			for (i=0; i<num_dim; i++)
				m_transformed[i] = res[i];
		}
		SG_FREE(res);
		SG_FREE(sub_mean);

		((CSimpleFeatures<float64_t>*) features)->set_num_features(num_dim);
		((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
		SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
	}

	return m;
}

SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector)
{
	float64_t* result = SG_MALLOC(float64_t, num_dim);
	float64_t* sub_mean = SG_MALLOC(float64_t, vector.vlen);

	for (int32_t i=0; i<vector.vlen; i++)
		sub_mean[i] = vector.vector[i] - m_mean_vector.vector[i];

	cblas_dgemv(CblasColMajor, CblasNoTrans,
		num_dim, vector.vlen,
		1.0, m_transformation_matrix.matrix, m_transformation_matrix.num_cols,
		sub_mean, 1,
		0.0, result, 1);

	SG_FREE(sub_mean);
	return SGVector<float64_t>(result, num_dim);
}

SGMatrix<float64_t> CPCA::get_transformation_matrix()
{
	return m_transformation_matrix;
}

SGVector<float64_t> CPCA::get_eigenvalues()
{
	return m_eigenvalues_vector;
}

SGVector<float64_t> CPCA::get_mean()
{
	return m_mean_vector;
}

#endif /* HAVE_LAPACK */
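
The listing above only shows the preprocessor itself. The following is a minimal usage sketch, not part of PCA.cpp, based on the API visible in the listing (the CPCA constructor, init(), apply_to_feature_matrix(), get_eigenvalues()). It assumes the Shogun 1.1 conventions around it: CSimpleFeatures<float64_t> constructible from an SGMatrix<float64_t>, init_shogun()/exit_shogun() from <shogun/base/init.h>, and SG_UNREF for releasing objects; treat those as assumptions rather than guaranteed signatures.

// Hedged usage sketch: reduce a small column-major data set to the number of
// components that explains at least 95% of the variance.
#include <shogun/base/init.h>                 // assumed: init_shogun()/exit_shogun()
#include <shogun/features/SimpleFeatures.h>
#include <shogun/preprocessor/PCA.h>

using namespace shogun;

int main()
{
	init_shogun();

	// 3 features x 4 vectors, one vector per column (column-major storage)
	SGMatrix<float64_t> data(3, 4);
	for (int32_t i=0; i<3*4; i++)
		data.matrix[i] = i % 5;

	// assumed: CSimpleFeatures can wrap an existing SGMatrix
	CSimpleFeatures<float64_t>* features = new CSimpleFeatures<float64_t>(data);

	// no whitening; keep components until >= 95% of the variance is explained
	CPCA* pca = new CPCA(false, VARIANCE_EXPLAINED, 0.95);
	pca->init(features);                    // fits mean, eigenvectors, num_dim
	pca->apply_to_feature_matrix(features); // projects the data in place

	SGVector<float64_t> eigenvalues = pca->get_eigenvalues();

	SG_UNREF(pca);
	SG_UNREF(features);
	exit_shogun();
	return 0;
}

For a fixed output dimensionality one would construct the preprocessor with FIXED_NUMBER instead and set the target dimension on the base class before calling init(); the THRESHOLD mode keeps every component whose eigenvalue exceeds thresh, as seen in CPCA::init() above.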