00001 /* Copyright (C) 2001 MySQL AB 00002 00003 This program is free software; you can redistribute it and/or modify 00004 it under the terms of the GNU General Public License as published by 00005 the Free Software Foundation; version 2 of the License. 00006 00007 This program is distributed in the hope that it will be useful, 00008 but WITHOUT ANY WARRANTY; without even the implied warranty of 00009 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00010 GNU General Public License for more details. 00011 00012 You should have received a copy of the GNU General Public License 00013 along with this program; if not, write to the Free Software 00014 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ 00015 00016 /* 00017 Function to handle quick removal of duplicates 00018 This code is used when doing multi-table deletes to find the rows in 00019 reference tables that needs to be deleted. 00020 00021 The basic idea is as follows: 00022 00023 Store first all strings in a binary tree, ignoring duplicates. 00024 00025 The unique entries will be returned in sort order, to ensure that we do the 00026 deletes in disk order. 00027 */ 00028 00029 #include <config.h> 00030 00031 #include <math.h> 00032 00033 #include <queue> 00034 00035 #include <drizzled/sql_sort.h> 00036 #include <drizzled/session.h> 00037 #include <drizzled/sql_list.h> 00038 #include <drizzled/internal/iocache.h> 00039 #include <drizzled/unique.h> 00040 00041 #if defined(CMATH_NAMESPACE) 00042 using namespace CMATH_NAMESPACE; 00043 #endif 00044 00045 using namespace std; 00046 00047 namespace drizzled 00048 { 00049 00050 int unique_write_to_ptrs(unsigned char* key, 00051 uint32_t, Unique *unique) 00052 { 00053 memcpy(unique->record_pointers, key, unique->size); 00054 unique->record_pointers+=unique->size; 00055 return 0; 00056 } 00057 00058 Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg, 00059 uint32_t size_arg, size_t max_in_memory_size_arg) 00060 : max_in_memory_size(max_in_memory_size_arg), 00061 size(size_arg), 00062 elements(0) 00063 { 00064 // Second element is max size for memory use in unique sort 00065 init_tree(&tree, 0, 0, size, comp_func, false, 00066 NULL, comp_func_fixed_arg); 00067 } 00068 00069 00070 /* 00071 Calculate log2(n!) 00072 00073 NOTES 00074 Stirling's approximate formula is used: 00075 00076 n! ~= sqrt(2*M_PI*n) * (n/M_E)^n 00077 00078 Derivation of formula used for calculations is as follows: 00079 00080 log2(n!) = log(n!)/log(2) = log(sqrt(2*M_PI*n)*(n/M_E)^n) / log(2) = 00081 00082 = (log(2*M_PI*n)/2 + n*log(n/M_E)) / log(2). 00083 */ 00084 00085 inline double log2_n_fact(double x) 00086 { 00087 return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2; 00088 } 00089 00090 00091 /* 00092 Calculate cost of using Unique for processing nkeys elements of size 00093 key_size using max_in_memory_size memory. 00094 00095 SYNOPSIS 00096 Unique::get_use_cost() 00097 buffer space for temporary data, use Unique::get_cost_calc_buff_size 00098 to get # bytes needed. 00099 nkeys #of elements in Unique 00100 key_size size of each elements in bytes 00101 max_in_memory_size amount of memory Unique will be allowed to use 00102 00103 RETURN 00104 Cost in disk seeks. 00105 00106 NOTES 00107 cost(using_unqiue) = 00108 cost(create_trees) + (see #1) 00109 cost(merge) + (see #2) 00110 cost(read_result) (see #3) 00111 00112 1. Cost of trees creation 00113 For each Unique::put operation there will be 2*log2(n+1) elements 00114 comparisons, where n runs from 1 tree_size (we assume that all added 00115 elements are different). Together this gives: 00116 00117 n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!) 00118 00119 then cost(tree_creation) = n_compares*ROWID_COMPARE_COST; 00120 00121 Total cost of creating trees: 00122 (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost. 00123 00124 Approximate value of log2(N!) is calculated by log2_n_fact function. 00125 00126 00127 (The Next two are historical, we do all unique operations in memory or fail) 00128 00129 2. Cost of merging. 00130 If only one tree is created by Unique no merging will be necessary. 00131 Otherwise, we model execution of merge_many_buff function and count 00132 #of merges. (The reason behind this is that number of buffers is small, 00133 while size of buffers is big and we don't want to loose precision with 00134 O(x)-style formula) 00135 00136 3. If only one tree is created by Unique no disk io will happen. 00137 Otherwise, ceil(key_len*n_keys) disk seeks are necessary. We assume 00138 these will be random seeks. 00139 */ 00140 00141 double Unique::get_use_cost(uint32_t *, uint32_t nkeys, uint32_t key_size, 00142 size_t max_in_memory_size_arg) 00143 { 00144 ulong max_elements_in_tree; 00145 ulong last_tree_elems; 00146 double result; 00147 00148 max_elements_in_tree= ((ulong) max_in_memory_size_arg / 00149 ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size)); 00150 00151 last_tree_elems= nkeys % max_elements_in_tree; 00152 00153 /* Calculate cost of creating trees */ 00154 result= 2*log2_n_fact(last_tree_elems + 1.0); 00155 result /= TIME_FOR_COMPARE_ROWID; 00156 00157 return result; 00158 } 00159 00160 Unique::~Unique() 00161 { 00162 delete_tree(&tree); 00163 } 00164 00165 00166 /* 00167 Clear the tree. 00168 You must call reset() if you want to reuse Unique after walk(). 00169 */ 00170 00171 void 00172 Unique::reset() 00173 { 00174 reset_tree(&tree); 00175 assert(elements == 0); 00176 } 00177 00178 00179 /* 00180 DESCRIPTION 00181 Walks consecutively through all unique elements: 00182 if all elements are in memory, then it simply invokes 'tree_walk', else 00183 all flushed trees are loaded to memory piece-by-piece, pieces are 00184 sorted, and action is called for each unique value. 00185 Note: so as merging resets file_ptrs state, this method can change 00186 internal Unique state to undefined: if you want to reuse Unique after 00187 walk() you must call reset() first! 00188 SYNOPSIS 00189 Unique:walk() 00190 All params are 'IN': 00191 action function-visitor, typed in include/tree.h 00192 function is called for each unique element 00193 arg argument for visitor, which is passed to it on each call 00194 RETURN VALUE 00195 0 OK 00196 <> 0 error 00197 */ 00198 00199 bool Unique::walk(tree_walk_action action, void *walk_action_arg) 00200 { 00201 return tree_walk(&tree, action, walk_action_arg, left_root_right); 00202 } 00203 00204 /* 00205 Modify the Table element so that when one calls init_records() 00206 the rows will be read in priority order. 00207 */ 00208 00209 bool Unique::get(Table *table) 00210 { 00211 table->sort.found_records= elements+tree.elements_in_tree; 00212 00213 if ((record_pointers=table->sort.record_pointers= (unsigned char*) 00214 malloc(size * tree.elements_in_tree))) 00215 { 00216 (void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs, 00217 this, left_root_right); 00218 return 0; 00219 } 00220 /* Not enough memory */ 00221 return 1; 00222 } 00223 00224 } /* namespace drizzled */