Drizzled Public API Documentation

charset.cc
00001 /* Copyright (C) 2000 MySQL AB
00002 
00003    This program is free software; you can redistribute it and/or modify
00004    it under the terms of the GNU General Public License as published by
00005    the Free Software Foundation; version 2 of the License.
00006 
00007    This program is distributed in the hope that it will be useful,
00008    but WITHOUT ANY WARRANTY; without even the implied warranty of
00009    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00010    GNU General Public License for more details.
00011 
00012    You should have received a copy of the GNU General Public License
00013    along with this program; if not, write to the Free Software
00014    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
00015 
00016 #include <config.h>
00017 
00018 #include <drizzled/charset.h>
00019 #include <drizzled/error.h>
00020 #include <drizzled/charset_info.h>
00021 #include <drizzled/internal/m_string.h>
00022 #include <drizzled/configmake.h>
00023 #include <vector>
00024 
00025 #include <drizzled/visibility.h>
00026 
00027 using namespace std;
00028 
00029 namespace drizzled
00030 {
00031 
00032 /*
00033   Buffers handed out by cs_alloc() are recorded here; free_charsets() delete[]s every one of them.
00034 */
00035 static vector<unsigned char*> memory_vector;
00036 
00037 /*
00038   The code below implements this functionality:
00039 
00040     - Initializing charset related structures
00041     - Loading dynamic charsets
00042     - Searching for a proper CHARSET_INFO
00043       using charset name, collation name or collation ID
00044     - Setting server default character set
00045 */
00046 
00047 bool my_charset_same(const CHARSET_INFO *cs1, const CHARSET_INFO *cs2)
00048 {
00049   return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
00050 }
00051 
00052 
00053 static uint
00054 get_collation_number_internal(const char *name)
00055 {
00056   for (CHARSET_INFO **cs= all_charsets;
00057        cs < all_charsets+array_elements(all_charsets)-1;
00058        cs++)
00059   {
00060     if ( cs[0] && cs[0]->name && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
00061     {
00062       return cs[0]->number;
00063     }
00064   }
00065   return 0;
00066 }
00067 
00068 static unsigned char *cs_alloc(size_t size)
00069 {
00070   memory_vector.push_back(new unsigned char[size]);
00071   return memory_vector.back();
00072 }
00073 
00074 static bool init_state_maps(CHARSET_INFO *cs)
00075 {
00076   if (!(cs->state_map= cs_alloc(256)))
00077     return 1;
00078     
00079   if (!(cs->ident_map= cs_alloc(256)))
00080     return 1;
00081 
00082   unsigned char *state_map= cs->state_map;
00083   unsigned char *ident_map= cs->ident_map;
00084 
00085   /* Fill state_map with states to get a faster parser */
00086   for (int i= 0; i < 256; i++)
00087   {
00088     if (my_isalpha(cs,i))
00089       state_map[i]= MY_LEX_IDENT;
00090     else if (my_isdigit(cs,i))
00091       state_map[i]= MY_LEX_NUMBER_IDENT;
00092     else if (my_mbcharlen(cs, i)>1)
00093       state_map[i]= MY_LEX_IDENT;
00094     else if (my_isspace(cs,i))
00095       state_map[i]= MY_LEX_SKIP;
00096     else
00097       state_map[i]= MY_LEX_CHAR;
00098   }
00099   state_map['_']=state_map['$']= MY_LEX_IDENT;
00100   state_map['\'']= MY_LEX_STRING;
00101   state_map['.']= MY_LEX_REAL_OR_POINT;
00102   state_map['>']=state_map['=']=state_map['!']=  MY_LEX_CMP_OP;
00103   state_map['<']=  MY_LEX_LONG_CMP_OP;
00104   state_map['&']=state_map['|']= MY_LEX_BOOL;
00105   state_map['#']= MY_LEX_COMMENT;
00106   state_map[';']= MY_LEX_SEMICOLON;
00107   state_map[':']= MY_LEX_SET_VAR;
00108   state_map[0]= MY_LEX_EOL;
00109   state_map['\\']=  MY_LEX_ESCAPE;
00110   state_map['/']=  MY_LEX_LONG_COMMENT;
00111   state_map['*']=  MY_LEX_END_LONG_COMMENT;
00112   state_map['@']=  MY_LEX_USER_END;
00113   state_map['`']=  MY_LEX_USER_VARIABLE_DELIMITER;
00114   state_map['"']=  MY_LEX_STRING_OR_DELIMITER;
00115 
00116   /*
00117     Create a second map to make it faster to find identifiers
00118   */
00119   for (int i= 0; i < 256; i++)
00120   {
00121     ident_map[i]= state_map[i] == MY_LEX_IDENT || state_map[i] == MY_LEX_NUMBER_IDENT;
00122   }
00123 
00124   /* Special handling of hex and binary strings */
00125   state_map['x']= state_map['X']=  MY_LEX_IDENT_OR_HEX;
00126   state_map['b']= state_map['B']=  MY_LEX_IDENT_OR_BIN;
00127   return 0;
00128 }
00129 
00130 static bool charset_initialized= false; /* set by init_available_charsets() once all_charsets is filled; cleared by free_charsets() */
00131 
00132 DRIZZLED_API CHARSET_INFO *all_charsets[256]; /* collation table; add_compiled_collation() stores each entry at index cs->number */
00133 const DRIZZLED_API CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci; /* get_charset() returns this directly when asked for its number */
00134 
00135 void add_compiled_collation(CHARSET_INFO * cs)
00136 {
00137   all_charsets[cs->number]= cs;
00138   cs->state|= MY_CS_AVAILABLE;
00139 }
00140 
00141 static bool init_available_charsets(myf myflags)
00142 {
00143   bool error= false;
00144   /*
00145     We have to use charset_initialized to not lock on THR_LOCK_charset
00146     inside get_internal_charset...
00147   */
00148   if (charset_initialized == false)
00149   {
00150     CHARSET_INFO **cs;
00151     memset(&all_charsets, 0, sizeof(all_charsets));
00152     init_compiled_charsets(myflags);
00153 
00154     /* Copy compiled charsets */
00155     for (cs=all_charsets;
00156          cs < all_charsets+array_elements(all_charsets)-1 ;
00157          cs++)
00158     {
00159       if (*cs)
00160       {
00161         if (cs[0]->ctype)
00162           if (init_state_maps(*cs))
00163             *cs= NULL;
00164       }
00165     }
00166 
00167     charset_initialized= true;
00168   }
00169   assert(charset_initialized);
00170 
00171   return error;
00172 }
00173 
00174 
00175 void free_charsets()
00176 {
00177   charset_initialized= false;
00178 
00179   while (not memory_vector.empty())
00180   {
00181     delete[] memory_vector.back();
00182     memory_vector.pop_back();
00183   }
00184 }
00185 
00186 
00187 uint32_t get_collation_number(const char *name)
00188 {
00189   init_available_charsets(MYF(0));
00190   return get_collation_number_internal(name);
00191 }
00192 
00193 
00194 uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
00195 {
00196   CHARSET_INFO **cs;
00197   init_available_charsets(MYF(0));
00198 
00199   for (cs= all_charsets;
00200        cs < all_charsets+array_elements(all_charsets)-1 ;
00201        cs++)
00202   {
00203     if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
00204       return cs[0]->number;
00205   }
00206   return 0;
00207 }
00208 
00209 
00210 const char *get_charset_name(uint32_t charset_number)
00211 {
00212   init_available_charsets(MYF(0));
00213 
00214   const CHARSET_INFO *cs= all_charsets[charset_number];
00215   if (cs && (cs->number == charset_number) && cs->name )
00216     return cs->name;
00217 
00218   return "?";   /* this mimics find_type() */
00219 }
00220 
00221 
00222 static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
00223 {
00224   CHARSET_INFO *cs;
00225   /*
00226     To make things thread safe we are not allowing other threads to interfere
00227     while we may changing the cs_info_table
00228   */
00229   if ((cs= all_charsets[cs_number]))
00230   {
00231     if (!(cs->state & MY_CS_COMPILED) && !(cs->state & MY_CS_LOADED))
00232     {
00233       assert(0);
00234     }
00235     cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
00236   }
00237   if (cs && !(cs->state & MY_CS_READY))
00238   {
00239     if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
00240         (cs->coll->init && cs->coll->init(cs, cs_alloc)))
00241       cs= NULL;
00242     else
00243       cs->state|= MY_CS_READY;
00244   }
00245 
00246   return cs;
00247 }
00248 
00249 
00250 const CHARSET_INFO *get_charset(uint32_t cs_number)
00251 {
00252   const CHARSET_INFO *cs;
00253   if (cs_number == default_charset_info->number)
00254     return default_charset_info;
00255 
00256   (void) init_available_charsets(MYF(0)); /* If it isn't initialized */
00257 
00258   if (!cs_number || cs_number >= array_elements(all_charsets)-1)
00259     return NULL;
00260 
00261   cs= get_internal_charset(cs_number);
00262 
00263   return cs;
00264 }
00265 
00266 const CHARSET_INFO *get_charset_by_name(const char *cs_name)
00267 {
00268   uint32_t cs_number;
00269   const CHARSET_INFO *cs;
00270   (void) init_available_charsets(MYF(0)); /* If it isn't initialized */
00271 
00272   cs_number= get_collation_number(cs_name);
00273   cs= cs_number ? get_internal_charset(cs_number) : NULL;
00274 
00275   return cs;
00276 }
00277 
00278 
00279 const CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
00280 {
00281   uint32_t cs_number;
00282   const CHARSET_INFO *cs;
00283 
00284   (void) init_available_charsets(MYF(0)); /* If it isn't initialized */
00285 
00286   cs_number= get_charset_number(cs_name, cs_flags);
00287   cs= cs_number ? get_internal_charset(cs_number) : NULL;
00288 
00289   return(cs);
00290 }
00291 
00292 
00293 /*
00294   Escape apostrophes by doubling them up
00295 
00296   SYNOPSIS
00297     escape_quotes_for_drizzle()
00298     charset_info        Charset of the strings
00299     to                  Buffer for escaped string
00300     to_length           Length of destination buffer, or 0
00301     from                The string to escape
00302     length              The length of the string to escape
00303 
00304   DESCRIPTION
00305     This escapes the contents of a string by doubling up any apostrophes that
00306     it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
00307     effect on the server.
00308 
00309   NOTE
00310     To be consistent with escape_string_for_mysql(), to_length may be 0 to
00311     mean "big enough"
00312 
00313   RETURN VALUES
00314     UINT32_MAX  The escaped string did not fit in the to buffer
00315     >=0         The length of the escaped string
00316 */
00317 
00318 size_t escape_quotes_for_drizzle(const CHARSET_INFO *charset_info,
00319                                  char *to, size_t to_length,
00320                                  const char *from, size_t length)
00321 {
00322   const char *to_start= to;
00323   const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
00324   bool overflow= false;
00325   bool use_mb_flag= use_mb(charset_info);
00326   for (end= from + length; from < end; from++)
00327   {
00328     int tmp_length;
00329     if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
00330     {
00331       if (to + tmp_length > to_end)
00332       {
00333         overflow= true;
00334         break;
00335       }
00336       while (tmp_length--)
00337   *to++= *from++;
00338       from--;
00339       continue;
00340     }
00341     /*
00342       We don't have the same issue here with a non-multi-byte character being
00343       turned into a multi-byte character by the addition of an escaping
00344       character, because we are only escaping the ' character with itself.
00345      */
00346     if (*from == '\'')
00347     {
00348       if (to + 2 > to_end)
00349       {
00350         overflow= true;
00351         break;
00352       }
00353       *to++= '\'';
00354       *to++= '\'';
00355     }
00356     else
00357     {
00358       if (to + 1 > to_end)
00359       {
00360         overflow= true;
00361         break;
00362       }
00363       *to++= *from;
00364     }
00365   }
00366   *to= 0;
00367   return overflow ? UINT32_MAX : to - to_start;
00368 }
00369 
00370 } /* namespace drizzled */