1
/* Copyright (C) 2000 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
#include "mysys_priv.h"
17
#include "mysys_err.h"
25
The code below implements this functionality:
27
- Initializing charset related structures
28
- Loading dynamic charsets
29
- Searching for a proper CHARSET_INFO
30
using charset name, collation name or collation ID
31
- Setting server default character set
34
/*
  Tell whether two CHARSET_INFO entries belong to the same character set.

  Returns non-zero when cs1 and cs2 are the very same object, or when
  their csname strings compare equal; zero otherwise.
*/
my_bool my_charset_same(CHARSET_INFO *cs1, CHARSET_INFO *cs2)
{
  /* Identical pointers: no need to look at the names at all */
  if (cs1 == cs2)
    return (my_bool) 1;

  return (my_bool) (strcmp(cs1->csname, cs2->csname) == 0);
}
41
get_collation_number_internal(const char *name)
44
for (cs= all_charsets;
45
cs < all_charsets+array_elements(all_charsets)-1 ;
48
if ( cs[0] && cs[0]->name &&
49
!my_strcasecmp(&my_charset_latin1, cs[0]->name, name))
56
static my_bool init_state_maps(CHARSET_INFO *cs)
62
if (!(cs->state_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
65
if (!(cs->ident_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
68
state_map= cs->state_map;
69
ident_map= cs->ident_map;
71
/* Fill state_map with states to get a faster parser */
72
for (i=0; i < 256 ; i++)
75
state_map[i]=(uchar) MY_LEX_IDENT;
76
else if (my_isdigit(cs,i))
77
state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
78
#if defined(USE_MB) && defined(USE_MB_IDENT)
79
else if (my_mbcharlen(cs, i)>1)
80
state_map[i]=(uchar) MY_LEX_IDENT;
82
else if (my_isspace(cs,i))
83
state_map[i]=(uchar) MY_LEX_SKIP;
85
state_map[i]=(uchar) MY_LEX_CHAR;
87
state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT;
88
state_map[(uchar)'\'']=(uchar) MY_LEX_STRING;
89
state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT;
90
state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP;
91
state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP;
92
state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL;
93
state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT;
94
state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON;
95
state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR;
96
state_map[0]=(uchar) MY_LEX_EOL;
97
state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE;
98
state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT;
99
state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT;
100
state_map[(uchar)'@']= (uchar) MY_LEX_USER_END;
101
state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
102
state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER;
105
Create a second map to make it faster to find identifiers
107
for (i=0; i < 256 ; i++)
109
ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT ||
110
state_map[i] == MY_LEX_NUMBER_IDENT);
113
/* Special handling of hex and binary strings */
114
state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX;
115
state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN;
116
state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR;
121
/*
  Attach the generic 8-bit handlers to a charset.

  cs->cset always becomes my_charset_8bit_handler.  cs->coll becomes
  my_collation_8bit_bin_handler when MY_CS_BINSORT is set in cs->state,
  and my_collation_8bit_simple_ci_handler otherwise.
*/
static void simple_cs_init_functions(CHARSET_INFO *cs)
{
  cs->cset= &my_charset_8bit_handler;
  cs->coll= (cs->state & MY_CS_BINSORT) ?
            &my_collation_8bit_bin_handler :
            &my_collation_8bit_simple_ci_handler;
}
133
static int cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
135
to->number= from->number ? from->number : to->number;
138
if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
142
if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
146
if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
151
if (!(to->ctype= (uchar*) my_once_memdup((char*) from->ctype,
152
MY_CS_CTYPE_TABLE_SIZE,
155
if (init_state_maps(to))
159
if (!(to->to_lower= (uchar*) my_once_memdup((char*) from->to_lower,
160
MY_CS_TO_LOWER_TABLE_SIZE,
165
if (!(to->to_upper= (uchar*) my_once_memdup((char*) from->to_upper,
166
MY_CS_TO_UPPER_TABLE_SIZE,
169
if (from->sort_order)
171
if (!(to->sort_order= (uchar*) my_once_memdup((char*) from->sort_order,
172
MY_CS_SORT_ORDER_TABLE_SIZE,
177
if (from->tab_to_uni)
179
uint sz= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
180
if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni,
185
if (!(to->tailoring= my_once_strdup(from->tailoring,MYF(MY_WME))))
196
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
198
return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
200
(cs->number && cs->name &&
201
(cs->sort_order || (cs->state & MY_CS_BINSORT) )));
206
copy_uca_collation(CHARSET_INFO *to, CHARSET_INFO *from)
208
to->cset= from->cset;
209
to->coll= from->coll;
210
to->strxfrm_multiply= from->strxfrm_multiply;
211
to->min_sort_char= from->min_sort_char;
212
to->max_sort_char= from->max_sort_char;
213
to->mbminlen= from->mbminlen;
214
to->mbmaxlen= from->mbmaxlen;
215
to->state|= MY_CS_AVAILABLE | MY_CS_LOADED |
216
MY_CS_STRNXFRM | MY_CS_UNICODE;
220
static int add_collation(CHARSET_INFO *cs)
222
if (cs->name && (cs->number ||
223
(cs->number=get_collation_number_internal(cs->name))) &&
224
cs->number < array_elements(all_charsets))
226
if (!all_charsets[cs->number])
228
if (!(all_charsets[cs->number]=
229
(CHARSET_INFO*) my_once_alloc(sizeof(CHARSET_INFO),MYF(0))))
231
bzero((void*)all_charsets[cs->number],sizeof(CHARSET_INFO));
234
if (cs->primary_number == cs->number)
235
cs->state |= MY_CS_PRIMARY;
237
if (cs->binary_number == cs->number)
238
cs->state |= MY_CS_BINSORT;
240
all_charsets[cs->number]->state|= cs->state;
242
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
244
CHARSET_INFO *newcs= all_charsets[cs->number];
245
if (cs_copy_data(all_charsets[cs->number],cs))
248
if (!strcmp(cs->csname,"ucs2") )
250
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
251
copy_uca_collation(newcs, &my_charset_ucs2_unicode_ci);
254
else if (!strcmp(cs->csname, "utf8"))
256
#if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
257
copy_uca_collation(newcs, &my_charset_utf8_unicode_ci);
262
uchar *sort_order= all_charsets[cs->number]->sort_order;
263
simple_cs_init_functions(all_charsets[cs->number]);
266
if (simple_cs_is_full(all_charsets[cs->number]))
268
all_charsets[cs->number]->state |= MY_CS_LOADED;
270
all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
273
Check if case sensitive sort order: A < a < B.
274
We need the MY_CS_CSSORT flag for the regex library, and for
275
case sensitivity flag for 5.0 client protocol,
276
to support isCaseSensitive() method in JDBC driver
278
if (sort_order && sort_order['A'] < sort_order['a'] &&
279
sort_order['a'] < sort_order['B'])
280
all_charsets[cs->number]->state|= MY_CS_CSSORT;
282
if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
283
all_charsets[cs->number]->state|= MY_CS_PUREASCII;
289
We need the below to make get_charset_name()
290
and get_charset_number() working even if a
291
character set has not actually been compiled in.
292
The above functions are used for example
293
in error message compiler extra/comp_err.c.
294
If a character set was compiled, this information
295
will get lost and overwritten in add_compiled_collation().
297
CHARSET_INFO *dst= all_charsets[cs->number];
298
dst->number= cs->number;
300
if (!(dst->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
303
if (!(dst->csname= my_once_strdup(cs->csname,MYF(MY_WME))))
306
if (!(dst->name= my_once_strdup(cs->name,MYF(MY_WME))))
310
cs->primary_number= 0;
311
cs->binary_number= 0;
314
cs->sort_order= NULL;
321
/*
  Largest charset definition file (in bytes) that my_read_charset_file()
  accepts.  The replacement list is parenthesized so the macro expands as a
  single value in any expression (e.g. next to '/' or '%').
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)
/* File name appended to the charsets directory to locate the index file */
#define MY_CHARSET_INDEX "Index.xml"
324
const char *charsets_dir= NULL;
327
static my_bool my_read_charset_file(const char *filename, myf myflags)
334
if (!my_stat(filename, &stat_info, MYF(myflags)) ||
335
((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) ||
336
!(buf= (uchar*) my_malloc(len,myflags)))
339
if ((fd=my_open(filename,O_RDONLY,myflags)) < 0)
341
tmp_len=my_read(fd, buf, len, myflags);
342
my_close(fd,myflags);
346
if (my_parse_charset_xml((char*) buf,len,add_collation))
349
printf("ERROR at line %d pos %d '%s'\n",
350
my_xml_error_lineno(&p)+1,
351
my_xml_error_pos(&p),
352
my_xml_error_string(&p));
356
my_free(buf, myflags);
360
my_free(buf, myflags);
365
char *get_charsets_dir(char *buf)
367
const char *sharedir= SHAREDIR;
369
DBUG_ENTER("get_charsets_dir");
371
if (charsets_dir != NULL)
372
strmake(buf, charsets_dir, FN_REFLEN-1);
375
if (test_if_hard_path(sharedir) ||
376
is_prefix(sharedir, DEFAULT_CHARSET_HOME))
377
strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
379
strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
382
res= convert_dirname(buf,buf,NullS);
383
DBUG_PRINT("info",("charsets dir: '%s'", buf));
387
CHARSET_INFO *all_charsets[256]={NULL};
388
CHARSET_INFO *default_charset_info = &my_charset_latin1;
390
/*
  Register a collation in all_charsets[], using cs->number as the slot
  index, and flag it as MY_CS_AVAILABLE.

  A collation whose number does not fit into all_charsets[] is left
  unregistered (and its state untouched) instead of being written past the
  end of the table.
*/
void add_compiled_collation(CHARSET_INFO *cs)
{
  if (cs->number >= array_elements(all_charsets))
    return;

  all_charsets[cs->number]= cs;
  cs->state|= MY_CS_AVAILABLE;
}
396
static void *cs_alloc(size_t size)
398
return my_once_alloc(size, MYF(MY_WME));
402
static my_pthread_once_t charsets_initialized= MY_PTHREAD_ONCE_INIT;
404
static void init_available_charsets(void)
406
char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
409
bzero(&all_charsets,sizeof(all_charsets));
410
init_compiled_charsets(MYF(0));
412
/* Copy compiled charsets */
413
for (cs=all_charsets;
414
cs < all_charsets+array_elements(all_charsets)-1 ;
420
if (init_state_maps(*cs))
425
strmov(get_charsets_dir(fname), MY_CHARSET_INDEX);
426
my_read_charset_file(fname, MYF(0));
430
uint get_collation_number(const char *name)
432
my_pthread_once(&charsets_initialized, init_available_charsets);
433
return get_collation_number_internal(name);
437
uint get_charset_number(const char *charset_name, uint cs_flags)
440
my_pthread_once(&charsets_initialized, init_available_charsets);
442
for (cs= all_charsets;
443
cs < all_charsets+array_elements(all_charsets)-1 ;
446
if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
447
!my_strcasecmp(&my_charset_latin1, cs[0]->csname, charset_name))
448
return cs[0]->number;
454
/*
  Return the name stored for the given charset/collation number.

  charset_number  Index into all_charsets[]

  Returns cs->name when the slot is populated, its number matches
  charset_number and a name is set.  Returns "?" in every other case,
  including a charset_number that lies outside all_charsets[] (such an
  index is rejected before the table is touched).
*/
const char *get_charset_name(uint charset_number)
{
  my_pthread_once(&charsets_initialized, init_available_charsets);

  if (charset_number < array_elements(all_charsets))
  {
    CHARSET_INFO *cs= all_charsets[charset_number];
    if (cs && (cs->number == charset_number) && cs->name)
      return (char*) cs->name;
  }

  return (char*) "?";   /* this mimics find_type() */
}
467
static CHARSET_INFO *get_internal_charset(uint cs_number, myf flags)
472
if ((cs= all_charsets[cs_number]))
474
if (cs->state & MY_CS_READY) /* if CS is already initialized */
478
To make things thread safe we are not allowing other threads to interfere
479
while we may be changing the cs_info_table
481
pthread_mutex_lock(&THR_LOCK_charset);
483
if (!(cs->state & (MY_CS_COMPILED|MY_CS_LOADED))) /* if CS is not in memory */
485
strxmov(get_charsets_dir(buf), cs->csname, ".xml", NullS);
486
my_read_charset_file(buf,flags);
489
if (cs->state & MY_CS_AVAILABLE)
491
if (!(cs->state & MY_CS_READY))
493
if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
494
(cs->coll->init && cs->coll->init(cs, cs_alloc)))
497
cs->state|= MY_CS_READY;
503
pthread_mutex_unlock(&THR_LOCK_charset);
509
CHARSET_INFO *get_charset(uint cs_number, myf flags)
512
if (cs_number == default_charset_info->number)
513
return default_charset_info;
515
my_pthread_once(&charsets_initialized, init_available_charsets);
517
if (!cs_number || cs_number >= array_elements(all_charsets)-1)
520
cs=get_internal_charset(cs_number, flags);
522
if (!cs && (flags & MY_WME))
524
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)], cs_string[23];
525
strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
527
int10_to_str(cs_number, cs_string+1, 10);
528
my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
533
CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
537
my_pthread_once(&charsets_initialized, init_available_charsets);
539
cs_number=get_collation_number(cs_name);
540
cs= cs_number ? get_internal_charset(cs_number,flags) : NULL;
542
if (!cs && (flags & MY_WME))
544
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
545
strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
546
my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), cs_name, index_file);
553
CHARSET_INFO *get_charset_by_csname(const char *cs_name,
559
DBUG_ENTER("get_charset_by_csname");
560
DBUG_PRINT("enter",("name: '%s'", cs_name));
562
my_pthread_once(&charsets_initialized, init_available_charsets);
564
cs_number= get_charset_number(cs_name, cs_flags);
565
cs= cs_number ? get_internal_charset(cs_number, flags) : NULL;
567
if (!cs && (flags & MY_WME))
569
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
570
strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
571
my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
579
Resolve character set by the character set name (utf8, latin1, ...).
581
The function tries to resolve character set by the specified name. If
582
there is character set with the given name, it is assigned to the "cs"
583
parameter and FALSE is returned. If there is no such character set,
584
"default_cs" is assigned to the "cs" and TRUE is returned.
586
@param[in] cs_name Character set name.
587
@param[in] default_cs Default character set.
588
@param[out] cs Variable to store character set.
590
@return FALSE if character set was resolved successfully; TRUE if there
591
is no character set with given name.
594
my_bool resolve_charset(const char *cs_name,
595
CHARSET_INFO *default_cs,
598
*cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0));
611
Resolve collation by the collation name (utf8_general_ci, ...).
613
The function tries to resolve collation by the specified name. If there
614
is collation with the given name, it is assigned to the "cl" parameter
615
and FALSE is returned. If there is no such collation, "default_cl" is
616
assigned to the "cl" and TRUE is returned.
618
@param[out] cl Variable to store collation.
619
@param[in] cl_name Collation name.
620
@param[in] default_cl Default collation.
622
@return FALSE if collation was resolved successfully; TRUE if there is no
623
collation with given name.
626
my_bool resolve_collation(const char *cl_name,
627
CHARSET_INFO *default_cl,
630
*cl= get_charset_by_name(cl_name, MYF(0));
643
Escape string with backslashes (\)
646
escape_string_for_mysql()
647
charset_info Charset of the strings
648
to Buffer for escaped string
649
to_length Length of destination buffer, or 0
650
from The string to escape
651
length The length of the string to escape
654
This escapes the contents of a string by adding backslashes before special
655
characters, and turning others into specific escape sequences, such as
656
turning newlines into \n and null bytes into \0.
659
To maintain compatibility with the old C API, to_length may be 0 to mean
663
(size_t) -1 The escaped string did not fit in the to buffer
664
# The length of the escaped string
667
size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
668
char *to, size_t to_length,
669
const char *from, size_t length)
671
const char *to_start= to;
672
const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
673
my_bool overflow= FALSE;
675
my_bool use_mb_flag= use_mb(charset_info);
677
for (end= from + length; from < end; from++)
682
if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
684
if (to + tmp_length > to_end)
695
If the next character appears to begin a multi-byte character, we
696
escape the first byte of that apparent multi-byte character. (The
697
character just looks like a multi-byte character -- if it were actually
698
a multi-byte character, it would have been passed through in the test
701
Without this check, we can create a problem by converting an invalid
702
multi-byte character into a valid one. For example, 0xbf27 is not
703
a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
705
if (use_mb_flag && (tmp_length= my_mbcharlen(charset_info, *from)) > 1)
710
case 0: /* Must be escaped for 'mysql' */
713
case '\n': /* Must be escaped for logs */
725
case '"': /* Better safe than sorry */
728
case '\032': /* This gives problems on Win32 */
753
return overflow ? (size_t) -1 : (size_t) (to - to_start);
757
#ifdef BACKSLASH_MBTAIL
758
static CHARSET_INFO *fs_cset_cache= NULL;
760
CHARSET_INFO *fs_character_set()
765
GetLocaleInfo(LOCALE_SYSTEM_DEFAULT, LOCALE_IDEFAULTANSICODEPAGE,
766
buf+2, sizeof(buf)-3);
768
We cannot call get_charset_by_name here
769
because fs_character_set() is executed before
770
THR_LOCK_charset mutex initialization, which
771
is used inside get_charset_by_name.
772
As we're now interested in cp932 only,
773
let's just detect it using strcmp().
775
fs_cset_cache= !strcmp(buf, "cp932") ?
776
&my_charset_cp932_japanese_ci : &my_charset_bin;
778
return fs_cset_cache;
783
Escape apostrophes by doubling them up
786
escape_quotes_for_mysql()
787
charset_info Charset of the strings
788
to Buffer for escaped string
789
to_length Length of destination buffer, or 0
790
from The string to escape
791
length The length of the string to escape
794
This escapes the contents of a string by doubling up any apostrophes that
795
it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
796
effect on the server.
799
To be consistent with escape_string_for_mysql(), to_length may be 0 to
803
~0 The escaped string did not fit in the to buffer
804
>=0 The length of the escaped string
807
size_t escape_quotes_for_mysql(CHARSET_INFO *charset_info,
808
char *to, size_t to_length,
809
const char *from, size_t length)
811
const char *to_start= to;
812
const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
813
my_bool overflow= FALSE;
815
my_bool use_mb_flag= use_mb(charset_info);
817
for (end= from + length; from < end; from++)
821
if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
823
if (to + tmp_length > to_end)
834
We don't have the same issue here with a non-multi-byte character being
835
turned into a multi-byte character by the addition of an escaping
836
character, because we are only escaping the ' character with itself.
860
/*
  Report through size_t, exactly like escape_string_for_mysql(): where
  ulong is narrower than size_t, (ulong)~0 is not (size_t) -1 and the
  length would be truncated.
*/
return overflow ? (size_t) -1 : (size_t) (to - to_start);