summaryrefslogtreecommitdiff
path: root/include/m_ctype.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/m_ctype.h')
-rw-r--r--include/m_ctype.h192
1 files changed, 153 insertions, 39 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index ddb4c825e1b..04a82953f0a 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -181,11 +181,12 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
/* A helper macros for "need at least n bytes" */
#define MY_CS_TOOSMALLN(n) (-100-(n))
+#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */
#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
-
#define MY_SEQ_INTTAIL 1
#define MY_SEQ_SPACES 2
+#define MY_SEQ_NONSPACES 3 /* Skip non-space characters, including bad bytes */
/* My charsets_list flags */
#define MY_CS_COMPILED 1 /* compiled-in sets */
@@ -329,8 +330,7 @@ struct my_collation_handler_st
int (*strnncoll)(CHARSET_INFO *,
const uchar *, size_t, const uchar *, size_t, my_bool);
int (*strnncollsp)(CHARSET_INFO *,
- const uchar *, size_t, const uchar *, size_t,
- my_bool diff_if_only_endspace_difference);
+ const uchar *, size_t, const uchar *, size_t);
size_t (*strnxfrm)(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
@@ -361,6 +361,8 @@ struct my_collation_handler_st
extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
+extern MY_COLLATION_HANDLER my_collation_8bit_nopad_bin_handler;
+extern MY_COLLATION_HANDLER my_collation_8bit_simple_nopad_ci_handler;
extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;
/* Some typedef to make it easy for C++ to make function pointers */
@@ -394,7 +396,6 @@ typedef struct
*/
typedef struct
{
- MY_STRCOPY_STATUS m_native_copy_status;
const char *m_cannot_convert_error_pos;
} MY_STRCONV_STATUS;
@@ -404,14 +405,9 @@ struct my_charset_handler_st
{
my_bool (*init)(struct charset_info_st *, MY_CHARSET_LOADER *loader);
/* Multibyte routines */
- uint (*ismbchar)(CHARSET_INFO *, const char *, const char *);
- uint (*mbcharlen)(CHARSET_INFO *, uint c);
size_t (*numchars)(CHARSET_INFO *, const char *b, const char *e);
size_t (*charpos)(CHARSET_INFO *, const char *b, const char *e,
size_t pos);
- size_t (*well_formed_len)(CHARSET_INFO *,
- const char *b,const char *e,
- size_t nchars, int *error);
size_t (*lengthsp)(CHARSET_INFO *, const char *ptr, size_t length);
size_t (*numcells)(CHARSET_INFO *, const char *b, const char *e);
@@ -586,50 +582,87 @@ struct charset_info_st
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_bin;
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_latin1;
+extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_latin1_nopad;
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_filename;
extern MYSQL_PLUGIN_IMPORT struct charset_info_st my_charset_utf8_general_ci;
extern struct charset_info_st my_charset_big5_bin;
extern struct charset_info_st my_charset_big5_chinese_ci;
+extern struct charset_info_st my_charset_big5_nopad_bin;
+extern struct charset_info_st my_charset_big5_chinese_nopad_ci;
extern struct charset_info_st my_charset_cp1250_czech_ci;
extern struct charset_info_st my_charset_cp932_bin;
extern struct charset_info_st my_charset_cp932_japanese_ci;
+extern struct charset_info_st my_charset_cp932_nopad_bin;
+extern struct charset_info_st my_charset_cp932_japanese_nopad_ci;
extern struct charset_info_st my_charset_eucjpms_bin;
extern struct charset_info_st my_charset_eucjpms_japanese_ci;
+extern struct charset_info_st my_charset_eucjpms_nopad_bin;
+extern struct charset_info_st my_charset_eucjpms_japanese_nopad_ci;
extern struct charset_info_st my_charset_euckr_bin;
extern struct charset_info_st my_charset_euckr_korean_ci;
+extern struct charset_info_st my_charset_euckr_nopad_bin;
+extern struct charset_info_st my_charset_euckr_korean_nopad_ci;
extern struct charset_info_st my_charset_gb2312_bin;
extern struct charset_info_st my_charset_gb2312_chinese_ci;
+extern struct charset_info_st my_charset_gb2312_nopad_bin;
+extern struct charset_info_st my_charset_gb2312_chinese_nopad_ci;
extern struct charset_info_st my_charset_gbk_bin;
extern struct charset_info_st my_charset_gbk_chinese_ci;
+extern struct charset_info_st my_charset_gbk_nopad_bin;
+extern struct charset_info_st my_charset_gbk_chinese_nopad_ci;
extern struct charset_info_st my_charset_latin1_bin;
+extern struct charset_info_st my_charset_latin1_nopad_bin;
extern struct charset_info_st my_charset_latin1_german2_ci;
extern struct charset_info_st my_charset_latin2_czech_ci;
extern struct charset_info_st my_charset_sjis_bin;
extern struct charset_info_st my_charset_sjis_japanese_ci;
+extern struct charset_info_st my_charset_sjis_nopad_bin;
+extern struct charset_info_st my_charset_sjis_japanese_nopad_ci;
extern struct charset_info_st my_charset_tis620_bin;
extern struct charset_info_st my_charset_tis620_thai_ci;
+extern struct charset_info_st my_charset_tis620_nopad_bin;
+extern struct charset_info_st my_charset_tis620_thai_nopad_ci;
extern struct charset_info_st my_charset_ucs2_bin;
extern struct charset_info_st my_charset_ucs2_general_ci;
+extern struct charset_info_st my_charset_ucs2_nopad_bin;
+extern struct charset_info_st my_charset_ucs2_general_nopad_ci;
extern struct charset_info_st my_charset_ucs2_general_mysql500_ci;
extern struct charset_info_st my_charset_ucs2_unicode_ci;
+extern struct charset_info_st my_charset_ucs2_unicode_nopad_ci;
extern struct charset_info_st my_charset_ucs2_general_mysql500_ci;
extern struct charset_info_st my_charset_ujis_bin;
extern struct charset_info_st my_charset_ujis_japanese_ci;
+extern struct charset_info_st my_charset_ujis_nopad_bin;
+extern struct charset_info_st my_charset_ujis_japanese_nopad_ci;
extern struct charset_info_st my_charset_utf16_bin;
extern struct charset_info_st my_charset_utf16_general_ci;
extern struct charset_info_st my_charset_utf16_unicode_ci;
+extern struct charset_info_st my_charset_utf16_unicode_nopad_ci;
extern struct charset_info_st my_charset_utf16le_bin;
extern struct charset_info_st my_charset_utf16le_general_ci;
+extern struct charset_info_st my_charset_utf16_general_nopad_ci;
+extern struct charset_info_st my_charset_utf16_nopad_bin;
+extern struct charset_info_st my_charset_utf16le_nopad_bin;
+extern struct charset_info_st my_charset_utf16le_general_nopad_ci;
extern struct charset_info_st my_charset_utf32_bin;
extern struct charset_info_st my_charset_utf32_general_ci;
extern struct charset_info_st my_charset_utf32_unicode_ci;
+extern struct charset_info_st my_charset_utf32_unicode_nopad_ci;
+extern struct charset_info_st my_charset_utf32_nopad_bin;
+extern struct charset_info_st my_charset_utf32_general_nopad_ci;
extern struct charset_info_st my_charset_utf8_bin;
+extern struct charset_info_st my_charset_utf8_nopad_bin;
+extern struct charset_info_st my_charset_utf8_general_nopad_ci;
extern struct charset_info_st my_charset_utf8_general_mysql500_ci;
extern struct charset_info_st my_charset_utf8_unicode_ci;
+extern struct charset_info_st my_charset_utf8_unicode_nopad_ci;
extern struct charset_info_st my_charset_utf8mb4_bin;
extern struct charset_info_st my_charset_utf8mb4_general_ci;
+extern struct charset_info_st my_charset_utf8mb4_nopad_bin;
+extern struct charset_info_st my_charset_utf8mb4_general_nopad_ci;
extern struct charset_info_st my_charset_utf8mb4_unicode_ci;
+extern struct charset_info_st my_charset_utf8mb4_unicode_nopad_ci;
#define MY_UTF8MB3 "utf8"
#define MY_UTF8MB4 "utf8mb4"
@@ -649,16 +682,31 @@ extern int my_strnncoll_simple(CHARSET_INFO *, const uchar *, size_t,
const uchar *, size_t, my_bool);
extern int my_strnncollsp_simple(CHARSET_INFO *, const uchar *, size_t,
- const uchar *, size_t,
- my_bool diff_if_only_endspace_difference);
+ const uchar *, size_t);
extern void my_hash_sort_simple(CHARSET_INFO *cs,
const uchar *key, size_t len,
ulong *nr1, ulong *nr2);
+
+extern void my_hash_sort_simple_nopad(CHARSET_INFO *cs,
+ const uchar *key, size_t len,
+ ulong *nr1, ulong *nr2);
+
extern void my_hash_sort_bin(CHARSET_INFO *cs,
const uchar *key, size_t len, ulong *nr1,
ulong *nr2);
+/**
+ Compare a string to an array of spaces, for PAD SPACE comparison.
+ The function iterates through the string and compares every byte to 0x20.
+ @param - the string
+ @param - its length
+ @return <0 - if a byte less than 0x20 was found in the string.
+ @return 0 - if all bytes in the string were 0x20, or if length was 0.
+ @return >0 - if a byte greater than 0x20 was found in the string.
+*/
+extern int my_strnncollsp_padspace_bin(const uchar *str, size_t length);
+
extern size_t my_lengthsp_8bit(CHARSET_INFO *cs, const char *ptr, size_t length);
extern uint my_instr_simple(CHARSET_INFO *,
@@ -764,14 +812,11 @@ int my_wildcmp_bin(CHARSET_INFO *,
size_t my_numchars_8bit(CHARSET_INFO *, const char *b, const char *e);
size_t my_numcells_8bit(CHARSET_INFO *, const char *b, const char *e);
size_t my_charpos_8bit(CHARSET_INFO *, const char *b, const char *e, size_t pos);
-size_t my_well_formed_len_8bit(CHARSET_INFO *, const char *b, const char *e,
- size_t pos, int *error);
size_t my_well_formed_char_length_8bit(CHARSET_INFO *cs,
const char *b, const char *e,
size_t nchars,
MY_STRCOPY_STATUS *status);
int my_charlen_8bit(CHARSET_INFO *, const uchar *str, const uchar *end);
-uint my_mbcharlen_8bit(CHARSET_INFO *, uint c);
/* Functions for multibyte charsets */
@@ -798,23 +843,11 @@ int my_wildcmp_mb(CHARSET_INFO *,
size_t my_numchars_mb(CHARSET_INFO *, const char *b, const char *e);
size_t my_numcells_mb(CHARSET_INFO *, const char *b, const char *e);
size_t my_charpos_mb(CHARSET_INFO *, const char *b, const char *e, size_t pos);
-size_t my_well_formed_len_mb(CHARSET_INFO *, const char *b, const char *e,
- size_t pos, int *error);
uint my_instr_mb(CHARSET_INFO *,
const char *b, size_t b_length,
const char *s, size_t s_length,
my_match_t *match, uint nmatch);
-int my_strnncoll_mb_bin(CHARSET_INFO * cs,
- const uchar *s, size_t slen,
- const uchar *t, size_t tlen,
- my_bool t_is_prefix);
-
-int my_strnncollsp_mb_bin(CHARSET_INFO *cs,
- const uchar *a, size_t a_length,
- const uchar *b, size_t b_length,
- my_bool diff_if_only_endspace_difference);
-
int my_wildcmp_mb_bin(CHARSET_INFO *cs,
const char *str,const char *str_end,
const char *wildstr,const char *wildend,
@@ -826,18 +859,38 @@ int my_strcasecmp_mb_bin(CHARSET_INFO * cs __attribute__((unused)),
void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
const uchar *key, size_t len,ulong *nr1, ulong *nr2);
+void my_hash_sort_mb_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
+ const uchar *key, size_t len,
+ ulong *nr1, ulong *nr2);
+
size_t my_strnxfrm_mb(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
+size_t my_strnxfrm_mb_nopad(CHARSET_INFO *,
+ uchar *dst, size_t dstlen, uint nweights,
+ const uchar *src, size_t srclen, uint flags);
+
size_t my_strnxfrm_unicode(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
+
+size_t my_strnxfrm_unicode_nopad(CHARSET_INFO *,
+ uchar *dst, size_t dstlen, uint nweights,
+ const uchar *src, size_t srclen, uint flags);
+
size_t my_strnxfrmlen_unicode(CHARSET_INFO *, size_t);
size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
- uchar *dst, size_t dstlen, uint nweights,
- const uchar *src, size_t srclen, uint flags);
+ uchar *dst, size_t dstlen,
+ uint nweights, const uchar *src,
+ size_t srclen, uint flags);
+
+size_t my_strnxfrm_unicode_full_nopad_bin(CHARSET_INFO *,
+ uchar *dst, size_t dstlen,
+ uint nweights, const uchar *src,
+ size_t srclen, uint flags);
+
size_t my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t);
int my_wildcmp_unicode(CHARSET_INFO *cs,
@@ -867,7 +920,6 @@ void my_string_metadata_get(MY_STRING_METADATA *metadata,
CHARSET_INFO *cs, const char *str, size_t len);
uint my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong len);
my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
-my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
uint my_charset_repertoire(CHARSET_INFO *cs);
uint my_strxfrm_flag_normalize(uint flags, uint nlevels);
@@ -876,8 +928,10 @@ void my_strxfrm_desc_and_reverse(uchar *str, uchar *strend,
size_t my_strxfrm_pad_desc_and_reverse(CHARSET_INFO *cs,
uchar *str, uchar *frmend, uchar *strend,
uint nweights, uint flags, uint level);
-
-my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
+size_t my_strxfrm_pad_desc_and_reverse_nopad(CHARSET_INFO *cs,
+ uchar *str, uchar *frmend,
+ uchar *strend, uint nweights,
+ uint flags, uint level);
const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs,
int level);
@@ -931,7 +985,9 @@ uint32 my_convert_using_func(char *to, uint32 to_length, CHARSET_INFO *to_cs,
*/
size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
CHARSET_INFO *srccs, const char *src, size_t src_length,
- size_t nchars, MY_STRCONV_STATUS *status);
+ size_t nchars,
+ MY_STRCOPY_STATUS *copy_status,
+ MY_STRCONV_STATUS *conv_status);
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
@@ -976,13 +1032,71 @@ size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
#define my_strcasecmp(s, a, b) ((s)->coll->strcasecmp((s), (a), (b)))
#define my_charpos(cs, b, e, num) (cs)->cset->charpos((cs), (const char*) (b), (const char *)(e), (num))
-#define use_mb(s) ((s)->cset->ismbchar != NULL)
-#define my_ismbchar(s, a, b) ((s)->cset->ismbchar((s), (a), (b)))
-#ifdef USE_MB
-#define my_mbcharlen(s, a) ((s)->cset->mbcharlen((s),(a)))
-#else
-#define my_mbcharlen(s, a) 1
-#endif
+#define use_mb(s) ((s)->mbmaxlen > 1)
+/**
+ Detect if the leftmost character in a string is a valid multi-byte character
+ and return its length, or return 0 otherwise.
+ @param cs - character set
+ @param str - the beginning of the string
+ @param end - the string end (the next byte after the string)
+ @return >0, for a multi-byte character
+ @rerurn 0, for a single byte character, broken sequence, empty string.
+*/
+static inline
+uint my_ismbchar(CHARSET_INFO *cs, const char *str, const char *end)
+{
+ int char_length= (cs->cset->charlen)(cs, (const uchar *) str,
+ (const uchar *) end);
+ return char_length > 1 ? (uint) char_length : 0U;
+}
+
+
+/**
+ Return length of the leftmost character in a string.
+ @param cs - character set
+ @param str - the beginning of the string
+ @param end - the string end (the next byte after the string)
+ @return <=0 on errors (EOL, wrong byte sequence)
+ @return 1 on a single byte character
+ @return >1 on a multi-byte character
+
+ Note, inlike my_ismbchar(), 1 is returned for a single byte character.
+*/
+static inline
+int my_charlen(CHARSET_INFO *cs, const char *str, const char *end)
+{
+ return (cs->cset->charlen)(cs, (const uchar *) str,
+ (const uchar *) end);
+}
+
+
+/**
+ Convert broken and incomplete byte sequences to 1 byte.
+*/
+static inline
+uint my_charlen_fix(CHARSET_INFO *cs, const char *str, const char *end)
+{
+ int char_length= my_charlen(cs, str, end);
+ DBUG_ASSERT(str < end);
+ return char_length > 0 ? (uint) char_length : (uint) 1U;
+}
+
+
+/*
+ A compatibility replacement pure C function for the former
+ cs->cset->well_formed_len().
+ In C++ code please use Well_formed_prefix::length() instead.
+*/
+static inline size_t
+my_well_formed_length(CHARSET_INFO *cs, const char *b, const char *e,
+ size_t nchars, int *error)
+{
+ MY_STRCOPY_STATUS status;
+ (void) cs->cset->well_formed_char_length(cs, b, e, nchars, &status);
+ *error= status.m_well_formed_error_pos == NULL ? 0 : 1;
+ return status.m_source_end_pos - b;
+}
+
#define my_caseup_str(s, a) ((s)->cset->caseup_str((s), (a)))
#define my_casedn_str(s, a) ((s)->cset->casedn_str((s), (a)))