5 files changed, 275 insertions, 7 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h
index 83b12c3c4f6..811b3b71a17 100644
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -34,7 +34,9 @@ enum loglevel {
 extern "C" {
 #endif
 
-#define MY_CS_NAME_SIZE			32
+#define MY_CS_CHARACTER_SET_NAME_SIZE   32
+#define MY_CS_COLLATION_NAME_SIZE       64
+
 #define MY_CS_CTYPE_TABLE_SIZE		257
 #define MY_CS_TO_LOWER_TABLE_SIZE	256
 #define MY_CS_TO_UPPER_TABLE_SIZE	256
@@ -116,7 +118,7 @@ extern MY_UNICASE_INFO my_unicase_unicode520;
 */
 #define MY_UCA_MAX_WEIGHT_SIZE (8+1)               /* Including 0 terminator */
 #define MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE (2*8+1) /* Including 0 terminator */
-#define MY_UCA_WEIGHT_LEVELS   2
+#define MY_UCA_WEIGHT_LEVELS   3
 
 typedef struct my_contraction_t
 {
@@ -139,6 +141,65 @@ const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
                                          my_wc_t wc1, my_wc_t wc2);
 
 
+typedef struct my_uca_weight2_t
+{
+  uint16 weight[2];
+} MY_UCA_WEIGHT2;
+
+
+/*
+  In DUCET as of Unicode-14.0.0:
+  - All characters in the range U+0000..U+007F (i.e. using one byte in utf8)
+    have not more than two weights on all weight levels.
+  - All characters in the range U+0080..U+07FF (i.e. using two bytes in utf8)
+    have not more than four weights on all weight levels.
+  Therefore the limit of 4 weights should cover all byte pairs
+  (i.e. two ASCII characters or one 2-byte character)
+  that are a subject for the "process 2 bytes at a time" optimization.
+  If some collation reorders any character from the mentioned ranges
+  in the way that it produces more weights, such character will not
+  be optimized, but will be correctly processed by the slower mb_wc-based
+  method (1 character at a time).
+*/
+#define MY_UCA_2BYTES_MAX_WEIGHT_SIZE (4+1) /* Including 0 terminator */
+
+typedef struct my_uca_2bytes_item_t
+{
+  uint16 weight[MY_UCA_2BYTES_MAX_WEIGHT_SIZE];
+} MY_UCA_2BYTES_ITEM;
+
+
+typedef struct my_uca_level_booster_t
+{
+  /*
+    A helper array to process 2 bytes at a time during string comparison.
+    It maps all 2-byte sequences that make:
+    - two ASCII characters or
+    - one 2-byte character
+    to their weights. The weight length is limited to
+    MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 weights (one slot is the 0 terminator).
+    This array is used in the main loop optimization.
+  */
+  MY_UCA_2BYTES_ITEM weight_strings_2bytes[0x10000];
+  /*
+    A helper array to process 2 bytes at a time during string comparison,
+    in an even more efficient way than the above one.
+    The weight size is limited to 2 weights, so it's used for the cases
+    when 2 input bytes produce 1 or 2 weights.
+    This limit makes the code using this array even simpler and faster.
+    This array is used for prefix optimization.
+  */
+  MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000];
+} MY_UCA_LEVEL_BOOSTER;
+
+
+typedef struct my_uca_contraction_hash_t
+{
+  size_t nitems_alloced;
+  MY_CONTRACTION *item;
+} MY_UCA_CONTRACTION_HASH;
+
+
 /* Collation weights on a single level (e.g. primary, secondary, tertiarty) */
 typedef struct my_uca_level_info_st
 {
@@ -147,6 +208,8 @@ typedef struct my_uca_level_info_st
   uint16  **weights;
   MY_CONTRACTIONS contractions;
   uint    levelno;
+  MY_UCA_CONTRACTION_HASH contraction_hash;
+  MY_UCA_LEVEL_BOOSTER *booster;
 } MY_UCA_WEIGHT_LEVEL;
 
 
@@ -168,6 +231,9 @@ typedef struct uca_info_st
   my_wc_t first_variable;
   my_wc_t last_variable;
 
+  /* Unicode version */
+  uint version;
+
 } MY_UCA_INFO;
 
 
@@ -237,6 +303,46 @@ typedef enum enum_repertoire_t
 } my_repertoire_t;
 
 
+/* ID compatibility */
+typedef enum enum_collation_id_type
+{
+  MY_COLLATION_ID_TYPE_PRECISE=          0,
+  MY_COLLATION_ID_TYPE_COMPAT_100800=    1
+} my_collation_id_type_t;
+
+
+/* Collation name display modes */
+typedef enum enum_collation_name_mode
+{
+  MY_COLLATION_NAME_MODE_FULL=                                 0,
+  MY_COLLATION_NAME_MODE_CONTEXT=                              1
+} my_collation_name_mode_t;
+
+
+/* Level flags: MY_CS_LEVEL_BIT_* are bit positions (shift counts) */
+#define MY_CS_LEVEL_BIT_PRIMARY    0x00
+#define MY_CS_LEVEL_BIT_SECONDARY  0x01
+#define MY_CS_LEVEL_BIT_TERTIARY   0x02
+#define MY_CS_LEVEL_BIT_QUATERNARY 0x03
+/* Level masks, fully parenthesized so they embed safely in expressions */
+#define MY_CS_COLL_LEVELS_S1       (1<<MY_CS_LEVEL_BIT_PRIMARY)
+
+#define MY_CS_COLL_LEVELS_AI_CS    ((1<<MY_CS_LEVEL_BIT_PRIMARY)| \
+                                    (1<<MY_CS_LEVEL_BIT_TERTIARY))
+
+#define MY_CS_COLL_LEVELS_S2       ((1<<MY_CS_LEVEL_BIT_PRIMARY)| \
+                                    (1<<MY_CS_LEVEL_BIT_SECONDARY))
+
+#define MY_CS_COLL_LEVELS_S3       ((1<<MY_CS_LEVEL_BIT_PRIMARY)| \
+                                    (1<<MY_CS_LEVEL_BIT_SECONDARY) | \
+                                    (1<<MY_CS_LEVEL_BIT_TERTIARY))
+
+#define MY_CS_COLL_LEVELS_S4       ((1<<MY_CS_LEVEL_BIT_PRIMARY)| \
+                                    (1<<MY_CS_LEVEL_BIT_SECONDARY) | \
+                                    (1<<MY_CS_LEVEL_BIT_TERTIARY)  | \
+                                    (1<<MY_CS_LEVEL_BIT_QUATERNARY))
+
+
 /* Flags for strxfrm */
 #define MY_STRXFRM_LEVEL1          0x00000001 /* for primary weights   */
 #define MY_STRXFRM_LEVEL2          0x00000002 /* for secondary weights */
@@ -437,8 +543,13 @@ struct my_collation_handler_st
   */
   size_t (*min_str)(CHARSET_INFO *cs, uchar *dst, size_t dstlen, size_t nchars);
   size_t (*max_str)(CHARSET_INFO *cs, uchar *dst, size_t dstlen, size_t nchars);
+
+  uint (*get_id)(CHARSET_INFO *cs, my_collation_id_type_t type);
+  LEX_CSTRING (*get_collation_name)(CHARSET_INFO *cs,
+                                    my_collation_name_mode_t mode);
 };
 
+
 extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
 extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
 extern MY_COLLATION_HANDLER my_collation_8bit_nopad_bin_handler;
@@ -840,6 +951,21 @@ struct charset_info_st
   }
 
   /* Collation routines */
+  uint default_flag() const
+  {
+    return state & MY_CS_PRIMARY;
+  }
+
+  uint binsort_flag() const
+  {
+    return state & MY_CS_BINSORT;
+  }
+
+  uint compiled_flag() const
+  {
+    return state & MY_CS_COMPILED;
+  }
+
   int strnncoll(const uchar *a, size_t alen,
                 const uchar *b, size_t blen, my_bool b_is_prefix= FALSE) const
   {
@@ -937,6 +1063,15 @@ struct charset_info_st
     return (coll->max_str)(this, dst, dstlen, nchars);
   }
 
+  uint get_id(my_collation_id_type_t type) const
+  {
+    return (coll->get_id)(this, type);
+  }
+
+  LEX_CSTRING get_collation_name(my_collation_name_mode_t mode) const
+  {
+    return (coll->get_collation_name)(this, mode);
+  }
 #endif /* __cplusplus */
 };
 
@@ -1517,6 +1652,9 @@ extern size_t my_strcspn(CHARSET_INFO *cs, const char *str, const char *end,
 my_bool my_propagate_simple(CHARSET_INFO *cs, const uchar *str, size_t len);
 my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, size_t len);
 
+uint my_ci_get_id_generic(CHARSET_INFO *cs, my_collation_id_type_t type);
+LEX_CSTRING my_ci_get_collation_name_generic(CHARSET_INFO *cs,
+                                             my_collation_name_mode_t mode);
 
 typedef struct 
 {
@@ -1531,7 +1669,7 @@ my_repertoire_t my_string_repertoire(CHARSET_INFO *cs,
 my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
 my_repertoire_t my_charset_repertoire(CHARSET_INFO *cs);
 
-uint my_strxfrm_flag_normalize(uint flags, uint nlevels);
+uint my_strxfrm_flag_normalize(CHARSET_INFO *cs, uint flags);
 void my_strxfrm_desc_and_reverse(uchar *str, uchar *strend,
                                  uint flags, uint level);
 size_t my_strxfrm_pad_desc_and_reverse(CHARSET_INFO *cs,
diff --git a/include/my_rnd.h b/include/my_rnd.h
index a3e3788085d..dc8efbd276e 100644
--- a/include/my_rnd.h
+++ b/include/my_rnd.h
@@ -25,7 +25,6 @@ struct my_rnd_struct {
 
 void my_rnd_init(struct my_rnd_struct *rand_st, ulong seed1, ulong seed2);
 double my_rnd(struct my_rnd_struct *rand_st);
-double my_rnd_ssl(struct my_rnd_struct *rand_st);
 
 C_MODE_END
 
diff --git a/include/my_sys.h b/include/my_sys.h
index 00a901a313b..a292200d2c4 100644
--- a/include/my_sys.h
+++ b/include/my_sys.h
@@ -243,7 +243,7 @@ extern void (*proc_info_hook)(void *, const PSI_stage_info *, PSI_stage_info *,
                               const char *, const char *, const unsigned int);
 
 /* charsets */
-#define MY_ALL_CHARSETS_SIZE 2048
+#define MY_ALL_CHARSETS_SIZE 4096
 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO *default_charset_info;
 extern MYSQL_PLUGIN_IMPORT CHARSET_INFO *all_charsets[MY_ALL_CHARSETS_SIZE];
 extern struct charset_info_st compiled_charsets[];
@@ -1123,4 +1123,135 @@ void my_init_mysys_psi_keys(void);
 struct st_mysql_file;
 extern struct st_mysql_file *mysql_stdin;
 C_MODE_END
+
+
+#ifdef __cplusplus
+/* C++ wrapper over MY_CHARSET_LOADER for charset and collation lookups */
+class Charset_loader_mysys: public MY_CHARSET_LOADER
+{
+public:
+  Charset_loader_mysys()
+  {
+    my_charset_loader_init_mysys(this);
+  }
+
+  /**
+    Get a CHARSET_INFO by a character set name.
+
+    @param cs_name   Character set name
+    @param cs_flags  e.g. MY_CS_PRIMARY, MY_CS_BINSORT
+    @param my_flags  mysys flags (MY_WME, MY_UTF8_IS_UTF8MB3)
+    @return
+    @retval          NULL on error (e.g. not found)
+    @retval          A CHARSET_INFO pointer on success
+  */
+  CHARSET_INFO *get_charset(const char *cs_name, uint cs_flags, myf my_flags)
+  {
+    error[0]= '\0'; // Need to clear in case of the second call
+    return my_charset_get_by_name(this, cs_name, cs_flags, my_flags);
+  }
+
+  /**
+    Get a CHARSET_INFO by an exact collation name.
+
+    @param name      Collation name
+    @param my_flags  e.g. the utf8 translation flag
+    @return
+    @retval          NULL on error (e.g. not found)
+    @retval          A CHARSET_INFO pointer on success
+  */
+  CHARSET_INFO *get_exact_collation(const char *name, myf my_flags)
+  {
+    error[0]= '\0'; // Need to clear in case of the second call
+    return my_collation_get_by_name(this, name, my_flags);
+  }
+
+  /**
+    Get a CHARSET_INFO by a context collation name.
+    The lookup is done in the character set of my_charset_utf8mb4_general_ci;
+    the returned pointer must be further resolved to a character set.
+    @param name      Collation name
+    @param my_flags  mysys flags, e.g. the utf8 translation flag
+    @return
+    @retval          NULL on error (e.g. not found)
+    @retval          A CHARSET_INFO pointer on success
+  */
+  CHARSET_INFO *get_context_collation(const char *name, myf my_flags)
+  {
+    return get_exact_collation_by_context_name(&my_charset_utf8mb4_general_ci,
+                                               name, my_flags);
+  }
+
+  /**
+    Get an exact CHARSET_INFO by a contextually typed collation name.
+    The name looked up is "<cs->cs_name>_<name>".
+    @param cs        Character set whose name is used as the prefix
+    @param name      Context collation name (the part after the prefix)
+    @param my_flags  mysys flags, passed through to get_exact_collation()
+    @retval          NULL on error (e.g. not found)
+    @retval          A CHARSET_INFO pointer on success
+  */
+  CHARSET_INFO *get_exact_collation_by_context_name(CHARSET_INFO *cs,
+                                                    const char *name,
+                                                    myf my_flags)
+  {
+    char tmp[MY_CS_COLLATION_NAME_SIZE];
+    my_snprintf(tmp, sizeof(tmp), "%s_%s", cs->cs_name.str, name);
+    return get_exact_collation(tmp, my_flags);
+  }
+
+  /*
+    Find a collation with binary comparison rules in the character set of "cs"
+  */
+  CHARSET_INFO *get_bin_collation(CHARSET_INFO *cs, myf my_flags)
+  {
+    /*
+      We don't need to handle old_mode=UTF8_IS_UTF8MB3 here.
+      This method assumes that "cs" points to a real character set name.
+      It can be either "utf8mb3" or "utf8mb4". It cannot be "utf8".
+      No thd->get_utf8_flag() flag passed to get_charset_by_csname().
+    */
+    DBUG_ASSERT(cs->cs_name.length !=4 || memcmp(cs->cs_name.str, "utf8", 4));
+    /*
+      CREATE TABLE t1 (a CHAR(10) BINARY)
+        CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+      Nothing to do, we have the binary collation already.
+    */
+    if (cs->state & MY_CS_BINSORT)
+      return cs;
+
+    // CREATE TABLE t1 (a CHAR(10) BINARY) CHARACTER SET utf8mb4;
+    error[0]= '\0'; // Needed in case of a second execution
+    return get_charset(cs->cs_name.str, MY_CS_BINSORT, my_flags);
+  }
+
+  /*
+    Find the default collation in the given character set
+  */
+  CHARSET_INFO *get_default_collation(CHARSET_INFO *cs, myf my_flags)
+  {
+    // See comments in get_bin_collation()
+    DBUG_ASSERT(cs->cs_name.length !=4 || memcmp(cs->cs_name.str, "utf8", 4));
+    /*
+      CREATE TABLE t1 (a CHAR(10) COLLATE DEFAULT) CHARACTER SET utf8mb4;
+      Nothing to do, we have the default collation already.
+    */
+    if (cs->state & MY_CS_PRIMARY)
+      return cs;
+    /*
+      CREATE TABLE t1 (a CHAR(10) COLLATE DEFAULT)
+        CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+
+      Don't need to handle old_mode=UTF8_IS_UTF8MB3 here.
+      See comments in get_bin_collation().
+    */
+    cs= get_charset(cs->cs_name.str, MY_CS_PRIMARY, my_flags);
+    DBUG_ASSERT(cs);
+    return cs;
+  }
+};
+
+#endif /*__cplusplus */
+
+
 #endif /* _my_sys_h */
diff --git a/include/sslopt-longopts.h b/include/sslopt-longopts.h
index d0278a1645d..b6983b2e718 100644
--- a/include/sslopt-longopts.h
+++ b/include/sslopt-longopts.h
@@ -21,7 +21,7 @@
 
   {"ssl", OPT_SSL_SSL,
    "Enable SSL for connection (automatically enabled with other flags).",
-   &opt_use_ssl, &opt_use_ssl, 0, GET_BOOL, OPT_ARG, 0, 0, 0, 0, 0, 0},
+   &opt_use_ssl, &opt_use_ssl, 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
   {"ssl-ca", OPT_SSL_CA,
    "CA file in PEM format (check OpenSSL docs, implies --ssl).",
    &opt_ssl_ca, &opt_ssl_ca, 0, GET_STR, REQUIRED_ARG,
diff --git a/include/sslopt-vars.h b/include/sslopt-vars.h
index e28f19b919d..d263e5dbd90 100644
--- a/include/sslopt-vars.h
+++ b/include/sslopt-vars.h
@@ -22,7 +22,7 @@
 #else
 #define SSL_STATIC static
 #endif
-SSL_STATIC my_bool opt_use_ssl   = 0;
+SSL_STATIC my_bool opt_use_ssl   = 1;
 SSL_STATIC char *opt_ssl_ca      = 0;
 SSL_STATIC char *opt_ssl_capath  = 0;
 SSL_STATIC char *opt_ssl_cert    = 0;