Merge

author: Igor Babaev <igor@askmonty.org> 2010-12-24 16:24:20 -0800
committer: Igor Babaev <igor@askmonty.org> 2010-12-24 16:24:20 -0800
commit: 1eb5e0e60388f460ab2c4d216a98f4088d9997b9 (patch)
tree: cb1fb6fd01bc2bfbc5a38b45c4ebd28e9e6c6675 /sql
parent: f51a26885faa0106286e63fd15f18dae6f618508 (diff)
parent: a095346a9d32ca583d211d07075e805881fdb4e5 (diff)
download: mariadb-git-1eb5e0e60388f460ab2c4d216a98f4088d9997b9.tar.gz
5 files changed, 399 insertions, 15 deletions
diff --git a/sql/field.h b/sql/field.h
index fc43c044d87..b4dab7561ef 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -593,6 +593,7 @@ public:
 
   /* Check whether the field can be used as a join attribute in hash join */
   virtual bool hash_join_is_possible() { return TRUE; }
+  virtual bool eq_cmp_as_binary() { return TRUE; }
 
   friend bool reopen_table(THD *,struct st_table *,bool);
   friend int cre_myisam(char * name, register TABLE *form, uint options,
@@ -769,12 +770,7 @@ public:
   my_decimal *val_decimal(my_decimal *);
   virtual bool str_needs_quotes() { return TRUE; }
   uint is_equal(Create_field *new_field);
-
-  bool hash_join_is_possible()
-  {
-    /* TODO: support hash joins for non-binary collations */
-    return (flags & BINARY_FLAG);
-  }
+  bool eq_cmp_as_binary() { return test(flags & BINARY_FLAG); }
 };
 
 
diff --git a/sql/key.cc b/sql/key.cc
index 1d27fdcf208..0de1d1fd642 100644
--- a/sql/key.cc
+++ b/sql/key.cc
@@ -618,3 +618,251 @@ int key_tuple_cmp(KEY_PART_INFO *part, uchar *key1, uchar *key2,
 }
 
 
+/**
+  Get hash value for the key from a key buffer 
+
+  @param  key_info       the key descriptor
+  @param  used_key_part  number of key parts used for the key
+  @param  key            pointer to the buffer with the key value
+
+  @datails
+  When hashing we should take special care only of:
+  1. NULLs (and keyparts which can be null so one byte reserved for it);
+  2. Strings for which we have to take into account their collations
+  and the values of their lengths in the prefixes.
+
+  @return  hash value calculated for the key
+*/
+
+ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key)
+{
+  ulong nr=1, nr2=4;
+  KEY_PART_INFO *key_part= key_info->key_part;
+  KEY_PART_INFO *end_key_part= key_part + used_key_parts;
+
+  for (; key_part < end_key_part; key_part++)
+  {
+    uchar *pos= (uchar*)key;
+    CHARSET_INFO *cs;
+    uint length, pack_length;
+    bool is_string= TRUE;
+    LINT_INIT(cs);
+    key+= key_part->length;
+    if (key_part->null_bit)
+    {
+      key++;                       /* Skip null byte */
+      if (*pos)                    /* Found null */
+      {
+        nr^= (nr << 1) | 1;
+        /* Add key pack length to key for VARCHAR segments */
+        switch (key_part->type) {
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARBINARY1:
+          key++;
+          break;
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY2:
+          key+= 2;
+          break;
+        default:
+          ;
+        }
+    continue;
+      }
+      pos++;                       /* Skip null byte */
+    }
+    /* If it is string set parameters of the string */
+    switch (key_part->type) {
+    case HA_KEYTYPE_TEXT:
+      cs= key_part->field->charset();
+      length= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_BINARY :
+      cs= &my_charset_bin;
+      length= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_VARTEXT1:
+      cs= key_part->field->charset();
+      length= (uint)(pos[0]);
+      pack_length= 1;
+      break;
+    case HA_KEYTYPE_VARBINARY1:
+      cs= &my_charset_bin;
+      length= (uint)(pos[0]);
+      pack_length= 1;
+      break;
+    case HA_KEYTYPE_VARTEXT2:
+      cs= key_part->field->charset();
+      length= uint2korr(pos);
+      pack_length= 2;
+      break;
+    case HA_KEYTYPE_VARBINARY2:
+      cs= &my_charset_bin;
+      length= uint2korr(pos);
+      pack_length= 2;
+      break;
+    default:
+      is_string= FALSE;
+    }
+
+    if (is_string)
+    {
+      if (cs->mbmaxlen > 1)
+      {
+        uint char_length= my_charpos(cs, pos + pack_length,
+                                     pos + pack_length + length,
+                                     length / cs->mbmaxlen);
+        set_if_smaller(length, char_length);
+      }
+      cs->coll->hash_sort(cs, pos+pack_length, length, &nr, &nr2);
+      key+= pack_length;
+    }
+    else
+    {
+      for (; pos < (uchar*)key ; pos++)
+      {
+        nr^=(ulong) ((((uint) nr & 63)+nr2)*((uint) *pos)) + (nr << 8);
+        nr2+=3;
+      }
+    }
+  }
+  DBUG_PRINT("exit", ("hash: %lx", nr));
+  return(nr);
+}
+
+
+/**
+  Check whether two keys in the key buffers are equal
+
+  @param key_info        the key descriptor
+  @param  used_key_part  number of key parts used for the keys
+  @param key1            pointer to the buffer with the first key 
+  @param key2            pointer to the buffer with the second key 
+
+  @detail See details of key_hashnr().
+
+  @retval TRUE  keys in the buffers are NOT equal
+  @retval FALSE keys in the buffers are equal
+*/
+
+bool key_buf_cmp(KEY *key_info, uint used_key_parts,
+                 const uchar *key1, const uchar *key2)
+{
+  KEY_PART_INFO *key_part= key_info->key_part;
+  KEY_PART_INFO *end_key_part= key_part + used_key_parts;
+
+  for (; key_part < end_key_part; key_part++)
+  {
+    uchar *pos1= (uchar*)key1;
+    uchar *pos2= (uchar*)key2;
+    CHARSET_INFO *cs;
+    uint length1, length2, pack_length;
+    bool is_string= TRUE;
+    LINT_INIT(cs);
+    key1+= key_part->length;
+    key2+= key_part->length;
+    if (key_part->null_bit)
+    {
+      key1++; key2++;                           /* Skip null byte */
+      if (*pos1 && *pos2)                       /* Both are null */
+      {
+        /* Add key pack length to key for VARCHAR segments */
+        switch (key_part->type) {
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARBINARY1:
+          key1++; key2++;
+          break;
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY2:
+          key1+= 2; key2+= 2;
+          break;
+        default:
+          ;
+        }
+    continue;
+      }
+      if (*pos1 != *pos2)
+        return FALSE;
+      pos1++; pos2++;
+    }
+
+    /* If it is string set parameters of the string */
+    switch (key_part->type) {
+    case HA_KEYTYPE_TEXT:
+      cs= key_part->field->charset();
+      length1= length2= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_BINARY :
+      cs= &my_charset_bin;
+      length1= length2= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_VARTEXT1:
+      cs= key_part->field->charset();
+      length1= (uint)(pos1[0]);
+      length2= (uint)(pos2[0]);
+      pack_length= 1;
+      break;
+    case HA_KEYTYPE_VARBINARY1:
+      cs= &my_charset_bin;
+      length1= (uint)(pos1[0]);
+      length2= (uint)(pos2[0]);
+      pack_length= 1;
+      break;
+    case HA_KEYTYPE_VARTEXT2:
+      cs= key_part->field->charset();
+      length1= uint2korr(pos1);
+      length2= uint2korr(pos2);
+      pack_length= 2;
+      break;
+    case HA_KEYTYPE_VARBINARY2:
+      cs= &my_charset_bin;
+      length1= uint2korr(pos1);
+      length2= uint2korr(pos2);
+      pack_length= 2;
+      break;
+    default:
+      is_string= FALSE;
+    }
+
+    if (is_string)
+    {
+      /*
+        Compare the strings taking into account length in characters
+        and collation
+      */
+      uint byte_len1= length1, byte_len2= length2;
+      if (cs->mbmaxlen > 1)
+      {
+        uint char_length1= my_charpos(cs, pos1 + pack_length,
+                                      pos1 + pack_length + length1,
+                                      length1 / cs->mbmaxlen);
+        uint char_length2= my_charpos(cs, pos2 + pack_length,
+                                      pos2 + pack_length + length2,
+                                      length2 / cs->mbmaxlen);
+        set_if_smaller(length1, char_length1);
+        set_if_smaller(length2, char_length2);
+      }
+      if (length1 != length2 ||
+          cs->coll->strnncollsp(cs,
+                                pos1 + pack_length, byte_len1,
+                                pos2 + pack_length, byte_len2,
+                                1))
+        return TRUE;
+      key1+= pack_length; key2+= pack_length;
+    }
+    else
+    {
+      /* it is OK to compare non-string byte per byte */
+      for (; pos1 < (uchar*)key1 ; pos1++, pos2++)
+      {
+        if (pos1[0] != pos2[0])
+          return TRUE;
+      }
+    }
+  }
+  return FALSE;
+}
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index 122baf47bac..ca5ccc699cd 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -1883,6 +1883,9 @@ bool key_cmp_if_same(TABLE *form,const uchar *key,uint index,uint key_length);
 void key_unpack(String *to,TABLE *form,uint index);
 bool is_key_used(TABLE *table, uint idx, const MY_BITMAP *fields);
 int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length);
+ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key);
+bool key_buf_cmp(KEY *key_info, uint used_key_parts,
+                 const uchar *key1, const uchar *key2);
 extern "C" int key_rec_cmp(void *key_info, uchar *a, uchar *b);
 int key_tuple_cmp(KEY_PART_INFO *part, uchar *key1, uchar *key2, uint tuple_length);
 
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
index 2c28aae781c..01635c56585 100644
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -2520,6 +2520,24 @@ int JOIN_CACHE_HASHED::init()
   pack_length+= get_size_of_rec_offset(); 
   pack_length_with_blob_ptrs+= get_size_of_rec_offset();
 
+  ref_key_info= join_tab->table->key_info+join_tab->ref.key;
+  ref_used_key_parts= join_tab->ref.key_parts;
+
+  hash_func= &JOIN_CACHE_HASHED::get_hash_idx_simple;
+  hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_simple;
+
+  KEY_PART_INFO *key_part= ref_key_info->key_part;
+  KEY_PART_INFO *key_part_end= key_part+ref_used_key_parts;
+  for ( ; key_part < key_part_end; key_part++)
+  {
+    if (!key_part->field->eq_cmp_as_binary())
+    {
+      hash_func= &JOIN_CACHE_HASHED::get_hash_idx_complex;
+      hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_complex;
+      break;
+    }
+  }
+      
   init_hash_table();
 
   rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
@@ -2904,7 +2922,7 @@ bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
                                    uchar **key_ref_ptr) 
 {
   bool is_found= FALSE;
-  uint idx= get_hash_idx(key, key_length);
+  uint idx= (this->*hash_func)(key, key_length);
   uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
   while (!is_null_key_ref(ref_ptr))
   {
@@ -2913,7 +2931,7 @@ bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
     next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
                             ref_ptr-key_length;
 
-    if (memcmp(next_key, key, key_len) == 0)
+    if ((this->*hash_cmp_func)(next_key, key, key_len))
     {
       is_found= TRUE;
       break;
@@ -2925,22 +2943,24 @@ bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
 
 
 /* 
-  Calclulate hash value for a key in the hash table of the join buffer
+  Hash function that considers a key in the hash table as byte array
 
   SYNOPSIS
-    get_hash_idx()
+    get_hash_idx_simple()
       key             pointer to the key value
       key_len         key value length
       
   DESCRIPTION
     The function calculates an index of the hash entry in the hash table
-    of the join buffer for the given key  
+    of the join buffer for the given key. It considers the key just as
+    a sequence of bytes of the length key_len.
 
   RETURN VALUE
-    the calculated index of the hash entry for the given key.  
+    the calculated index of the hash entry for the given key  
 */
 
-uint JOIN_CACHE_HASHED::get_hash_idx(uchar* key, uint key_len)
+inline
+uint JOIN_CACHE_HASHED::get_hash_idx_simple(uchar* key, uint key_len)
 {
   ulong nr= 1;
   ulong nr2= 4;
@@ -2956,6 +2976,93 @@ uint JOIN_CACHE_HASHED::get_hash_idx(uchar* key, uint key_len)
 
 
 /* 
+  Hash function that takes into account collations of the components of the key  
+
+  SYNOPSIS
+    get_hash_idx_complex()
+      key             pointer to the key value
+      key_len         key value length
+      
+  DESCRIPTION
+    The function calculates an index of the hash entry in the hash table
+    of the join buffer for the given key. It takes into account that the
+    components of the key may be of a varchar type with different collations.
+    The function guarantees that the same hash value for any two equal
+    keys that may differ as byte sequences.
+    The function takes the info about the components of the key, their
+    types and used collations from the class member ref_key_info containing
+    a pointer to the descriptor of the index that can be used for the join
+    operation.
+
+  RETURN VALUE
+    the calculated index of the hash entry for the given key  
+*/
+
+inline
+uint JOIN_CACHE_HASHED::get_hash_idx_complex(uchar *key, uint key_len)
+{
+  return 
+    (uint) (key_hashnr(ref_key_info, ref_used_key_parts, key) % hash_entries);
+}
+
+
+/* 
+  Compare two key entries in the hash table as sequence of bytes
+
+  SYNOPSIS
+    equal_keys_simple()
+      key1            pointer to the first key entry
+      key2            pointer to the second key entry 
+      key_len         the length of the key values
+      
+  DESCRIPTION
+    The function compares two key entries in the hash table key1 and key2
+    as two sequences bytes of the length key_len
+
+  RETURN VALUE
+    TRUE       key1 coincides with key2
+    FALSE      otherwise
+*/
+
+inline
+bool JOIN_CACHE_HASHED::equal_keys_simple(uchar *key1, uchar *key2,
+                                          uint key_len)
+{
+  return memcmp(key1, key2, key_len) == 0;
+}
+
+
+/* 
+  Compare two key entries taking into account the used collation
+
+  SYNOPSIS
+    equal_keys_complex()
+      key1            pointer to the first key entry
+      key2            pointer to the second key entry 
+      key_len         the length of the key values
+      
+  DESCRIPTION
+    The function checks whether two key entries in the hash table
+    key1 and key2 are equal as, possibly, compound keys of a certain
+    structure whose components may be of a varchar type and may
+    employ different collations.
+    The descriptor of the key structure is taken from the class
+    member ref_key_info.
+
+  RETURN VALUE
+    TRUE       key1 is equal tokey2
+    FALSE      otherwise
+*/
+
+inline
+bool JOIN_CACHE_HASHED::equal_keys_complex(uchar *key1, uchar *key2,
+                                          uint key_len)
+{
+  return key_buf_cmp(ref_key_info, ref_used_key_parts, key1, key2) == 0;
+}
+
+
+/* 
   Clean up the hash table of the join buffer
 
   SYNOPSIS
diff --git a/sql/sql_join_cache.h b/sql/sql_join_cache.h
index fccd32e1319..546067fad24 100644
--- a/sql/sql_join_cache.h
+++ b/sql/sql_join_cache.h
@@ -738,6 +738,10 @@ public:
 class JOIN_CACHE_HASHED: public JOIN_CACHE
 {
 
+  typedef uint (JOIN_CACHE_HASHED::*Hash_func) (uchar *key, uint key_len);
+  typedef bool (JOIN_CACHE_HASHED::*Hash_cmp_func) (uchar *key1, uchar *key2,
+                                                    uint key_len);
+  
 private:
 
   /* Size of the offset of a key entry in the hash table */
@@ -761,8 +765,12 @@ private:
 
   /* The offset of the data fields from the beginning of the record fields */
   uint data_fields_offset;
-  
-  uint get_hash_idx(uchar* key, uint key_len);
+
+  inline uint get_hash_idx_simple(uchar *key, uint key_len);
+  inline uint get_hash_idx_complex(uchar *key, uint key_len);
+
+  inline bool equal_keys_simple(uchar *key1, uchar *key2, uint key_len);
+  inline bool equal_keys_complex(uchar *key1, uchar *key2, uint key_len);
 
   int init_hash_table();
   void cleanup_hash_table();
@@ -770,6 +778,28 @@ private:
 protected:
 
   /* 
+    Index info on the TABLE_REF object used by the hash join
+    to look for matching records
+  */    
+  KEY *ref_key_info;
+  /* 
+    Number of the key parts the TABLE_REF object used by the hash join
+    to look for matching records
+  */    
+  uint ref_used_key_parts;
+
+  /*
+    The hash function used in the hash table,
+    usually set by the init() method
+  */ 
+  Hash_func hash_func;
+  /*
+    The function to check whether two key entries in the hash table
+    are equal or not, usually set by the init() method
+  */ 
+  Hash_cmp_func hash_cmp_func;
+
+  /* 
     Length of a key value.
     It is assumed that all key values have the same length.
   */
author	Igor Babaev <igor@askmonty.org>	2010-12-24 16:24:20 -0800
committer	Igor Babaev <igor@askmonty.org>	2010-12-24 16:24:20 -0800
commit	1eb5e0e60388f460ab2c4d216a98f4088d9997b9 (patch)
tree	cb1fb6fd01bc2bfbc5a38b45c4ebd28e9e6c6675 /sql
parent	f51a26885faa0106286e63fd15f18dae6f618508 (diff)
parent	a095346a9d32ca583d211d07075e805881fdb4e5 (diff)
download	mariadb-git-1eb5e0e60388f460ab2c4d216a98f4088d9997b9.tar.gz