Bug#37721: ORDER BY when WHERE contains non-partitioned

index column There was actually two problems 1) when clustered pk, order by non pk index should also compare with pk as last resort to differ keys from each other 2) bug in the index search handling in ha_partition (was found when extending the test case Solution to 1 was to include the pk in key compare if clustered pk and search on other index. Solution for 2 was to remove the optimization from ordered scan to unordered scan if clustered pk.
author: Mattias Jonsson <mattias.jonsson@sun.com> 2008-10-10 12:01:01 +0200
committer: Mattias Jonsson <mattias.jonsson@sun.com> 2008-10-10 12:01:01 +0200
commit: 44630e09ee7c86e8e2a78a434b59121049c3bebc (patch)
tree: 26c18991c89e887e3a6c706f648f8e6be6a7803f
parent: e300184c0179dc0b668afe085d175e8ce39f6609 (diff)
download: mariadb-git-44630e09ee7c86e8e2a78a434b59121049c3bebc.tar.gz
5 files changed, 237 insertions, 78 deletions
diff --git a/mysql-test/r/partition_innodb.result b/mysql-test/r/partition_innodb.result
index 6e56f9023eb..c6d74238d31 100644
--- a/mysql-test/r/partition_innodb.result
+++ b/mysql-test/r/partition_innodb.result
@@ -1,3 +1,68 @@
+# Bug#37721, test of ORDER BY on PK and WHERE on INDEX
+CREATE TABLE t1 (
+a INT,
+b INT,
+PRIMARY KEY (a),
+INDEX (b))
+ENGINE InnoDB
+PARTITION BY HASH(a)
+PARTITIONS 3;
+INSERT INTO t1 VALUES (0,0),(4,0),(2,0);
+SELECT a FROM t1 WHERE b = 0 ORDER BY a ASC;
+a
+0
+2
+4
+SELECT a FROM t1 WHERE b = 0 ORDER BY a DESC;
+a
+4
+2
+0
+ALTER TABLE t1 DROP INDEX b;
+SELECT a FROM t1 WHERE b = 0 ORDER BY a ASC;
+a
+0
+2
+4
+SELECT a FROM t1 WHERE b = 0 ORDER BY a DESC;
+a
+4
+2
+0
+DROP TABLE t1;
+CREATE TABLE t1 (
+a VARCHAR(600),
+b VARCHAR(600),
+PRIMARY KEY (a),
+INDEX (b))
+ENGINE InnoDB
+PARTITION BY KEY(a)
+PARTITIONS 3;
+INSERT INTO t1 VALUES (concat(repeat('MySQL',100),'1'),repeat('0',257));
+INSERT INTO t1 VALUES (concat(repeat('MySQL',100),'3'),repeat('0',257));
+INSERT INTO t1 VALUES (concat(repeat('MySQL',100),'2'),repeat('0',257));
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a ASC;
+right(a,1)
+1
+2
+3
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a DESC;
+right(a,1)
+3
+2
+1
+ALTER TABLE t1 DROP INDEX b;
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a ASC;
+right(a,1)
+1
+2
+3
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a DESC;
+right(a,1)
+3
+2
+1
+DROP TABLE t1;
 # Bug#32948
 CREATE TABLE t1 (c1 INT, PRIMARY KEY (c1)) ENGINE=INNODB;
 CREATE TABLE t2 (c1 INT, PRIMARY KEY (c1),
diff --git a/mysql-test/t/partition_innodb.test b/mysql-test/t/partition_innodb.test
index 932855cc877..c29b3458d19 100644
--- a/mysql-test/t/partition_innodb.test
+++ b/mysql-test/t/partition_innodb.test
@@ -1,6 +1,46 @@
 --source include/have_partition.inc
 --source include/have_innodb.inc
 
+#
+# Bug37721: ORDER BY when WHERE contains non-partitioned index column
+# wrong order since it did not use pk as second compare
+--echo # Bug#37721, test of ORDER BY on PK and WHERE on INDEX
+CREATE TABLE t1 (
+  a INT,
+  b INT,
+  PRIMARY KEY (a),
+  INDEX (b))
+ENGINE InnoDB
+PARTITION BY HASH(a)
+PARTITIONS 3;
+# This will give the middle partition the highest value
+INSERT INTO t1 VALUES (0,0),(4,0),(2,0);
+SELECT a FROM t1 WHERE b = 0 ORDER BY a ASC;
+SELECT a FROM t1 WHERE b = 0 ORDER BY a DESC;
+ALTER TABLE t1 DROP INDEX b;
+SELECT a FROM t1 WHERE b = 0 ORDER BY a ASC;
+SELECT a FROM t1 WHERE b = 0 ORDER BY a DESC;
+DROP TABLE t1;
+CREATE TABLE t1 (
+  a VARCHAR(600),
+  b VARCHAR(600),
+  PRIMARY KEY (a),
+  INDEX (b))
+ENGINE InnoDB
+PARTITION BY KEY(a)
+PARTITIONS 3;
+# This will give the middle partition the highest value
+INSERT INTO t1 VALUES (concat(repeat('MySQL',100),'1'),repeat('0',257));
+INSERT INTO t1 VALUES (concat(repeat('MySQL',100),'3'),repeat('0',257));
+INSERT INTO t1 VALUES (concat(repeat('MySQL',100),'2'),repeat('0',257));
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a ASC;
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a DESC;
+ALTER TABLE t1 DROP INDEX b;
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a ASC;
+SELECT right(a,1) FROM t1 WHERE b = repeat('0',257) ORDER BY a DESC;
+DROP TABLE t1;
+
+#
 # Bug#32948 - FKs allowed to reference partitioned table
 #
 -- echo # Bug#32948
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 14e321218ca..bfc672409d0 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -239,7 +239,8 @@ void ha_partition::init_handler_variables()
   m_rec_length= 0;
   m_last_part= 0;
   m_rec0= 0;
-  m_curr_key_info= 0;
+  m_curr_key_info[0]= NULL;
+  m_curr_key_info[1]= NULL;
   /*
     this allows blackhole to work properly
   */
@@ -3604,11 +3605,24 @@ int ha_partition::index_init(uint inx, bool sorted)
   handler **file;
   DBUG_ENTER("ha_partition::index_init");
 
+  DBUG_PRINT("info", ("inx %u sorted %u", inx, sorted));
   active_index= inx;
   m_part_spec.start_part= NO_CURRENT_PART_ID;
   m_start_key.length= 0;
   m_ordered= sorted;
-  m_curr_key_info= table->key_info+inx;
+  m_curr_key_info[0]= table->key_info+inx;
+  if (m_pkey_is_clustered && table->s->primary_key != MAX_KEY)
+  {
+    /*
+      if PK is clustered, then the key cmp must use the pk to
+      differentiate between equal key in given index.
+    */
+    DBUG_PRINT("info", ("Clustered pk, using pk as secondary cmp"));
+    m_curr_key_info[1]= table->key_info+table->s->primary_key;
+    m_curr_key_info[2]= NULL;
+  }
+  else
+    m_curr_key_info[1]= NULL;
   /*
     Some handlers only read fields as specified by the bitmap for the
     read set. For partitioned handlers we always require that the
@@ -3633,9 +3647,13 @@ int ha_partition::index_init(uint inx, bool sorted)
       TODO: handle COUNT(*) queries via unordered scan.
     */
     uint i;
-    for (i= 0; i < m_curr_key_info->key_parts; i++)
-      bitmap_set_bit(table->read_set,
-                     m_curr_key_info->key_part[i].field->field_index);
+    KEY **key_info= m_curr_key_info;
+    do
+    {
+      for (i= 0; i < (*key_info)->key_parts; i++)
+        bitmap_set_bit(table->read_set,
+                       (*key_info)->key_part[i].field->field_index);
+    } while (*(++key_info));
   }
   file= m_file;
   do
@@ -3692,10 +3710,10 @@ int ha_partition::index_end()
   Read one record in an index scan and start an index scan
 
   SYNOPSIS
-    index_read()
+    index_read_map()
     buf                    Read row in MySQL Row Format
     key                    Key parts in consecutive order
-    key_len                Total length of key parts
+    keypart_map            Which part of key is used
     find_flag              What type of key condition is used
 
   RETURN VALUE
@@ -3703,12 +3721,12 @@ int ha_partition::index_end()
     0                  Success
 
   DESCRIPTION
-    index_read starts a new index scan using a start key. The MySQL Server
+    index_read_map starts a new index scan using a start key. The MySQL Server
     will check the end key on its own. Thus to function properly the
     partitioned handler need to ensure that it delivers records in the sort
     order of the MySQL Server.
-    index_read can be restarted without calling index_end on the previous
-    index scan and without calling index_init. In this case the index_read
+    index_read_map can be restarted without calling index_end on the previous
+    index scan and without calling index_init. In this case the index_read_map
     is on the same index as the previous index_scan. This is particularly
     used in conjuntion with multi read ranges.
 */
@@ -3765,11 +3783,15 @@ int ha_partition::common_index_read(uchar *buf, bool have_start_key)
   DBUG_ENTER("ha_partition::common_index_read");
   LINT_INIT(key_len); /* used if have_start_key==TRUE */
 
+  DBUG_PRINT("info", ("m_ordered %u m_ordered_scan_ong %u have_start_key %u",
+                      m_ordered, m_ordered_scan_ongoing, have_start_key));
+
   if (have_start_key)
   {
     m_start_key.length= key_len= calculate_key_len(table, active_index, 
                                                    m_start_key.key,
                                                    m_start_key.keypart_map);
+    DBUG_ASSERT(key_len);
   }
   if ((error= partition_scan_set_up(buf, have_start_key)))
   {
@@ -3784,9 +3806,12 @@ int ha_partition::common_index_read(uchar *buf, bool have_start_key)
     reverse_order= TRUE;
     m_ordered_scan_ongoing= TRUE;
   }
+  DBUG_PRINT("info", ("m_ordered %u m_o_scan_ong %u have_start_key %u",
+                      m_ordered, m_ordered_scan_ongoing, have_start_key));
   if (!m_ordered_scan_ongoing ||
       (have_start_key && m_start_key.flag == HA_READ_KEY_EXACT &&
-       (key_len >= m_curr_key_info->key_length || key_len == 0)))
+       !m_pkey_is_clustered &&
+       key_len >= m_curr_key_info[0]->key_length))
    {
     /*
       We use unordered index scan either when read_range is used and flag
@@ -3799,6 +3824,8 @@ int ha_partition::common_index_read(uchar *buf, bool have_start_key)
       Need to set unordered scan ongoing since we can come here even when
       it isn't set.
     */
+    DBUG_PRINT("info", ("key_len %lu (%lu), doing unordered scan",
+                        key_len, m_curr_key_info[0]->key_length));
     m_ordered_scan_ongoing= FALSE;
     error= handle_unordered_scan_next_partition(buf);
   }
@@ -3900,7 +3927,7 @@ int ha_partition::common_first_last(uchar *buf)
   Read last using key
 
   SYNOPSIS
-    index_read_last()
+    index_read_last_map()
     buf                   Read row in MySQL Row Format
     key                   Key
     keypart_map           Which part of key is used
@@ -4057,7 +4084,7 @@ int ha_partition::read_range_first(const key_range *start_key,
        (end_key->flag == HA_READ_AFTER_KEY) ? -1 : 0);
   }
 
-  range_key_part= m_curr_key_info->key_part;
+  range_key_part= m_curr_key_info[0]->key_part;
   if (start_key)
     m_start_key= *start_key;
   else
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index 685f057dfce..6e52106dfa5 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -74,9 +74,16 @@ private:
   handler **m_added_file;               // Added parts kept for errors
   partition_info *m_part_info;          // local reference to partition
   Field **m_part_field_array;           // Part field array locally to save acc
-  uchar *m_ordered_rec_buffer;           // Row and key buffer for ord. idx scan
-  KEY *m_curr_key_info;                 // Current index
-  uchar *m_rec0;                         // table->record[0]
+  uchar *m_ordered_rec_buffer;          // Row and key buffer for ord. idx scan
+  /*
+    Current index.
+    When used in key_rec_cmp: If clustered pk, index compare
+    must compare pk if given index is same for two rows.
+    So normally m_curr_key_info[0]= current index and m_curr_key[1]= NULL,
+    and if clustered pk, [0]= current index, [1]= pk, [2]= NULL
+  */
+  KEY *m_curr_key_info[3];              // Current index
+  uchar *m_rec0;                        // table->record[0]
   QUEUE m_queue;                        // Prio queue used by sorted read
   /*
     Since the partition handler is a handler on top of other handlers, it
diff --git a/sql/key.cc b/sql/key.cc
index 47e5c5ebdd7..5b2ae8029dd 100644
--- a/sql/key.cc
+++ b/sql/key.cc
@@ -448,84 +448,104 @@ int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length)
 }
 
 
-/*
-  Compare two records in index order
-  SYNOPSIS
-    key_rec_cmp()
-    key                         Index information
-    rec0                        Pointer to table->record[0]
-    first_rec                   Pointer to record compare with
-    second_rec                  Pointer to record compare against first_rec
-
-  DESCRIPTION
-    This method is set-up such that it can be called directly from the
-    priority queue and it is attempted to be optimised as much as possible
-    since this will be called O(N * log N) times while performing a merge
-    sort in various places in the code.
-
-    We retrieve the pointer to table->record[0] using the fact that key_parts
-    have an offset making it possible to calculate the start of the record.
-    We need to get the diff to the compared record since none of the records
-    being compared are stored in table->record[0].
-
-    We first check for NULL values, if there are no NULL values we use
-    a compare method that gets two field pointers and a max length
-    and return the result of the comparison.
+/**
+  Compare two records in index order.
+
+  This method is set-up such that it can be called directly from the
+  priority queue and it is attempted to be optimised as much as possible
+  since this will be called O(N * log N) times while performing a merge
+  sort in various places in the code.
+
+  We retrieve the pointer to table->record[0] using the fact that key_parts
+  have an offset making it possible to calculate the start of the record.
+  We need to get the diff to the compared record since none of the records
+  being compared are stored in table->record[0].
+
+  We first check for NULL values, if there are no NULL values we use
+  a compare method that gets two field pointers and a max length
+  and return the result of the comparison.
+
+  key is a null terminated array, since in some cases (clustered
+  primary key) it must compare more than one index.
+
+  @param key                    Null terminated array of index information
+  @param first_rec              Pointer to record compare with
+  @param second_rec             Pointer to record compare against first_rec
+
+  @return Return value is SIGN(first_rec - second_rec)
+    @retval  0                  Keys are equal
+    @retval -1                  second_rec is greater than first_rec
+    @retval +1                  first_rec is greater than second_rec
 */
 
-int key_rec_cmp(void *key, uchar *first_rec, uchar *second_rec)
+int key_rec_cmp(void *key_p, uchar *first_rec, uchar *second_rec)
 {
-  KEY *key_info= (KEY*)key;
-  uint key_parts= key_info->key_parts, i= 0;
+  KEY **key= (KEY**) key_p;
+  KEY *key_info= *(key++);                     // Start with first key
+  uint key_parts, key_part_num;
   KEY_PART_INFO *key_part= key_info->key_part;
   uchar *rec0= key_part->field->ptr - key_part->offset;
   my_ptrdiff_t first_diff= first_rec - rec0, sec_diff= second_rec - rec0;
   int result= 0;
+  Field *field;
   DBUG_ENTER("key_rec_cmp");
 
+  /* loop over all given keys */
   do
   {
-    Field *field= key_part->field;
+    key_parts= key_info->key_parts;
+    key_part= key_info->key_part;
+    key_part_num= 0;
 
-    if (key_part->null_bit)
+    /* loop over every key part */
+    do
     {
-      /* The key_part can contain NULL values */
-      bool first_is_null= field->is_null_in_record_with_offset(first_diff);
-      bool sec_is_null= field->is_null_in_record_with_offset(sec_diff);
-      /*
-        NULL is smaller then everything so if first is NULL and the other
-        not then we know that we should return -1 and for the opposite
-        we should return +1. If both are NULL then we call it equality
-        although it is a strange form of equality, we have equally little
-        information of the real value.
-      */
-      if (!first_is_null)
+      field= key_part->field;
+
+      if (key_part->null_bit)
       {
-        if (!sec_is_null)
-          ; /* Fall through, no NULL fields */
-        else
+        /* The key_part can contain NULL values */
+        bool first_is_null= field->is_null_in_record_with_offset(first_diff);
+        bool sec_is_null= field->is_null_in_record_with_offset(sec_diff);
+        /*
+          NULL is smaller then everything so if first is NULL and the other
+          not then we know that we should return -1 and for the opposite
+          we should return +1. If both are NULL then we call it equality
+          although it is a strange form of equality, we have equally little
+          information of the real value.
+        */
+        if (!first_is_null)
         {
-          DBUG_RETURN(+1);
+          if (!sec_is_null)
+            ; /* Fall through, no NULL fields */
+          else
+          {
+            DBUG_RETURN(+1);
+          }
         }
+        else if (!sec_is_null)
+        {
+          DBUG_RETURN(-1);
+        }
+        else
+          goto next_loop; /* Both were NULL */
       }
-      else if (!sec_is_null)
-      {
-        DBUG_RETURN(-1);
-      }
-      else
-        goto next_loop; /* Both were NULL */
-    }
-    /*
-      No null values in the fields
-      We use the virtual method cmp_max with a max length parameter.
-      For most field types this translates into a cmp without
-      max length. The exceptions are the BLOB and VARCHAR field types
-      that take the max length into account.
-    */
-    result= field->cmp_max(field->ptr+first_diff, field->ptr+sec_diff,
-                           key_part->length);
+      /*
+        No null values in the fields
+        We use the virtual method cmp_max with a max length parameter.
+        For most field types this translates into a cmp without
+        max length. The exceptions are the BLOB and VARCHAR field types
+        that take the max length into account.
+      */
+      if ((result= field->cmp_max(field->ptr+first_diff, field->ptr+sec_diff,
+                             key_part->length)))
+        DBUG_RETURN(result);
 next_loop:
-    key_part++;
-  } while (!result && ++i < key_parts);
-  DBUG_RETURN(result);
+      key_part++;
+      key_part_num++;
+    } while (key_part_num < key_parts); /* this key is done */
+
+    key_info= *(key++);
+  } while (key_info); /* no more keys to test */
+  DBUG_RETURN(0);
 }
author	Mattias Jonsson <mattias.jonsson@sun.com>	2008-10-10 12:01:01 +0200
committer	Mattias Jonsson <mattias.jonsson@sun.com>	2008-10-10 12:01:01 +0200
commit	44630e09ee7c86e8e2a78a434b59121049c3bebc (patch)
tree	26c18991c89e887e3a6c706f648f8e6be6a7803f
parent	e300184c0179dc0b668afe085d175e8ce39f6609 (diff)
download	mariadb-git-44630e09ee7c86e8e2a78a434b59121049c3bebc.tar.gz