summaryrefslogtreecommitdiff
path: root/sql/handler.h
diff options
context:
space:
mode:
authorSergei Golubchik <sergii@pisem.net>2011-10-19 21:45:18 +0200
committerSergei Golubchik <sergii@pisem.net>2011-10-19 21:45:18 +0200
commit76f0b94bb0b2994d639353530c5b251d0f1a204b (patch)
tree9ed50628aac34f89a37637bab2fc4915b86b5eb4 /sql/handler.h
parent4e46d8e5bff140f2549841167dc4b65a3c0a645d (diff)
parent5dc1a2231f55bacc9aaf0e24816f3d9c2ee1f21d (diff)
downloadmariadb-git-76f0b94bb0b2994d639353530c5b251d0f1a204b.tar.gz
merge with 5.3
sql/sql_insert.cc: CREATE ... IF NOT EXISTS may do nothing, but it is still not a failure. don't forget to my_ok it. ****** CREATE ... IF NOT EXISTS may do nothing, but it is still not a failure. don't forget to my_ok it. sql/sql_table.cc: small cleanup ****** small cleanup
Diffstat (limited to 'sql/handler.h')
-rw-r--r--sql/handler.h328
1 files changed, 285 insertions, 43 deletions
diff --git a/sql/handler.h b/sql/handler.h
index c64859bafc3..b27c897364b 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1,7 +1,7 @@
#ifndef HANDLER_INCLUDED
#define HANDLER_INCLUDED
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
- Copyright (c) 2010-2011 Monty Program Ab
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+ Copyright (c) 2009-2011 Monty Program Ab
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@@ -170,8 +170,9 @@
/* Has automatic checksums and uses the new checksum format */
#define HA_HAS_NEW_CHECKSUM (LL(1) << 38)
-
-#define HA_MRR_CANT_SORT (LL(1) << 39)
+#define HA_CAN_VIRTUAL_COLUMNS (LL(1) << 39)
+#define HA_MRR_CANT_SORT (LL(1) << 40)
+#define HA_RECORD_MUST_BE_CLEAN_ON_WRITE (LL(1) << 41)
/*
Set of all binlog flags. Currently only contain the capabilities
@@ -193,8 +194,11 @@
*/
#define HA_KEY_SCAN_NOT_ROR 128
#define HA_DO_INDEX_COND_PUSHDOWN 256 /* Supports Index Condition Pushdown */
-
-
+/*
+ Data is clustered on this key. This means that when you read the key
+ you also get the row data without any additional disk reads.
+*/
+#define HA_CLUSTERED_INDEX 512
/*
bits in alter_table_flags:
@@ -687,6 +691,11 @@ struct handler_log_file_data {
See ha_example.cc for an example.
*/
+
+struct ha_table_option_struct;
+struct ha_field_option_struct;
+struct ha_index_option_struct;
+
enum ha_option_type { HA_OPTION_TYPE_ULL, /* unsigned long long */
HA_OPTION_TYPE_STRING, /* char * */
HA_OPTION_TYPE_ENUM, /* uint */
@@ -859,12 +868,113 @@ struct handlerton
NOTE 'all' is also false in auto-commit mode where 'end of statement'
and 'real commit' mean the same event.
*/
- int (*commit)(handlerton *hton, THD *thd, bool all);
+ int (*commit)(handlerton *hton, THD *thd, bool all);
+ /*
+ The commit_ordered() method is called prior to the commit() method, after
+ the transaction manager has decided to commit (not rollback) the
+ transaction. Unlike commit(), commit_ordered() is called only when the
+ full transaction is committed, not for each commit of statement
+ transaction in a multi-statement transaction.
+
+ Not that like prepare(), commit_ordered() is only called when 2-phase
+ commit takes place. Ie. when no binary log and only a single engine
+ participates in a transaction, one commit() is called, no
+ commit_ordered(). So engines must be prepared for this.
+
+ The calls to commit_ordered() in multiple parallel transactions is
+ guaranteed to happen in the same order in every participating
+ handler. This can be used to ensure the same commit order among multiple
+ handlers (eg. in table handler and binlog). So if transaction T1 calls
+ into commit_ordered() of handler A before T2, then T1 will also call
+ commit_ordered() of handler B before T2.
+
+ Engines that implement this method should during this call make the
+ transaction visible to other transactions, thereby making the order of
+ transaction commits be defined by the order of commit_ordered() calls.
+
+ The intention is that commit_ordered() should do the minimal amount of
+ work that needs to happen in consistent commit order among handlers. To
+ preserve ordering, calls need to be serialised on a global mutex, so
+ doing any time-consuming or blocking operations in commit_ordered() will
+ limit scalability.
+
+ Handlers can rely on commit_ordered() calls to be serialised (no two
+ calls can run in parallel, so no extra locking on the handler part is
+ required to ensure this).
+
+ Note that commit_ordered() can be called from a different thread than the
+ one handling the transaction! So it can not do anything that depends on
+ thread local storage, in particular it can not call my_error() and
+ friends (instead it can store the error code and delay the call of
+ my_error() to the commit() method).
+
+ Similarly, since commit_ordered() returns void, any return error code
+ must be saved and returned from the commit() method instead.
+
+ The commit_ordered method is optional, and can be left unset if not
+ needed in a particular handler (then there will be no ordering guarantees
+ wrt. other engines and binary log).
+ */
+ void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
int (*rollback)(handlerton *hton, THD *thd, bool all);
int (*prepare)(handlerton *hton, THD *thd, bool all);
+ /*
+ The prepare_ordered method is optional. If set, it will be called after
+ successful prepare() in all handlers participating in 2-phase
+ commit. Like commit_ordered(), it is called only when the full
+ transaction is committed, not for each commit of statement transaction.
+
+ The calls to prepare_ordered() among multiple parallel transactions are
+ ordered consistently with calls to commit_ordered(). This means that
+ calls to prepare_ordered() effectively define the commit order, and that
+ each handler will see the same sequence of transactions calling into
+ prepare_ordered() and commit_ordered().
+
+ Thus, prepare_ordered() can be used to define commit order for handlers
+ that need to do this in the prepare step (like binlog). It can also be
+ used to release transaction's locks early in an order consistent with the
+ order transactions will be eventually committed.
+
+ Like commit_ordered(), prepare_ordered() calls are serialised to maintain
+ ordering, so the intention is that they should execute fast, with only
+ the minimal amount of work needed to define commit order. Handlers can
+ rely on this serialisation, and do not need to do any extra locking to
+ avoid two prepare_ordered() calls running in parallel.
+
+ Like commit_ordered(), prepare_ordered() is not guaranteed to be called
+ in the context of the thread handling the rest of the transaction. So it
+ cannot invoke code that relies on thread local storage, in particular it
+ cannot call my_error().
+
+ prepare_ordered() cannot cause a rollback by returning an error, all
+ possible errors must be handled in prepare() (the prepare_ordered()
+ method returns void). In case of some fatal error, a record of the error
+ must be made internally by the engine and returned from commit() later.
+
+ Note that for user-level XA SQL commands, no consistent ordering among
+ prepare_ordered() and commit_ordered() is guaranteed (as that would
+ require blocking all other commits for an indefinite time).
+
+ When 2-phase commit is not used (eg. only one engine (and no binlog) in
+ transaction), neither prepare() nor prepare_ordered() is called.
+ */
+ void (*prepare_ordered)(handlerton *hton, THD *thd, bool all);
int (*recover)(handlerton *hton, XID *xid_list, uint len);
int (*commit_by_xid)(handlerton *hton, XID *xid);
int (*rollback_by_xid)(handlerton *hton, XID *xid);
+ /*
+ "Disable or enable checkpointing internal to the storage engine. This is
+ used for FLUSH TABLES WITH READ LOCK AND DISABLE CHECKPOINT to ensure that
+ the engine will never start any recovery from a time between
+ FLUSH TABLES ... ; UNLOCK TABLES.
+
+ While checkpointing is disabled, the engine should pause any background
+ write activity (such as tablespace checkpointing) that require consistency
+ between different files (such as transaction log and tablespace files) for
+ crash recovery to succeed. The idea is to use this to make safe
+ multi-volume LVM snapshot backups.
+ */
+ int (*checkpoint_state)(handlerton *hton, bool disabled);
void *(*create_cursor_read_view)(handlerton *hton, THD *thd);
void (*set_cursor_read_view)(handlerton *hton, THD *thd, void *read_view);
void (*close_cursor_read_view)(handlerton *hton, THD *thd, void *read_view);
@@ -1151,9 +1261,9 @@ typedef struct st_ha_create_information
enum ha_choice page_checksum; ///< If we have page_checksums
engine_option_value *option_list; ///< list of table create options
/* the following three are only for ALTER TABLE, check_if_incompatible_data() */
- void *option_struct; ///< structure with parsed table options
- void **fileds_option_struct; ///< array of field option structures
- void **indexes_option_struct; ///< array of index option structures
+ ha_table_option_struct *option_struct; ///< structure with parsed table options
+ ha_field_option_struct **fields_option_struct; ///< array of field option structures
+ ha_index_option_struct **indexes_option_struct; ///< array of index option structures
} HA_CREATE_INFO;
@@ -1228,6 +1338,7 @@ typedef struct st_ha_check_opt
st_ha_check_opt() {} /* Remove gcc warning */
uint flags; /* isam layer flags (e.g. for myisamchk) */
uint sql_flags; /* sql layer flags - for something myisamchk cannot do */
+ time_t start_time; /* When check/repair starts */
KEY_CACHE *key_cache; /* new key cache when changing key cache */
void init();
} HA_CHECK_OPT;
@@ -1242,6 +1353,23 @@ typedef void *range_seq_t;
typedef struct st_range_seq_if
{
/*
+ Get key information
+
+ SYNOPSIS
+ get_key_info()
+ init_params The seq_init_param parameter
+ length OUT length of the keys in this range sequence
+ map OUT key_part_map of the keys in this range sequence
+
+ DESCRIPTION
+ This function is set only when using HA_MRR_FIXED_KEY mode. In that mode,
+ all ranges are single-point equality ranges that use the same set of key
+ parts. This function allows the MRR implementation to get the length of
+ a key, and which keyparts it uses.
+ */
+ void (*get_key_info)(void *init_params, uint *length, key_part_map *map);
+
+ /*
Initialize the traversal of range sequence
SYNOPSIS
@@ -1265,10 +1393,10 @@ typedef struct st_range_seq_if
range OUT Information about the next range
RETURN
- 0 - Ok, the range structure filled with info about the next range
- 1 - No more ranges
+ FALSE - Ok, the range structure filled with info about the next range
+ TRUE - No more ranges
*/
- uint (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
+ bool (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
/*
Check whether range_info orders to skip the next record
@@ -1285,7 +1413,7 @@ typedef struct st_range_seq_if
out from the stream of records returned by multi_range_read_next()
0 - The record shall be left in the stream
*/
- bool (*skip_record) (range_seq_t seq, char *range_info, uchar *rowid);
+ bool (*skip_record) (range_seq_t seq, range_id_t range_info, uchar *rowid);
/*
Check if the record combination matches the index condition
@@ -1298,9 +1426,11 @@ typedef struct st_range_seq_if
0 - The record combination satisfies the index condition
1 - Otherwise
*/
- bool (*skip_index_tuple) (range_seq_t seq, char *range_info);
+ bool (*skip_index_tuple) (range_seq_t seq, range_id_t range_info);
} RANGE_SEQ_IF;
+typedef bool (*SKIP_INDEX_TUPLE_FUNC) (range_seq_t seq, range_id_t range_info);
+
class COST_VECT
{
public:
@@ -1346,10 +1476,14 @@ public:
}
void add_io(double add_io_cnt, double add_avg_cost)
{
- double io_count_sum= io_count + add_io_cnt;
- avg_io_cost= (io_count * avg_io_cost +
- add_io_cnt * add_avg_cost) / io_count_sum;
- io_count= io_count_sum;
+ /* In edge cases add_io_cnt may be zero */
+ if (add_io_cnt > 0)
+ {
+ double io_count_sum= io_count + add_io_cnt;
+ avg_io_cost= (io_count * avg_io_cost +
+ add_io_cnt * add_avg_cost) / io_count_sum;
+ io_count= io_count_sum;
+ }
}
/*
@@ -1368,9 +1502,9 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
COST_VECT *cost);
/*
- The below two are not used (and not handled) in this milestone of this WL
- entry because there seems to be no use for them at this stage of
- implementation.
+ Indicates that all scanned ranges will be singlepoint (aka equality) ranges.
+ The ranges may not use the full key but all of them will use the same number
+ of key parts.
*/
#define HA_MRR_SINGLE_POINT 1
#define HA_MRR_FIXED_KEY 2
@@ -1412,7 +1546,42 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
*/
#define HA_MRR_NO_NULL_ENDPOINTS 128
+/*
+ The MRR user has materialized range keys somewhere in the user's buffer.
+ This can be used for optimization of the procedure that sorts these keys
+ since in this case key values don't have to be copied into the MRR buffer.
+
+ In other words, it is guaranteed that after RANGE_SEQ_IF::next() call the
+ pointer in range->start_key.key will point to a key value that will remain
+ there until the end of the MRR scan.
+*/
+#define HA_MRR_MATERIALIZED_KEYS 256
+
+/*
+ The following bits are reserved for use by MRR implementation. The intended
+ use scenario:
+
+ * sql layer calls handler->multi_range_read_info[_const]()
+ - MRR implementation figures out what kind of scan it will perform, saves
+ the result in *mrr_mode parameter.
+ * sql layer remembers what was returned in *mrr_mode
+
+ * the optimizer picks the query plan (which may or may not include the MRR
+ scan that was estimated by the multi_range_read_info[_const] call)
+
+ * if the query is an EXPLAIN statement, sql layer will call
+ handler->multi_range_read_explain_info(mrr_mode) to get a text description
+ of the picked MRR scan; the description will be a part of EXPLAIN output.
+*/
+#define HA_MRR_IMPLEMENTATION_FLAG1 512
+#define HA_MRR_IMPLEMENTATION_FLAG2 1024
+#define HA_MRR_IMPLEMENTATION_FLAG3 2048
+#define HA_MRR_IMPLEMENTATION_FLAG4 4096
+#define HA_MRR_IMPLEMENTATION_FLAG5 8192
+#define HA_MRR_IMPLEMENTATION_FLAG6 16384
+#define HA_MRR_IMPLEMENTATION_FLAGS \
+ (512 | 1024 | 2048 | 4096 | 8192 | 16384)
/*
This is a buffer area that the handler can use to store rows.
@@ -1542,6 +1711,7 @@ public:
KEY_PART_INFO *range_key_part;
int key_compare_result_on_equal;
bool eq_range;
+ bool internal_tmp_table; /* If internal tmp table */
uint errkey; /* Last dup key */
uint key_used_on_scan;
@@ -1583,6 +1753,7 @@ public:
*/
/* Statistics variables */
ulonglong rows_read;
+ ulonglong rows_tmp_read;
ulonglong rows_changed;
/* One bigger than needed to avoid to test if key == MAX_KEY */
ulonglong index_rows_read[MAX_KEY+1];
@@ -1642,23 +1813,27 @@ public:
}
/* ha_ methods: pubilc wrappers for private virtual API */
- int ha_open(TABLE *table, const char *name, int mode, int test_if_locked);
+ int ha_open(TABLE *table, const char *name, int mode, uint test_if_locked);
int ha_index_init(uint idx, bool sorted)
{
int result;
DBUG_ENTER("ha_index_init");
DBUG_ASSERT(inited==NONE);
if (!(result= index_init(idx, sorted)))
- inited=INDEX;
- end_range= NULL;
+ {
+ inited= INDEX;
+ active_index= idx;
+ end_range= NULL;
+ }
DBUG_RETURN(result);
}
int ha_index_end()
{
DBUG_ENTER("ha_index_end");
DBUG_ASSERT(inited==INDEX);
- inited=NONE;
- end_range= NULL;
+ inited= NONE;
+ active_index= MAX_KEY;
+ end_range= NULL;
DBUG_RETURN(index_end());
}
/* This is called after index_init() if we need to do a index scan */
@@ -1765,7 +1940,7 @@ public:
uint get_dup_key(int error);
void reset_statistics()
{
- rows_read= rows_changed= 0;
+ rows_read= rows_changed= rows_tmp_read= 0;
bzero(index_rows_read, sizeof(index_rows_read));
}
virtual void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share)
@@ -1844,8 +2019,13 @@ public:
as there may be several calls to this routine.
*/
virtual void column_bitmaps_signal();
- uint get_index(void) const { return active_index; }
- virtual int close(void)=0;
+ /*
+ We have to check for inited as some engines, like innodb, sets
+ active_index during table scan.
+ */
+ uint get_index(void) const
+ { return inited == INDEX ? active_index : MAX_KEY; }
+ int ha_close(void);
/**
@retval 0 Bulk update used by handler
@@ -1921,10 +2101,18 @@ protected:
virtual int index_last(uchar * buf)
{ return HA_ERR_WRONG_COMMAND; }
virtual int index_next_same(uchar *buf, const uchar *key, uint keylen);
+ virtual int close(void)=0;
+ inline void update_rows_read()
+ {
+ if (likely(!internal_tmp_table))
+ rows_read++;
+ else
+ rows_tmp_read++;
+ }
inline void update_index_statistics()
{
index_rows_read[active_index]++;
- rows_read++;
+ update_rows_read();
}
public:
@@ -1940,16 +2128,47 @@ public:
inline int ha_index_first(uchar * buf);
inline int ha_index_last(uchar * buf);
inline int ha_index_next_same(uchar *buf, const uchar *key, uint keylen);
+ /*
+ TODO: should we make for those functions non-virtual ha_func_name wrappers,
+ too?
+ */
virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
void *seq_init_param,
uint n_ranges, uint *bufsz,
- uint *flags, COST_VECT *cost);
+ uint *mrr_mode, COST_VECT *cost);
virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
- uint *bufsz, uint *flags, COST_VECT *cost);
+ uint key_parts, uint *bufsz,
+ uint *mrr_mode, COST_VECT *cost);
virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
- uint n_ranges, uint mode,
+ uint n_ranges, uint mrr_mode,
HANDLER_BUFFER *buf);
- virtual int multi_range_read_next(char **range_info);
+ virtual int multi_range_read_next(range_id_t *range_info);
+ /*
+ Return string representation of the MRR plan.
+
+ This is intended to be used for EXPLAIN, via the following scenario:
+ 1. SQL layer calls handler->multi_range_read_info().
+ 1.1. Storage engine figures out whether it will use some non-default
+ MRR strategy, sets appropritate bits in *mrr_mode, and returns
+ control to SQL layer
+ 2. SQL layer remembers the returned mrr_mode
+ 3. SQL layer compares various options and choses the final query plan. As
+ a part of that, it makes a choice of whether to use the MRR strategy
+ picked in 1.1
+ 4. EXPLAIN code converts the query plan to its text representation. If MRR
+ strategy is part of the plan, it calls
+ multi_range_read_explain_info(mrr_mode) to get a text representation of
+ the picked MRR strategy.
+
+ @param mrr_mode Mode which was returned by multi_range_read_info[_const]
+ @param str INOUT string to be printed for EXPLAIN
+ @param str_end End of the string buffer. The function is free to put the
+ string into [str..str_end] memory range.
+ */
+ virtual int multi_range_read_explain_info(uint mrr_mode, char *str,
+ size_t size)
+ { return 0; }
+
virtual int read_range_first(const key_range *start_key,
const key_range *end_key,
bool eq_range, bool sorted);
@@ -2090,6 +2309,7 @@ public:
{ return(NULL);} /* gets tablespace name from handler */
/** used in ALTER TABLE; 1 if changing storage engine is allowed */
virtual bool can_switch_engines() { return 1; }
+ virtual int can_continue_handler_scan() { return 0; }
/**
Get the list of foreign keys in this table.
@@ -2204,7 +2424,6 @@ public:
virtual uint max_supported_key_part_length() const { return 255; }
virtual uint min_record_length(uint options) const { return 1; }
- virtual bool low_byte_first() const { return 1; }
virtual uint checksum() const { return 0; }
virtual bool is_crashed() const { return 0; }
virtual bool auto_repair() const { return 0; }
@@ -2284,9 +2503,28 @@ public:
/*
- @retval TRUE Primary key (if there is one) is clustered
- key covering all fields
- @retval FALSE otherwise
+ Check if the primary key (if there is one) is a clustered and a
+ reference key. This means:
+
+ - Data is stored together with the primary key (no secondary lookup
+ needed to find the row data). The optimizer uses this to find out
+ the cost of fetching data.
+ - The primary key is part of each secondary key and is used
+ to find the row data in the primary index when reading trough
+ secondary indexes.
+ - When doing a HA_KEYREAD_ONLY we get also all the primary key parts
+ into the row. This is critical property used by index_merge.
+
+ All the above is usually true for engines that store the row
+ data in the primary key index (e.g. in a b-tree), and use the primary
+ key value as a position(). InnoDB is an example of such an engine.
+
+ For such a clustered primary key, the following should also hold:
+ index_flags() should contain HA_CLUSTERED_INDEX
+ table_flags() should contain HA_TABLE_SCAN_ON_INDEX
+
+ @retval TRUE yes
+ @retval FALSE No.
*/
virtual bool primary_key_is_clustered() { return FALSE; }
virtual int cmp_ref(const uchar *ref1, const uchar *ref2)
@@ -2358,7 +2596,8 @@ public:
*/
virtual bool check_if_supported_virtual_columns(void) { return FALSE;}
-
+
+ TABLE* get_table() { return table; }
protected:
/* deprecated, don't use in new engines */
inline void ha_statistic_increment(ulong SSV::*offset) const { }
@@ -2432,8 +2671,9 @@ private:
*/
virtual int open(const char *name, int mode, uint test_if_locked)=0;
- virtual int index_init(uint idx, bool sorted) { active_index= idx; return 0; }
- virtual int index_end() { active_index= MAX_KEY; return 0; }
+ /* Note: ha_index_read_idx_map() may buypass index_init() */
+ virtual int index_init(uint idx, bool sorted) { return 0; }
+ virtual int index_end() { return 0; }
/**
rnd_init() can be called two times without rnd_end() in between
(it only makes sense if scan=1).
@@ -2599,11 +2839,12 @@ private:
virtual int rename_partitions(const char *path)
{ return HA_ERR_WRONG_COMMAND; }
friend class ha_partition;
- friend class DsMrr_impl;
public:
/* XXX to be removed, see ha_partition::partition_ht() */
virtual handlerton *partition_ht() const
{ return ht; }
+ inline int ha_write_tmp_row(uchar *buf);
+ inline int ha_update_tmp_row(const uchar * old_data, uchar * new_data);
};
#include "multi_range_read.h"
@@ -2663,6 +2904,7 @@ int ha_panic(enum ha_panic_function flag);
void ha_close_connection(THD* thd);
bool ha_flush_logs(handlerton *db_type);
void ha_drop_database(char* path);
+void ha_checkpoint_state(bool disable);
int ha_create_table(THD *thd, const char *path,
const char *db, const char *table_name,
HA_CREATE_INFO *create_info,