diff options
author | unknown <heikki@hundin.mysql.fi> | 2003-10-07 17:28:59 +0300 |
---|---|---|
committer | unknown <heikki@hundin.mysql.fi> | 2003-10-07 17:28:59 +0300 |
commit | d1485aad0eb79559902b1af26502b7cc53f8e95a (patch) | |
tree | d53a5cc4e9736e149276ba08d53a7dd09a14b4fd /innobase/fil | |
parent | d1ab51eb947623f45a8314cb9c0b6ddc15f0d148 (diff) | |
download | mariadb-git-d1485aad0eb79559902b1af26502b7cc53f8e95a.tar.gz |
Many files:
Multiple tablespaces for InnoDB
sql_table.cc:
Tell explicitly that InnoDB should retrieve all columns in CHECKSUM TABLE
sql_update.cc, sql_select.cc, my_base.h:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
include/my_base.h:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
sql/sql_select.cc:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
sql/sql_update.cc:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
sql/sql_table.cc:
Tell explicitly that InnoDB should retrieve all columns in CHECKSUM TABLE
sql/sql_db.cc:
Multiple tablespaces for InnoDB
sql/ha_innodb.cc:
Multiple tablespaces for InnoDB
sql/mysqld.cc:
Multiple tablespaces for InnoDB
sql/set_var.cc:
Multiple tablespaces for InnoDB
sql/sql_cache.cc:
Multiple tablespaces for InnoDB
sql/ha_innodb.h:
Multiple tablespaces for InnoDB
innobase/include/btr0btr.ic:
Multiple tablespaces for InnoDB
innobase/include/btr0pcur.ic:
Multiple tablespaces for InnoDB
innobase/include/data0type.ic:
Multiple tablespaces for InnoDB
innobase/include/dyn0dyn.ic:
Multiple tablespaces for InnoDB
innobase/include/fut0lst.ic:
Multiple tablespaces for InnoDB
innobase/include/log0log.ic:
Multiple tablespaces for InnoDB
innobase/include/mach0data.ic:
Multiple tablespaces for InnoDB
innobase/include/mtr0log.ic:
Multiple tablespaces for InnoDB
innobase/include/rem0rec.ic:
Multiple tablespaces for InnoDB
innobase/include/ut0byte.ic:
Multiple tablespaces for InnoDB
innobase/include/ut0ut.ic:
Multiple tablespaces for InnoDB
innobase/include/buf0buf.h:
Multiple tablespaces for InnoDB
innobase/include/buf0lru.h:
Multiple tablespaces for InnoDB
innobase/include/buf0rea.h:
Multiple tablespaces for InnoDB
innobase/include/data0type.h:
Multiple tablespaces for InnoDB
innobase/include/db0err.h:
Multiple tablespaces for InnoDB
innobase/include/dict0boot.h:
Multiple tablespaces for InnoDB
innobase/include/dict0dict.h:
Multiple tablespaces for InnoDB
innobase/include/dict0load.h:
Multiple tablespaces for InnoDB
innobase/include/dict0mem.h:
Multiple tablespaces for InnoDB
innobase/include/fil0fil.h:
Multiple tablespaces for InnoDB
innobase/include/fsp0fsp.h:
Multiple tablespaces for InnoDB
innobase/include/ibuf0ibuf.h:
Multiple tablespaces for InnoDB
innobase/include/lock0lock.h:
Multiple tablespaces for InnoDB
innobase/include/log0log.h:
Multiple tablespaces for InnoDB
innobase/include/log0recv.h:
Multiple tablespaces for InnoDB
innobase/include/os0file.h:
Multiple tablespaces for InnoDB
innobase/include/page0page.h:
Multiple tablespaces for InnoDB
innobase/include/que0types.h:
Multiple tablespaces for InnoDB
innobase/include/rem0rec.h:
Multiple tablespaces for InnoDB
innobase/include/srv0srv.h:
Multiple tablespaces for InnoDB
innobase/include/srv0start.h:
Multiple tablespaces for InnoDB
innobase/include/sync0sync.h:
Multiple tablespaces for InnoDB
innobase/include/trx0sys.h:
Multiple tablespaces for InnoDB
innobase/include/ut0byte.h:
Multiple tablespaces for InnoDB
innobase/include/univ.i:
Multiple tablespaces for InnoDB
innobase/btr/btr0cur.c:
Multiple tablespaces for InnoDB
innobase/btr/btr0sea.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0buf.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0flu.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0lru.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0rea.c:
Multiple tablespaces for InnoDB
innobase/data/data0type.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0boot.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0crea.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0dict.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0load.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0mem.c:
Multiple tablespaces for InnoDB
innobase/fil/fil0fil.c:
Multiple tablespaces for InnoDB
innobase/fsp/fsp0fsp.c:
Multiple tablespaces for InnoDB
innobase/ha/ha0ha.c:
Multiple tablespaces for InnoDB
innobase/ibuf/ibuf0ibuf.c:
Multiple tablespaces for InnoDB
innobase/log/log0log.c:
Multiple tablespaces for InnoDB
innobase/log/log0recv.c:
Multiple tablespaces for InnoDB
innobase/mach/mach0data.c:
Multiple tablespaces for InnoDB
innobase/mem/mem0dbg.c:
Multiple tablespaces for InnoDB
innobase/mem/mem0pool.c:
Multiple tablespaces for InnoDB
innobase/mtr/mtr0log.c:
Multiple tablespaces for InnoDB
innobase/os/os0file.c:
Multiple tablespaces for InnoDB
innobase/os/os0proc.c:
Multiple tablespaces for InnoDB
innobase/page/page0cur.c:
Multiple tablespaces for InnoDB
innobase/que/que0que.c:
Multiple tablespaces for InnoDB
innobase/row/row0ins.c:
Multiple tablespaces for InnoDB
innobase/row/row0mysql.c:
Multiple tablespaces for InnoDB
innobase/row/row0sel.c:
Multiple tablespaces for InnoDB
innobase/row/row0upd.c:
Multiple tablespaces for InnoDB
innobase/srv/srv0srv.c:
Multiple tablespaces for InnoDB
innobase/srv/srv0start.c:
Multiple tablespaces for InnoDB
innobase/sync/sync0rw.c:
Multiple tablespaces for InnoDB
innobase/sync/sync0sync.c:
Multiple tablespaces for InnoDB
innobase/trx/trx0sys.c:
Multiple tablespaces for InnoDB
innobase/trx/trx0trx.c:
Multiple tablespaces for InnoDB
innobase/trx/trx0undo.c:
Multiple tablespaces for InnoDB
innobase/ut/ut0byte.c:
Multiple tablespaces for InnoDB
innobase/ut/ut0ut.c:
Multiple tablespaces for InnoDB
Diffstat (limited to 'innobase/fil')
-rw-r--r-- | innobase/fil/fil0fil.c | 2702 |
1 files changed, 2247 insertions, 455 deletions
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index f55df90846c..2b0138ccb5a 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -1,5 +1,5 @@ /****************************************************** -The low-level file system +The tablespace memory cache (c) 1995 Innobase Oy @@ -16,16 +16,19 @@ Created 10/25/1995 Heikki Tuuri #include "mach0data.h" #include "ibuf0ibuf.h" #include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" #include "log0log.h" #include "log0recv.h" #include "fsp0fsp.h" #include "srv0srv.h" +#include "srv0start.h" /* - IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM - =========================================== + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= -The file system is responsible for providing fast read/write access to +The tablespace cache is responsible for providing fast read/write access to tablespaces and logs of the database. File creation and deletion is done in other modules which know more of the logic of the operation, however. @@ -83,20 +86,31 @@ ulint fil_n_pending_tablespace_flushes = 0; /* Null file address */ fil_addr_t fil_addr_null = {FIL_NULL, 0}; -/* File system file node data structure */ +/* File node of a tablespace or the log data space */ typedef struct fil_node_struct fil_node_t; struct fil_node_struct { - char* name; /* the file name or path */ + fil_space_t* space; /* backpointer to the space where this node + belongs */ + char* name; /* path to the file */ ibool open; /* TRUE if file open */ os_file_t handle; /* OS handle to the file, if file open */ - ulint size; /* size of the file in database pages - (where the possible last incomplete megabyte - is ignored) */ + ibool is_raw_disk;/* TRUE if the 'file' is actually a raw + device or a raw disk partition */ + ulint size; /* size of the file in database pages, 0 if + not known yet; the possible last incomplete + megabyte is ignored if space == 0 */ ulint n_pending; - /* count of pending i/o-ops on this file */ - ibool is_modified; /* this is set to TRUE when we write - to the file and FALSE when we call fil_flush - for this file space */ + /* count of pending i/o's on this file; + closing of the file is not allowed if + this is > 0 */ + ulint n_pending_flushes; + /* count of pending flushes on this file; + closing of the file is not allowed if + this is > 0 */ + ib_longlong modification_counter;/* when we write to the file we + increment this by one */ + ib_longlong flush_counter;/* up to what modification_counter value + we have flushed the modifications to disk */ UT_LIST_NODE_T(fil_node_t) chain; /* link field for the file chain */ UT_LIST_NODE_T(fil_node_t) LRU; @@ -106,19 +120,52 @@ struct fil_node_struct { #define FIL_NODE_MAGIC_N 89389 -/* File system tablespace or log data structure: let us call them by a common -name space */ +/* Tablespace or log data space: let us call them by a common name space */ struct fil_space_struct { - char* name; /* space name */ + char* name; /* space name = the path to the first file in + it */ ulint id; /* space id */ + ib_longlong tablespace_version; + /* in DISCARD/IMPORT this timestamp is used to + check if we should ignore an insert buffer + merge request for a page because it actually + was for the previous incarnation of the + space */ + ibool mark; /* this is set to TRUE at database startup if + the space corresponds to a table in the InnoDB + data dictionary; so we can print a warning of + orphaned tablespaces */ + ibool stop_ios;/* TRUE if we want to rename the .ibd file of + tablespace and want to stop temporarily + posting of new i/o requests on the file */ + ibool stop_ibuf_merges; + /* we set this TRUE when we start deleting a + single-table tablespace */ + ibool is_being_deleted; + /* this is set to TRUE when we start + deleting a single-table tablespace and its + file; when this flag is set no further i/o + or flush requests can be placed on this space, + though there may be such requests still being + processed on this space */ ulint purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */ UT_LIST_BASE_NODE_T(fil_node_t) chain; /* base node for the file chain */ - ulint size; /* space size in pages */ + ulint size; /* space size in pages; 0 if a single-table + tablespace whose size we do not know yet */ ulint n_reserved_extents; /* number of reserved free extents for ongoing operations like B-tree page split */ + ulint n_pending_flushes; /* this is > 0 when flushing + the tablespace to disk; dropping of the + tablespace is forbidden if this is > 0 */ + ulint n_pending_ibuf_merges;/* this is > 0 when merging + insert buffer entries to a page so that we + may need to access the ibuf bitmap page in the + tablespade: dropping of the tablespace is + forbidden if this is > 0 */ hash_node_t hash; /* hash chain node */ + hash_node_t name_hash;/* hash chain the name_hash table */ rw_lock_t latch; /* latch protecting the file space storage allocation */ UT_LIST_NODE_T(fil_space_t) space_list; @@ -130,80 +177,115 @@ struct fil_space_struct { #define FIL_SPACE_MAGIC_N 89472 -/* The file system data structure */ +/* The tablespace memory cache; also the totality of logs = the log data space, +is stored here; below we talk about tablespaces, but also the ib_logfiles +form a 'space' and it is handled here */ typedef struct fil_system_struct fil_system_t; struct fil_system_struct { - mutex_t mutex; /* The mutex protecting the system */ + mutex_t mutex; /* The mutex protecting the cache */ hash_table_t* spaces; /* The hash table of spaces in the - system */ + system; they are hashed on the space + id */ + hash_table_t* name_hash; /* hash table based on the space + name */ UT_LIST_BASE_NODE_T(fil_node_t) LRU; /* base node for the LRU list of the - most recently used open files */ - ulint n_open_pending; /* current number of open files with - pending i/o-ops on them */ - ulint max_n_open; /* maximum allowed open files */ - os_event_t can_open; /* this event is set to the signaled - state when the system is capable of - opening a new file, i.e., - n_open_pending < max_n_open */ + most recently used open files with no + pending i/o's; if we start an i/o on + the file, we first remove it from this + list, and return it to the start of + the list when the i/o ends; + log files and the system tablespace are + not put to this list: they are opened + after the startup, and kept open until + shutdown */ + ulint n_open; /* number of files currently open */ + ulint max_n_open; /* n_open is not allowed to exceed + this */ + ib_longlong modification_counter;/* when we write to a file we + increment this by one */ + ulint max_assigned_id;/* maximum space id in the existing + tables, or assigned during the time + mysqld has been up; at an InnoDB + startup we scan the data dictionary + and set here the maximum of the + space id's of the tables there */ + ib_longlong tablespace_version; + /* a counter which is incremented for + every space object memory creation; + every space mem object gets a + 'timestamp' from this; in DISCARD/ + IMPORT this is used to check if we + should ignore an insert buffer merge + request */ UT_LIST_BASE_NODE_T(fil_space_t) space_list; /* list of all file spaces */ }; -/* The file system. This variable is NULL before the module is initialized. */ +/* The tablespace memory cache. This variable is NULL before the module is +initialized. */ fil_system_t* fil_system = NULL; -/* The file system hash table size */ -#define FIL_SYSTEM_HASH_SIZE 500 +/* The tablespace memory cache hash table size */ +#define FIL_SYSTEM_HASH_SIZE 50 /* TODO: make bigger! */ -/*********************************************************************** -Reserves a right to open a single file. The right must be released with -fil_release_right_to_open. */ +/************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ +static void -fil_reserve_right_to_open(void) -/*===========================*/ -{ -loop: - mutex_enter(&(fil_system->mutex)); - - if (fil_system->n_open_pending == fil_system->max_n_open) { - - /* It is not sure we can open the file if it is closed: wait */ - - os_event_reset(fil_system->can_open); - - mutex_exit(&(fil_system->mutex)); +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space); /* in: space */ +/************************************************************************ +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. */ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type); /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ - os_event_wait(fil_system->can_open); - goto loop; - } +/*********************************************************************** +Returns the version number of a tablespace, -1 if not found. */ - fil_system->max_n_open--; +ib_longlong +fil_space_get_version( +/*==================*/ + /* out: version number, -1 if the tablespace does not + exist in the memory cache */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ib_longlong version = -1; - mutex_exit(&(fil_system->mutex)); -} + ut_ad(system); -/*********************************************************************** -Releases a right to open a single file. */ + mutex_enter(&(system->mutex)); -void -fil_release_right_to_open(void) -/*===========================*/ -{ - mutex_enter(&(fil_system->mutex)); - - if (fil_system->n_open_pending == fil_system->max_n_open) { + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - os_event_set(fil_system->can_open); + if (space) { + version = space->tablespace_version; } - fil_system->max_n_open++; + mutex_exit(&(system->mutex)); - mutex_exit(&(fil_system->mutex)); + return(version); } /*********************************************************************** @@ -215,8 +297,8 @@ fil_space_get_latch( /* out: latch protecting storage allocation */ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -224,6 +306,8 @@ fil_space_get_latch( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + mutex_exit(&(system->mutex)); return(&(space->latch)); @@ -238,8 +322,8 @@ fil_space_get_type( /* out: FIL_TABLESPACE or FIL_LOG */ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -247,6 +331,8 @@ fil_space_get_type( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + mutex_exit(&(system->mutex)); return(space->purpose); @@ -261,17 +347,21 @@ fil_space_get_ibuf_data( /* out: ibuf data for this space */ ulint id) /* in: space id */ { + fil_system_t* system = fil_system; fil_space_t* space; - fil_system_t* system = fil_system; ut_ad(system); + ut_a(id == 0); + mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); mutex_exit(&(system->mutex)); + ut_a(space); + return(space->ibuf_data); } @@ -284,16 +374,16 @@ fil_node_create( char* name, /* in: file name (file must be closed) */ ulint size, /* in: file size in database blocks, rounded downwards to an integer */ - ulint id) /* in: space id where to append */ + ulint id, /* in: space id where to append */ + ibool is_raw) /* in: TRUE if a raw device or a raw disk partition */ { + fil_system_t* system = fil_system; fil_node_t* node; fil_space_t* space; char* name2; - fil_system_t* system = fil_system; ut_a(system); ut_a(name); - ut_a(size > 0); mutex_enter(&(system->mutex)); @@ -305,29 +395,119 @@ fil_node_create( node->name = name2; node->open = FALSE; + + ut_a(!is_raw || srv_start_raw_disk_in_use); + + node->is_raw_disk = is_raw; node->size = size; node->magic_n = FIL_NODE_MAGIC_N; node->n_pending = 0; + node->n_pending_flushes = 0; - node->is_modified = FALSE; + node->modification_counter = 0; + node->flush_counter = 0; HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: Could not find tablespace %lu for\n" +"InnoDB: file %s from the tablespace memory cache.\n", id, name); + mem_free(name2); + + mem_free(node); + + mutex_exit(&(system->mutex)); + + return; + } + space->size += size; + node->space = space; + UT_LIST_ADD_LAST(chain, space->chain, node); mutex_exit(&(system->mutex)); } +/************************************************************************ +Opens a the file of a node of a tablespace. The caller must own the fil_system +mutex. */ +static +void +fil_node_open_file( +/*===============*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space) /* in: space */ +{ + ib_longlong size_bytes; + ulint size_low; + ulint size_high; + ibool ret; + + ut_ad(mutex_own(&(system->mutex))); + + ut_a(node->n_pending == 0); + ut_a(node->open == FALSE); + + /* printf("Opening file %s\n", node->name); */ + + if (space->purpose == FIL_LOG) { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, &ret); + } else if (node->is_raw_disk) { + node->handle = os_file_create(node->name, + OS_FILE_OPEN_RAW, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } else { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } + + ut_a(ret); + + node->open = TRUE; + + system->n_open++; + + if (node->size == 0) { + /* It must be a single-table tablespace and we do not know the + size of the file yet */ + + ut_a(space->id != 0); + + os_file_get_size(node->handle, &size_low, &size_high); + + size_bytes = (((ib_longlong)size_high) << 32) + + (ib_longlong)size_low; + + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { + node->size = (ulint) ((size_bytes / (1024 * 1024)) + * ((1024 * 1024) / UNIV_PAGE_SIZE)); + } else { + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + } + + space->size = node->size; + } + + if (space->purpose == FIL_TABLESPACE && space->id != 0) { + /* Put the node to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } +} + /************************************************************************** Closes a file. */ static void -fil_node_close( -/*===========*/ +fil_node_close_file( +/*================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system) /* in: file system */ + fil_system_t* system) /* in: tablespace memory cache */ { ibool ret; @@ -335,32 +515,214 @@ fil_node_close( ut_ad(mutex_own(&(system->mutex))); ut_a(node->open); ut_a(node->n_pending == 0); + ut_a(node->n_pending_flushes == 0); ret = os_file_close(node->handle); ut_a(ret); + /* printf("Closing file %s\n", node->name); */ + node->open = FALSE; + ut_a(system->n_open > 0); + system->n_open--; - /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(LRU, system->LRU, node); + if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) { + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + /* The node is in the LRU list, remove it */ + UT_LIST_REMOVE(LRU, system->LRU, node); + } +} + +/************************************************************************ +Tries to close a file in the LRU list. The caller must hold the fil_sys +mutex. */ +static +ibool +fil_try_to_close_file_in_LRU( +/*=========================*/ + /* out: TRUE if success, FALSE if should retry + later; since i/o's generally complete in < + 100 ms, and as InnoDB writes at most 128 pages + from the buffer pool in a batch, and then + immediately flushes the files, there is a good + chance that the next time we find a suitable + node from the LRU list */ + ibool print_info) /* in: if TRUE, prints information why it + cannot close a file */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + + ut_ad(mutex_own(&(system->mutex))); + + node = UT_LIST_GET_LAST(system->LRU); + + if (print_info) { + fprintf(stderr, +"InnoDB: fil_sys open file LRU len %lu\n", UT_LIST_GET_LEN(system->LRU)); + } + + while (node != NULL) { + if (node->modification_counter == node->flush_counter + && node->n_pending_flushes == 0) { + + fil_node_close_file(node, system); + + return(TRUE); + } + + if (print_info && node->n_pending_flushes > 0) { + fprintf(stderr, +"InnoDB: cannot close file %s, because n_pending_flushes %lu\n", node->name, + node->n_pending_flushes); + } + + if (print_info + && node->modification_counter != node->flush_counter) { + fprintf(stderr, +"InnoDB: cannot close file %s, because mod_count %lld != fl_count %lld\n", + node->name, node->modification_counter, + node->flush_counter); + } + + node = UT_LIST_GET_PREV(LRU, node); + } + + return(FALSE); } /*********************************************************************** -Frees a file node object from a file system. */ +Reserves the fil_system mutex and tries to make sure we can open at least one +file while holding it. This should be called before calling +fil_node_prepare_for_io(), because that function may need to open a file. */ +static +void +fil_mutex_enter_and_prepare_for_io( +/*===============================*/ + ulint space_id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ibool success; + ibool print_info = FALSE; + ulint count = 0; + ulint count2 = 0; + + ut_ad(!mutex_own(&(system->mutex))); +retry: + mutex_enter(&(system->mutex)); + + if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) { + /* We keep log files and system tablespace files always open; + this is important in preventing deadlocks in this module, as + a page read completion often performs another read from the + insert buffer. The insert buffer is in tablespace 0, and we + cannot end up waiting in this function. */ + + return; + } + + if (system->n_open < system->max_n_open) { + + return; + } + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (space != NULL && space->stop_ios) { + /* We are going to do a rename file and want to stop new i/o's + for a while */ + + if (count2 > 20000) { + fprintf(stderr, +"InnoDB: Warning: tablespace %s has i/o ops stopped for a long time %lu\n", + space->name, count2); + } + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + count2++; + + goto retry; + } + + /* If the file is already open, no need to do anything; if the space + does not exist, we handle the situation in the function which called + this function */ + + if (!space || UT_LIST_GET_FIRST(space->chain)->open) { + + return; + } + + if (count > 1) { + print_info = TRUE; + } + + /* Too many files are open, try to close some */ +close_more: + success = fil_try_to_close_file_in_LRU(print_info); + + if (success && system->n_open >= system->max_n_open) { + + goto close_more; + } + + if (system->n_open < system->max_n_open) { + /* Ok */ + + return; + } + + if (count >= 2) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: too many (%lu) files stay open while the maximum\n" +"InnoDB: allowed value would be %lu.\n" +"InnoDB: You may need to raise the value of innodb_max_files_open in\n" +"InnoDB: my.cnf.\n", system->n_open, system->max_n_open); + + return; + } + + mutex_exit(&(system->mutex)); + + /* Wake the i/o-handler threads to make sure pending i/o's are + performed */ + os_aio_simulated_wake_handler_threads(); + + os_thread_sleep(20000); + + /* Flush tablespaces so that we can close modified files in the LRU + list */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + count++; + + goto retry; +} + +/*********************************************************************** +Frees a file node object from a tablespace memory cache. */ static void fil_node_free( /*==========*/ fil_node_t* node, /* in, own: file node */ - fil_system_t* system, /* in: file system */ + fil_system_t* system, /* in: tablespace memory cache */ fil_space_t* space) /* in: space where the file node is chained */ { ut_ad(node && system && space); ut_ad(mutex_own(&(system->mutex))); ut_a(node->magic_n == FIL_NODE_MAGIC_N); + ut_a(node->n_pending == 0); if (node->open) { - fil_node_close(node, system); + fil_node_close_file(node, system); } space->size -= node->size; @@ -383,9 +745,9 @@ fil_space_truncate_start( if this does not equal to the combined size of some initial files in the space */ { + fil_system_t* system = fil_system; fil_node_t* node; fil_space_t* space; - fil_system_t* system = fil_system; mutex_enter(&(system->mutex)); @@ -394,7 +756,6 @@ fil_space_truncate_start( ut_a(space); while (trunc_len > 0) { - node = UT_LIST_GET_FIRST(space->chain); ut_a(node->size * UNIV_PAGE_SIZE >= trunc_len); @@ -405,17 +766,323 @@ fil_space_truncate_start( } mutex_exit(&(system->mutex)); -} +} + +/*********************************************************************** +Creates a space memory object and puts it to the tablespace memory cache. If +there is an error, prints an error message to the .err log. */ + +ibool +fil_space_create( +/*=============*/ + /* out: TRUE if success */ + char* name, /* in: space name */ + ulint id, /* in: space id */ + ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + char* name2; + ulint namesake_id; +try_again: + /*printf( + "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name, + purpose);*/ + + ut_a(system); + ut_a(name); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(name), space, + 0 == strcmp(name, space->name)); + if (space != NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to init to the tablespace memory cache\n" +"InnoDB: a tablespace %lu of name %s,\n" +"InnoDB: but a tablespace %lu of the same name %s\n" +"InnoDB: already exists in the tablespace memory cache!\n", + id, name, space->id, space->name); + + if (id == 0 || purpose != FIL_TABLESPACE) { + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + fprintf(stderr, +"InnoDB: We assume that InnoDB did a crash recovery, and you had\n" +"InnoDB: an .ibd file for which the table did not exist in the\n" +"InnoDB: InnoDB internal data dictionary in the ibdata files.\n" +"InnoDB: We assume that you later removed the .ibd and .frm files,\n" +"InnoDB: and are now trying to recreate the table. We now remove the\n" +"InnoDB: conflicting tablespace object from the memory cache and try\n" +"InnoDB: the init again.\n"); + + namesake_id = space->id; + + mutex_exit(&(system->mutex)); + + fil_space_free(namesake_id); + + goto try_again; + } + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space != NULL) { + fprintf(stderr, +"InnoDB: Error: trying to add tablespace %lu of name %s\n" +"InnoDB: to the tablespace memory cache, but tablespace\n" +"InnoDB: %lu of name %s already exists in the tablespace\n" +"InnoDB: memory cache!\n", id, name, space->id, space->name); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + space = mem_alloc(sizeof(fil_space_t)); + + name2 = mem_alloc(ut_strlen(name) + 1); + + ut_strcpy(name2, name); + + space->name = name2; + space->id = id; + + system->tablespace_version++; + space->tablespace_version = + system->tablespace_version; + space->mark = FALSE; + + if (purpose == FIL_TABLESPACE && id > system->max_assigned_id) { + system->max_assigned_id = id; + } + + space->stop_ios = FALSE; + space->stop_ibuf_merges = FALSE; + space->is_being_deleted = FALSE; + space->purpose = purpose; + space->size = 0; + + space->n_reserved_extents = 0; + + space->n_pending_flushes = 0; + space->n_pending_ibuf_merges = 0; + + UT_LIST_INIT(space->chain); + space->magic_n = FIL_SPACE_MAGIC_N; + + space->ibuf_data = NULL; + + rw_lock_create(&(space->latch)); + rw_lock_set_level(&(space->latch), SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(name), space); + UT_LIST_ADD_LAST(space_list, system->space_list, space); + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. */ +static +ulint +fil_assign_new_space_id(void) +/*=========================*/ + /* out: new tablespace id; ULINT_UNDEFINED if could + not assign an id */ +{ + fil_system_t* system = fil_system; + ulint id; + + mutex_enter(&(system->mutex)); + + system->max_assigned_id++; + + id = system->max_assigned_id; + + if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Warning: you are running out of new single-table tablespace id's.\n" +"InnoDB: Current counter is %lu and it must not exceed %lu!\n" +"InnoDB: To reset the counter to zero you have to dump all your tables and\n" +"InnoDB: recreate the whole InnoDB installation.\n", id, + SRV_LOG_SPACE_FIRST_ID); + } + + if (id >= SRV_LOG_SPACE_FIRST_ID) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: You have run out of single-table tablespace id's!\n" +"InnoDB: Current counter is %lu.\n" +"InnoDB: To reset the counter to zero you have to dump all your tables and\n" +"InnoDB: recreate the whole InnoDB installation.\n", id); + system->max_assigned_id--; + + id = ULINT_UNDEFINED; + } + + mutex_exit(&(system->mutex)); + + return(id); +} + +/*********************************************************************** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. */ + +ibool +fil_space_free( +/*===========*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_space_t* namespace; + fil_node_t* fil_node; + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: trying to remove tablespace %lu from the cache but\n" +"InnoDB: it is not there.\n", id); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(space->name), + namespace, 0 == strcmp(space->name, namespace->name)); + ut_a(namespace); + ut_a(space == namespace); + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + + UT_LIST_REMOVE(space_list, system->space_list, space); + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(0 == space->n_pending_flushes); + + fil_node = UT_LIST_GET_FIRST(space->chain); + + while (fil_node != NULL) { + fil_node_free(fil_node, system, space); + + fil_node = UT_LIST_GET_FIRST(space->chain); + } + + ut_a(0 == UT_LIST_GET_LEN(space->chain)); + + mutex_exit(&(system->mutex)); + + rw_lock_free(&(space->latch)); + + mem_free(space->name); + mem_free(space); + + return(TRUE); +} + +/*********************************************************************** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. */ + +ulint +fil_space_get_size( +/*===============*/ + /* out: space size, 0 if space not found */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + ulint size; + + ut_ad(system); + + fil_mutex_enter_and_prepare_for_io(id); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + mutex_exit(&(system->mutex)); + + return(0); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + fil_node_prepare_for_io(node, system, space); + fil_node_complete_io(node, system, OS_FILE_READ); + } + + size = space->size; + + mutex_exit(&(system->mutex)); + + return(size); +} + +/*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. */ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no)/* in: page number */ +{ + if (fil_space_get_size(id) > page_no) { + + return(TRUE); + } + + return(FALSE); +} /******************************************************************** -Creates a file system object. */ +Creates a the tablespace memory cache. */ static fil_system_t* fil_system_create( /*==============*/ - /* out, own: file system object */ + /* out, own: tablespace memory cache */ ulint hash_size, /* in: hash table size */ - ulint max_n_open) /* in: maximum number of open files */ + ulint max_n_open) /* in: maximum number of open files; must be + > 10 */ { fil_system_t* system; @@ -429,12 +1096,17 @@ fil_system_create( mutex_set_level(&(system->mutex), SYNC_ANY_LATCH); system->spaces = hash_create(hash_size); + system->name_hash = hash_create(hash_size); UT_LIST_INIT(system->LRU); - system->n_open_pending = 0; + system->n_open = 0; system->max_n_open = max_n_open; - system->can_open = os_event_create(NULL); + + system->modification_counter = 0; + system->max_assigned_id = 0; + + system->tablespace_version = 0; UT_LIST_INIT(system->space_list); @@ -442,7 +1114,7 @@ fil_system_create( } /******************************************************************** -Initializes the file system of this module. */ +Initializes the tablespace memory cache. */ void fil_init( @@ -451,11 +1123,119 @@ fil_init( { ut_a(fil_system == NULL); + /*printf("Initializing the tablespace cache with max %lu open files\n", + max_n_open); */ fil_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open); } +/*********************************************************************** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. */ + +void +fil_open_log_and_system_tablespace_files(void) +/*==========================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + if (space->purpose != FIL_TABLESPACE || space->id == 0) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (!node->open) { + fil_node_open_file(node, system, + space); + } + if (system->max_n_open < 10 + system->n_open) { + fprintf(stderr, +"InnoDB: Warning: you must raise the value of innodb_max_open_files in\n" +"InnoDB: my.cnf! Remember that InnoDB keeps all log files and all system\n" +"InnoDB: tablespace files open for the whole time mysqld is running, and\n" +"InnoDB: needs to open also some .ibd files if the file-per-table storage\n" +"InnoDB: model is used. Current open files %lu, max allowed open files %lu.\n", + system->n_open, system->max_n_open); + } + node = UT_LIST_GET_NEXT(chain, node); + } + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. */ + +void +fil_close_all_files(void) +/*=====================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (node->open) { + fil_node_close_file(node, system); + } + node = UT_LIST_GET_NEXT(chain, node); + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ + +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id) /* in: maximum known id */ +{ + fil_system_t* system = fil_system; + + if (max_id >= SRV_LOG_SPACE_FIRST_ID) { + fprintf(stderr, +"InnoDB: Fatal error: max tablespace id is too high, %lu\n", max_id); + ut_a(0); + } + + mutex_enter(&(system->mutex)); + + if (system->max_assigned_id < max_id) { + + system->max_assigned_id = max_id; + } + + mutex_exit(&(system->mutex)); +} + /******************************************************************** -Writes the flushed lsn to the header of each file space. */ +Initializes the ibuf data structure for space 0 == the system tablespace. +This can be called after the file space headers have been created and the +dictionary system has been initialized. */ void fil_ibuf_init_at_db_start(void) @@ -464,39 +1244,37 @@ fil_ibuf_init_at_db_start(void) fil_space_t* space; space = UT_LIST_GET_FIRST(fil_system->space_list); - - while (space) { - if (space->purpose == FIL_TABLESPACE) { - space->ibuf_data = ibuf_data_init_for_space(space->id); - } - space = UT_LIST_GET_NEXT(space_list, space); - } + ut_a(space); + ut_a(space->purpose == FIL_TABLESPACE); + + space->ibuf_data = ibuf_data_init_for_space(space->id); } /******************************************************************** -Writes the flushed lsn and the latest archived log number to the page -header of the first page of a data file. */ +Writes the flushed lsn and the latest archived log number to the page header +of the first page of a data file. */ static ulint fil_write_lsn_and_arch_no_to_file( /*==============================*/ ulint space_id, /* in: space number */ - ulint sum_of_sizes, /* in: combined size of previous files in space, - in database pages */ + ulint sum_of_sizes, /* in: combined size of previous files in + space, in database pages */ dulint lsn, /* in: lsn to write */ ulint arch_log_no) /* in: archived log number to write */ { byte* buf1; byte* buf; + UT_NOT_USED(arch_log_no); + buf1 = mem_alloc(2 * UNIV_PAGE_SIZE); buf = ut_align(buf1, UNIV_PAGE_SIZE); fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); - mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no); fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); @@ -505,7 +1283,7 @@ fil_write_lsn_and_arch_no_to_file( /******************************************************************** Writes the flushed lsn and the latest archived log number to the page -header of the first page of each data file. */ +header of the first page of each data file in the system tablespace. */ ulint fil_write_flushed_lsn_to_data_files( @@ -524,18 +1302,20 @@ fil_write_flushed_lsn_to_data_files( space = UT_LIST_GET_FIRST(fil_system->space_list); while (space) { - if (space->purpose == FIL_TABLESPACE) { + /* We only write the lsn to the system tablespace + (space id == 0) files */ + + if (space->id == 0) { + ut_a(space->purpose == FIL_TABLESPACE); sum_of_sizes = 0; node = UT_LIST_GET_FIRST(space->chain); - while (node) { mutex_exit(&(fil_system->mutex)); err = fil_write_lsn_and_arch_no_to_file( - space->id, - sum_of_sizes, - lsn, arch_log_no); + space->id, sum_of_sizes, + lsn, arch_log_no); if (err != DB_SUCCESS) { return(err); @@ -544,11 +1324,11 @@ fil_write_flushed_lsn_to_data_files( mutex_enter(&(fil_system->mutex)); sum_of_sizes += node->size; - node = UT_LIST_GET_NEXT(chain, node); } - } + break; /* there is only one space with id == 0 */ + } space = UT_LIST_GET_NEXT(space_list, space); } @@ -575,8 +1355,9 @@ fil_read_flushed_lsn_and_arch_log_no( byte* buf; byte* buf2; dulint flushed_lsn; - ulint arch_log_no; - + ulint arch_log_no = 0; /* since InnoDB does not archive + its own logs under MySQL, this + parameter is not relevant */ buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); /* Align the memory for a possible read from a raw device */ buf = ut_align(buf2, UNIV_PAGE_SIZE); @@ -584,7 +1365,6 @@ fil_read_flushed_lsn_and_arch_log_no( os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE); flushed_lsn = mach_read_from_8(buf + FIL_PAGE_FILE_FLUSH_LSN); - arch_log_no = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO); ut_free(buf2); @@ -611,143 +1391,959 @@ fil_read_flushed_lsn_and_arch_log_no( } } +/*================ SINGLE-TABLE TABLESPACES ==========================*/ + /*********************************************************************** -Creates a space object and puts it to the file system. */ +Increments the count of pending insert buffer page merges, if space is not +being deleted. */ -void -fil_space_create( -/*=============*/ - char* name, /* in: space name */ - ulint id, /* in: space id */ - ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */ +ibool +fil_inc_pending_ibuf_merges( +/*========================*/ + /* out: TRUE if being deleted, and ibuf merges should + be skipped */ + ulint id) /* in: space id */ { - fil_space_t* space; - char* name2; - fil_system_t* system = fil_system; - - ut_a(system); - ut_a(name); - -#ifndef UNIV_BASIC_LOG_DEBUG - /* Spaces with an odd id number are reserved to replicate spaces - used in log debugging */ + fil_system_t* system = fil_system; + fil_space_t* space; - ut_anp((purpose == FIL_LOG) || (id % 2 == 0)); -#endif mutex_enter(&(system->mutex)); - space = mem_alloc(sizeof(fil_space_t)); - - name2 = mem_alloc(ut_strlen(name) + 1); + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - ut_strcpy(name2, name); + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: trying to do ibuf merge to a dropped tablespace %lu\n", id); + } - space->name = name2; - space->id = id; - space->purpose = purpose; - space->size = 0; + if (space == NULL || space->stop_ibuf_merges) { + mutex_exit(&(system->mutex)); - space->n_reserved_extents = 0; - - UT_LIST_INIT(space->chain); - space->magic_n = FIL_SPACE_MAGIC_N; + return(TRUE); + } - space->ibuf_data = NULL; - - rw_lock_create(&(space->latch)); - rw_lock_set_level(&(space->latch), SYNC_FSP); - - HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + space->n_pending_ibuf_merges++; - UT_LIST_ADD_LAST(space_list, system->space_list, space); - mutex_exit(&(system->mutex)); + + return(FALSE); } /*********************************************************************** -Frees a space object from a file system. Closes the files in the chain -but does not delete them. */ +Decrements the count of pending insert buffer page merges. */ void -fil_space_free( -/*===========*/ +fil_decr_pending_ibuf_merges( +/*========================*/ ulint id) /* in: space id */ { + fil_system_t* system = fil_system; fil_space_t* space; - fil_node_t* fil_node; - fil_system_t* system = fil_system; mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: decrementing ibuf merge of a dropped tablespace %lu\n", id); + } - UT_LIST_REMOVE(space_list, system->space_list, space); + if (space != NULL) { + space->n_pending_ibuf_merges--; + } - ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + mutex_exit(&(system->mutex)); +} - fil_node = UT_LIST_GET_FIRST(space->chain); +/*********************************************************************** +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. */ - ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain)); +ibool +fil_delete_tablespace( +/*==================*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char path[OS_FILE_MAX_PATH]; - while (fil_node != NULL) { - fil_node_free(fil_node, system, space); + ut_a(id != 0); +stop_ibuf_merges: + mutex_enter(&(system->mutex)); - fil_node = UT_LIST_GET_FIRST(space->chain); - } + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space != NULL) { + space->stop_ibuf_merges = TRUE; + + if (space->n_pending_ibuf_merges == 0) { + mutex_exit(&(system->mutex)); + + count = 0; + + goto try_again; + } else { + if (count > 5000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to delete tablespace %s,\n" +"InnoDB: but there are %lu pending ibuf merges on it.\n" +"InnoDB: Loop %lu.\n", space->name, space->n_pending_ibuf_merges, count); + } + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + count++; + + goto stop_ibuf_merges; + } + } + + mutex_exit(&(system->mutex)); + count = 0; + +try_again: + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: cannot delete tablespace %lu because it is not found\n" +"InnoDB: in the tablespace memory cache.\n", id); + + mutex_exit(&(system->mutex)); - ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain)); - ut_ad(0 == UT_LIST_GET_LEN(space->chain)); + return(FALSE); + } + + ut_a(space); + ut_a(strlen(space->name) < OS_FILE_MAX_PATH); + ut_a(space->n_pending_ibuf_merges == 0); + + strcpy(path, space->name); + + space->is_being_deleted = TRUE; + + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (space->n_pending_flushes > 0 || node->n_pending > 0) { + if (count > 1000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to delete tablespace %s,\n" +"InnoDB: but there are %lu flushes and %lu pending i/o's on it\n" +"InnoDB: Loop %lu.\n", space->name, space->n_pending_flushes, node->n_pending, + count); + } + mutex_exit(&(system->mutex)); + os_thread_sleep(20000); + + count++; + + goto try_again; + } mutex_exit(&(system->mutex)); - mem_free(space->name); - mem_free(space); + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since we have set space->is_being_deleted = TRUE, readahead + or ibuf merge can no longer read more pages of this tablespace to the + buffer pool. Thus we can clean the tablespace out of the buffer pool + completely and permanently. The flag is_being_deleted also prevents + fil_flush() from being applied to this tablespace. */ + + buf_LRU_invalidate_tablespace(id); + + success = fil_space_free(id); + + if (success) { + success = os_file_delete(path); + + if (success) { + + return(TRUE); + } + } + + return(FALSE); } /*********************************************************************** -Returns the size of the space in pages. */ +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but +1) we do not drop the table from the data dictionary; +2) we remove all insert buffer entries for the tablespace immediately; in DROP +TABLE they are only removed gradually in the background; +3) when the user does IMPORT TABLESPACE, the tablespace will have the same id +as it originally had. */ -ulint -fil_space_get_size( -/*===============*/ - /* out: space size */ +ibool +fil_discard_tablespace( +/*===================*/ + /* out: TRUE if success */ ulint id) /* in: space id */ { - fil_space_t* space; + ibool success; + + success = fil_delete_tablespace(id); + + if (!success) { + fprintf(stderr, +"InnoDB: Warning: cannot delete tablespace %lu in DISCARD TABLESPACE.\n" +"InnoDB: But let us remove the insert buffer entries for this tablespace.\n", + id); + } + + /* Remove all insert buffer entries for the tablespace */ + + ibuf_delete_for_discarded_space(id); + + return(TRUE); +} + +/*********************************************************************** +Renames the memory cache structures of a single-table tablespace. */ +static +ibool +fil_rename_tablespace_in_mem( +/*=========================*/ + /* out: TRUE if success */ + fil_space_t* space, /* in: tablespace memory object */ + fil_node_t* node, /* in: file node of that tablespace */ + char* path) /* in: new name */ +{ fil_system_t* system = fil_system; - ulint size; + fil_space_t* space2; + char* old_name = space->name; + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(old_name), + space2, 0 == strcmp(old_name, space2->name)); + if (space != space2) { + fprintf(stderr, +"InnoDB: Error: cannot find %s in tablespace memory cache\n", old_name); - ut_ad(system); + return(FALSE); + } + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(path), + space2, 0 == strcmp(path, space2->name)); + if (space2 != NULL) { + fprintf(stderr, +"InnoDB: Error: %s is already in tablespace memory cache\n", path); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + mem_free(space->name); + mem_free(node->name); + + space->name = mem_alloc(strlen(path) + 1); + node->name = mem_alloc(strlen(path) + 1); + + strcpy(space->name, path); + strcpy(node->name, path); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(path), space); + return(TRUE); +} + +/*********************************************************************** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. */ + +ibool +fil_rename_tablespace( +/*==================*/ + /* out: TRUE if success */ + char* old_name, /* in: old table name in the standard + databasename/tablename format of InnoDB */ + ulint id, /* in: space id */ + char* new_name) /* in: new table name in the standard + databasename/tablename format of InnoDB */ +{ + fil_system_t* system = fil_system; + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char old_path[OS_FILE_MAX_PATH]; + char path[OS_FILE_MAX_PATH]; + + ut_a(id != 0); +retry: + count++; + + if (count > 1000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: problems renaming %s to %s, %lu iterations\n", + old_name, new_name, count); + } mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - size = space->size; + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: cannot find space id %lu from the tablespace memory cache\n" +"InnoDB: though the table %s in a rename operation should have that id\n", + id, old_name); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (count > 25000) { + space->stop_ios = FALSE; + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + /* We temporarily close the .ibd file because we do not trust that + operating systems can rename an open file. For the closing we have to + wait until there are no pending i/o's or flushes on the file. */ + + space->stop_ios = TRUE; + + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (node->n_pending > 0 || node->n_pending_flushes > 0) { + /* There are pending i/o's or flushes, sleep for a while and + retry */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + goto retry; + + } else if (node->modification_counter > node->flush_counter) { + /* Flush the space */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + fil_flush(id); + + goto retry; + + } else if (node->open) { + /* Close the file */ + + fil_node_close_file(node, system); + } + + /* Check that the old name in the space is right */ + ut_a(strlen(old_name) < OS_FILE_MAX_PATH - 10); + + sprintf(old_path, "./%s.ibd", old_name); + + srv_normalize_path_for_win(old_path); + + ut_a(strcmp(space->name, old_path) == 0); + ut_a(strcmp(node->name, old_path) == 0); + + /* Rename the tablespace and the node in the memory cache */ + + ut_a(strlen(new_name) < OS_FILE_MAX_PATH - 10); + + sprintf(path, "./%s.ibd", new_name); + + srv_normalize_path_for_win(path); + + success = fil_rename_tablespace_in_mem(space, node, path); + + if (!success) { + + goto func_exit; + } + + success = os_file_rename(old_path, path); + + if (!success) { + /* We have to revert the changes we made to the tablespace + memory cache */ + + ut_a(fil_rename_tablespace_in_mem(space, node, old_path)); + } +func_exit: + space->stop_ios = FALSE; + mutex_exit(&(system->mutex)); - return(size); + return(success); } /*********************************************************************** -Checks if the pair space, page_no refers to an existing page in a -tablespace file space. */ +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. */ + +ulint +fil_create_new_single_table_tablespace( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + ulint* space_id, /* out: space id */ + char* tablename, /* in: the table name in the usual + databasename/tablename format of InnoDB */ + ulint size) /* in: the initial size of the tablespace file + in pages, must be > 0 */ +{ + os_file_t file; + ibool ret; + ulint err; + byte* page; + ibool success; + char path[OS_FILE_MAX_PATH]; + + ut_a(strlen(tablename) < OS_FILE_MAX_PATH - 10); + + sprintf(path, "./%s.ibd", tablename); + + srv_normalize_path_for_win(path); + + file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + if (ret == FALSE) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error creating file %s.\n", path); + + /* The following call will print an error message */ + + err = os_file_get_last_error(TRUE); + + if (err == OS_FILE_ALREADY_EXISTS) { + fprintf(stderr, +"InnoDB: The file already exists though the corresponding table did not\n" +"InnoDB: exist in the InnoDB data dictionary. Have you moved InnoDB\n" +"InnoDB: .ibd files around without using the SQL commands\n" +"InnoDB: DISCARD TABLESPACE and IMPORT TABLESPACE, or did\n" +"InnoDB: mysqld crash in the middle of CREATE TABLE? You can\n" +"InnoDB: resolve the problem by removing the file %s\n" +"InnoDB: under the 'datadir' of MySQL.\n", path); + + return(DB_TABLESPACE_ALREADY_EXISTS); + } + + if (err == OS_FILE_DISK_FULL) { + + return(DB_OUT_OF_FILE_SPACE); + } + + return(DB_ERROR); + } + + page = ut_malloc(UNIV_PAGE_SIZE); + + ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0); + + if (!ret) { + ut_free(page); + os_file_close(file); + os_file_delete(path); + + return(DB_OUT_OF_FILE_SPACE); + } + + *space_id = fil_assign_new_space_id(); + + if (*space_id == ULINT_UNDEFINED) { + ut_free(page); + os_file_close(file); + os_file_delete(path); + + return(DB_ERROR); + } + + /* We have to write the space id to the file immediately and flush the + file to disk. This is because in crash recovery we must be aware what + tablespaces exist and what are their space id's, so that we can apply + the log records to the right file. It may take quite a while until + buffer pool flush algorithms write anything to the file and flush it to + disk. If we would not write here anything, the file would be filled + with zeros from the call of os_file_set_size(), until a buffer pool + flush would write to it. */ + + memset(page, '\0', UNIV_PAGE_SIZE); + + fsp_header_write_space_id(page, *space_id); + + buf_flush_init_for_writing(page, ut_dulint_zero, *space_id, 0); + + ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE); + + ut_free(page); + + if (!ret) { + fprintf(stderr, +"InnoDB: Error: could not write the first page to tablespace %s\n", path); + + os_file_close(file); + os_file_delete(path); + + return(DB_ERROR); + } + + ret = os_file_flush(file); + + if (!ret) { + fprintf(stderr, +"InnoDB: Error: file flush of tablespace %s failed\n", path); + + os_file_close(file); + os_file_delete(path); + + return(DB_ERROR); + } + + os_file_close(file); + + if (*space_id == ULINT_UNDEFINED) { + os_file_delete(path); + + return(DB_ERROR); + } + + success = fil_space_create(path, *space_id, FIL_TABLESPACE); + + if (!success) { + os_file_delete(path); + + return(DB_ERROR); + } + + fil_node_create(path, size, *space_id, FALSE); + + return(DB_SUCCESS); +} + +/************************************************************************ +Tries to open a single-table tablespace and checks the space id is right in +it. If does not succeed, prints an error message to the .err log. This +function is used to open the tablespace when we load a table definition +to the dictionary cache. NOTE that we assume this operation is used under the +protection of the dictionary mutex, so that two users cannot race here. This +operation does not leave the file associated with the tablespace open, but +closes it after we have looked at the space id in it. */ ibool -fil_check_adress_in_tablespace( -/*===========================*/ - /* out: TRUE if the address is meaningful */ +fil_open_single_table_tablespace( +/*=============================*/ + /* out: TRUE if success */ ulint id, /* in: space id */ - ulint page_no)/* in: page number */ + char* name) /* in: table name in the databasename/tablename + format */ { + os_file_t file; + char* filepath; + ibool success; + byte* page; + ulint space_id; + ibool ret = TRUE; + + filepath = ut_malloc(OS_FILE_MAX_PATH); + + ut_a(strlen(name) < OS_FILE_MAX_PATH - 10); + + sprintf(filepath, "./%s.ibd", name); + + srv_normalize_path_for_win(filepath); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: trying to open a table, but could not\n" +"InnoDB: open the tablespace file %s!\n", filepath); + fprintf(stderr, +"InnoDB: have you moved InnoDB .ibd files around without using the\n" +"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n"); + + ut_free(filepath); + + return(FALSE); + } + + /* Read the first page of the tablespace */ + + page = ut_malloc(UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + + if (space_id != id) { + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: tablespace id in file %s is %lu, but in the InnoDB\n" +"InnoDB: data dictionary it is %lu.\n", filepath, space_id, id); + fprintf(stderr, +"InnoDB: Have you moved InnoDB .ibd files around without using the\n" +"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n"); + + ret = FALSE; + + goto func_exit; + } + + success = fil_space_create(filepath, space_id, FIL_TABLESPACE); + + if (!success) { + goto func_exit; + } + + /* We do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(page); + ut_free(filepath); + + return(ret); +} + +/************************************************************************ +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.c data structures. */ +static +void +fil_load_single_table_tablespace( +/*=============================*/ + char* dbname, /* in: database name */ + char* filename) /* in: file name (not a path), including the + .ibd extension */ +{ + os_file_t file; + char* filepath; + ibool success; + byte* page; + ulint space_id; + ulint size_low; + ulint size_high; + ib_longlong size; + + filepath = ut_malloc(OS_FILE_MAX_PATH); + + ut_a(strlen(dbname) + strlen(filename) < OS_FILE_MAX_PATH - 10); + + sprintf(filepath, "./%s/%s", dbname, filename); + + srv_normalize_path_for_win(filepath); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, +"InnoDB: Error: could not open single-table tablespace file\n" +"InnoDB: %s!", filepath); + + ut_free(filepath); + + return; + } + + success = os_file_get_size(file, &size_low, &size_high); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, +"InnoDB: Error: could not measure the size of single-table tablespace file\n" +"InnoDB: %s!", filepath); + + os_file_close(file); + ut_free(filepath); + + return; + } + + size = (((ib_longlong)size_high) << 32) + (ib_longlong)size_low; + + if (size < 4 * UNIV_PAGE_SIZE) { + fprintf(stderr, +"InnoDB: Error: the size of single-table tablespace file %s\n" +"InnoDB: is only %lu %lu, should be at least %lu!", filepath, size_high, + size_low, (ulint)4 * UNIV_PAGE_SIZE); + os_file_close(file); + ut_free(filepath); + + return; + } + + /* Read the first page of the tablespace */ + + page = ut_malloc(UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + + if (space_id == ULINT_UNDEFINED || space_id == 0) { + fprintf(stderr, +"InnoDB: Error: tablespace id %lu in file %s is not sensible\n", space_id, + filepath); + goto func_exit; + } + + success = fil_space_create(filepath, space_id, FIL_TABLESPACE); + + if (!success) { + + goto func_exit; + } + + /* We do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(page); + ut_free(filepath); +} + +/************************************************************************ +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. */ + +ulint +fil_load_single_table_tablespaces(void) +/*===================================*/ + /* out: DB_SUCCESS or error number */ +{ + int ret; + char* dbpath; + os_file_dir_t dir; + os_file_dir_t dbdir; + os_file_stat_t dbinfo; + os_file_stat_t fileinfo; + + /* The datadir of MySQL is always the default directory of mysqld */ + + dir = os_file_opendir((char*)".", TRUE); + + if (dir == NULL) { + + return(DB_ERROR); + } + + dbpath = ut_malloc(OS_FILE_MAX_PATH); + + /* Scan all directories under the datadir. They are the database + directories of MySQL. */ + + ret = os_file_readdir_next_file((char*)".", dir, &dbinfo); + + while (ret == 0) { + /* printf("Looking at %s in datadir\n", dbinfo.name); */ + + if (dbinfo.type == OS_FILE_TYPE_FILE + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + + goto next_datadir_item; + } + + /* We found a symlink or a directory; try opening it to see + if a symlink is a directory */ + + ut_a(strlen(dbinfo.name) < OS_FILE_MAX_PATH - 10); + + sprintf(dbpath, "./%s", dbinfo.name); + + srv_normalize_path_for_win(dbpath); + + dbdir = os_file_opendir(dbpath, FALSE); + + if (dbdir != NULL) { + /* printf("Opened dir %s\n", dbinfo.name); */ + + /* We found a database directory; loop through it, + looking for possible .ibd files in it */ + + ret = os_file_readdir_next_file(dbpath, dbdir, + &fileinfo); + while (ret == 0) { + /* printf( +" Looking at file %s\n", fileinfo.name); */ + + if (fileinfo.type == OS_FILE_TYPE_DIR + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + goto next_file_item; + } + + /* We found a symlink or a file */ + if (strlen(fileinfo.name) > 4 + && 0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".ibd")) { + /* The name ends in .ibd; try opening + the file */ + fil_load_single_table_tablespace( + dbinfo.name, fileinfo.name); + } +next_file_item: + ret = os_file_readdir_next_file(dbpath, dbdir, + &fileinfo); + } + + if (0 != os_file_closedir(dbdir)) { + fprintf(stderr, +"InnoDB: Warning: could not close database directory %s\n", dbpath); + } + } + +next_datadir_item: + ret = os_file_readdir_next_file((char*)".", dir, &dbinfo); + } + + ut_free(dbpath); + + /* At the end of directory we should get 1 as the return value, -1 + if there was an error */ + if (ret != 1) { + fprintf(stderr, +"InnoDB: Error: os_file_readdir_next_file returned %d in MySQL datadir\n", + ret); + os_file_closedir(dir); + + return(DB_ERROR); + } + + if (0 != os_file_closedir(dir)) { + fprintf(stderr, +"InnoDB: Error: could not close MySQL datadir\n"); + + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/************************************************************************ +If we need crash recovery, and we have called +fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(), +we can call this function to print an error message of orphaned .ibd files +for which there is not a data dictionary entry with a matching table name +and space id. */ + +void +fil_print_orphaned_tablespaces(void) +/*================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space) { + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && !space->mark) { + fprintf(stderr, +"InnoDB: Warning: tablespace %s of id %lu has no matching table in\n" +"InnoDB: the InnoDB data dixtionary.\n", space->name, space->id); + } + + space = UT_LIST_GET_NEXT(space_list, space); + + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. */ + +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + /* out: TRUE if does not exist or is being\ + deleted */ + ulint id, /* in: space id */ + ib_longlong version)/* in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +{ + fil_system_t* system = fil_system; fil_space_t* space; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + if (version != -1LL && space->tablespace_version != version) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace exists in the memory cache. */ + +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + /* out: TRUE if exists */ + ulint id) /* in: space id */ +{ fil_system_t* system = fil_system; - ulint size; - ibool ret; - + fil_space_t* space; + ut_ad(system); mutex_enter(&(system->mutex)); @@ -755,24 +2351,297 @@ fil_check_adress_in_tablespace( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); if (space == NULL) { - ret = FALSE; - } else { - size = space->size; + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. */ - if (page_no > size) { - ret = FALSE; - } else if (space->purpose != FIL_TABLESPACE) { - ret = FALSE; +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + /* out: TRUE if a matching tablespace + exists in the memory cache */ + ulint id, /* in: space id */ + char* name, /* in: table name in the standard + 'databasename/tablename' format */ + ibool mark_space, /* in: in crash recovery, at database startup + we mark all spaces which have an associated + table in the InnoDB data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist) + /* in: print detailed error information to + the .err log if a matching tablespace is + not found from memory */ +{ + fil_system_t* system = fil_system; + fil_space_t* namespace; + fil_space_t* space; + char path[OS_FILE_MAX_PATH]; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + sprintf(path, "./%s.ibd", name); + srv_normalize_path_for_win(path); + + /* Look if there is a space with the same id */ + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + /* Look if there is a space with the same name; the name is the + directory path from the datadir to the file */ + + HASH_SEARCH(name_hash, system->name_hash, + ut_fold_string(path), namespace, + 0 == strcmp(namespace->name, path)); + if (!print_error_if_does_not_exist) { + if (space && space == namespace) { + if (mark_space) { + space->mark = TRUE; + } + + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (space == NULL) { + if (namespace == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s in InnoDB data dictionary has tablespace\n" +"InnoDB: id %lu, but tablespace with that id or name does not exist. Have\n" +"InnoDB: you deleted or moved .ibd files? We cannot open table %s now.\n", + name, id, name); } else { - ret = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s in InnoDB data dictionary has tablespace\n" +"InnoDB: id %lu, but tablespace with that id does not exist. There is\n" +"InnoDB: a tablespace of name %s and id %lu, though. Have\n" +"InnoDB: you deleted or moved .ibd files? We cannot open table %s now.\n", + name, id, namespace->name, namespace->id, name); } + + mutex_exit(&(system->mutex)); + + return(FALSE); } - + + if (0 != strcmp(space->name, path)) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s in InnoDB data dictionary has tablespace\n" +"InnoDB: id %lu, but tablespace with that id has name %s. Have you\n" +"InnoDB: deleted or moved .ibd files? We cannot open table %s now.\n", + name, id, space->name, name); + if (namespace != NULL) { + fprintf(stderr, +"InnoDB: There is a tablespace with the right name %s, but its id is %lu.\n", + namespace->name, namespace->id); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + ut_a(space == namespace); + + if (mark_space) { + space->mark = TRUE; + } + mutex_exit(&(system->mutex)); - return(ret); + return(TRUE); +} + +/************************************************************************** +Tries to extend a data file by the number of pages given. Fractions of 1 MB +are ignored. The tablespace must be cached in the memory cache. */ + +ibool +fil_extend_last_data_file( +/*======================*/ + /* out: TRUE if success, also if we run + out of disk space we may return TRUE */ + ulint* actual_increase,/* out: number of pages we were able to + extend, here the original size of the file and + the resulting size of the file are rounded + downwards to a full megabyte, and the + difference expressed in pages is returned */ + ulint space_id, /* in: space id */ + ulint size, /* in: current size of the space in pages, as + stored in the fsp header */ + ulint size_increase) /* in: try to extend this many pages */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ibool success; + ulint i; + + fil_mutex_enter_and_prepare_for_io(space_id); + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + ut_a(space); + + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + if (UT_LIST_GET_LEN(space->chain) == 1 && node->size < size) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Fatal error: space %s id %lu size stored in header is %lu pages\n" +"InnoDB: but actual size is only %lu pages (possibly rounded downwards)!\n" +"InnoDB: Cannot continue operation!\n", space->name, space->id, size, + node->size); + exit(1); + } + + buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + memset(buf, '\0', 1024 * 1024); + + for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); + i++) { + /* If we use native Windows aio, then we use it also in this + write */ + + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, + node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), + 1024 * 1024, NULL, NULL); + + if (!success) { + break; + } + + node->size += ((1024 * 1024) / UNIV_PAGE_SIZE); + space->size += ((1024 * 1024) / UNIV_PAGE_SIZE); + + os_has_said_disk_full = FALSE; + } + + mem_free(buf2); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + mutex_exit(&(system->mutex)); + + *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE); + + fil_flush(space_id); + + if (space_id == 0) { + srv_data_file_sizes[srv_n_data_files - 1] += *actual_increase; + } + + return(TRUE); } +/************************************************************************** +Tries to extend a data file so that it would accommodate the number of pages +given. The tablespace must be cached in the memory cache. */ + +ibool +fil_extend_data_file_with_pages( +/*============================*/ + /* out: TRUE if success */ + ulint space_id, /* in: space id, must be != 0 */ + ulint size, /* in: current size of the space in pages, as + stored in the fsp header */ + ulint size_after_extend)/* in: desired size in pages after the + extension, should be less than 4 GB (this + function is primarily intended for increasing + the data file size from < 64 pages to up to + 64 pages) */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ibool success; + + ut_a(space_id != 0); + ut_a(size_after_extend < 64 * 4096); + ut_a(size_after_extend >= size); + + fil_mutex_enter_and_prepare_for_io(space_id); + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + ut_a(space); + + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + if (UT_LIST_GET_LEN(space->chain) == 1 && node->size < size) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Fatal error: space %s id %lu size stored in header is %lu pages\n" +"InnoDB: but actual size is only %lu pages (possibly rounded downwards)!\n" +"InnoDB: Cannot continue operation!\n", space->name, space_id, size, + node->size); + exit(1); + } + + buf2 = mem_alloc((1 + size_after_extend - size) * UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + memset(buf, '\0', (size_after_extend - size) * UNIV_PAGE_SIZE); + + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + UNIV_PAGE_SIZE * size, 0, + UNIV_PAGE_SIZE * (size_after_extend - size), + NULL, NULL); + if (success) { + node->size = size_after_extend; + space->size = size_after_extend; + + os_has_said_disk_full = FALSE; + } + + mem_free(buf2); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + mutex_exit(&(system->mutex)); + + fil_flush(space_id); + + return(success); +} + +/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ + /*********************************************************************** Tries to reserve free extents in a file space. */ @@ -784,8 +2653,8 @@ fil_space_reserve_free_extents( ulint n_free_now, /* in: number of free extents now */ ulint n_to_reserve) /* in: how many one wants to reserve */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ibool success; ut_ad(system); @@ -794,6 +2663,8 @@ fil_space_reserve_free_extents( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + if (space->n_reserved_extents + n_to_reserve > n_free_now) { success = FALSE; } else { @@ -815,8 +2686,8 @@ fil_space_release_free_extents( ulint id, /* in: space id */ ulint n_reserved) /* in: how many one reserved */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -824,6 +2695,7 @@ fil_space_release_free_extents( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); ut_a(space->n_reserved_extents >= n_reserved); space->n_reserved_extents -= n_reserved; @@ -840,8 +2712,8 @@ fil_space_get_n_reserved_extents( /*=============================*/ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ulint n; ut_ad(system); @@ -859,204 +2731,94 @@ fil_space_get_n_reserved_extents( return(n); } +/*============================ FILE I/O ================================*/ + /************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + Prepares a file node for i/o. Opens the file if it is closed. Updates the pending i/o's field in the node and the system appropriately. Takes the node -off the LRU list if it is in the LRU list. */ +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ static void fil_node_prepare_for_io( /*====================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system, /* in: file system */ + fil_system_t* system, /* in: tablespace memory cache */ fil_space_t* space) /* in: space */ { - ibool ret; - fil_node_t* last_node; - ut_ad(node && system && space); ut_ad(mutex_own(&(system->mutex))); + if (system->n_open > system->max_n_open + 5) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: open files %lu exceeds the limit %lu\n", + system->n_open, system->max_n_open); + } + if (node->open == FALSE) { - /* File is closed */ + /* File is closed: open it */ ut_a(node->n_pending == 0); - /* If too many files are open, close one */ - - if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU) - == system->max_n_open) { - - ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - - last_node = UT_LIST_GET_LAST(system->LRU); - - if (last_node == NULL) { - fprintf(stderr, - "InnoDB: Error: cannot close any file to open another for i/o\n" - "InnoDB: Pending i/o's on %lu files exist\n", - system->n_open_pending); - - ut_a(0); - } - - fil_node_close(last_node, system); - } - - if (space->purpose == FIL_LOG) { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_LOG_FILE, &ret); - } else { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); - } - - ut_a(ret); - - node->open = TRUE; - - system->n_open_pending++; - node->n_pending = 1; - - /* File was closed: the node was not in the LRU list */ - - return; + fil_node_open_file(node, system, space); } - /* File is open */ - if (node->n_pending == 0) { + if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE + && space->id != 0) { /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(LRU, system->LRU, node); - - system->n_open_pending++; - node->n_pending = 1; - } else { - /* There is already a pending i/o-op on the file: the node is - not in the LRU list */ + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - node->n_pending++; + UT_LIST_REMOVE(LRU, system->LRU, node); } + + node->n_pending++; } /************************************************************************ Updates the data structures when an i/o operation finishes. Updates the -pending i/os field in the node and the system appropriately. Puts the node -in the LRU list if there are no other pending i/os. */ +pending i/o's field in the node appropriately. */ static void fil_node_complete_io( /*=================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system, /* in: file system */ - ulint type) /* in: OS_FILE_WRITE or ..._READ */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type) /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ { ut_ad(node); ut_ad(system); ut_ad(mutex_own(&(system->mutex))); + ut_a(node->n_pending > 0); node->n_pending--; - if (type != OS_FILE_READ) { - node->is_modified = TRUE; + if (type == OS_FILE_WRITE) { + system->modification_counter++; + node->modification_counter = system->modification_counter; } - if (node->n_pending == 0) { + if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE + && node->space->id != 0) { /* The node must be put back to the LRU list */ UT_LIST_ADD_FIRST(LRU, system->LRU, node); - - ut_a(system->n_open_pending > 0); - - system->n_open_pending--; - - if (system->n_open_pending == system->max_n_open - 1) { - - os_event_set(system->can_open); - } } } - -/************************************************************************** -Tries to extend a data file by the number of pages given. Any fractions of a -megabyte are ignored. */ - -ibool -fil_extend_last_data_file( -/*======================*/ - /* out: TRUE if success, also if we run - out of disk space we may return TRUE */ - ulint* actual_increase,/* out: number of pages we were able to - extend, here the orginal size of the file and - the resulting size of the file are rounded - downwards to a full megabyte, and the - difference expressed in pages is returned */ - ulint size_increase) /* in: try to extend this many pages */ -{ - fil_node_t* node; - fil_space_t* space; - fil_system_t* system = fil_system; - byte* buf2; - byte* buf; - ibool success; - ulint i; - - mutex_enter(&(system->mutex)); - - HASH_SEARCH(hash, system->spaces, 0, space, space->id == 0); - - ut_a(space); - - node = UT_LIST_GET_LAST(space->chain); - - fil_node_prepare_for_io(node, system, space); - - buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); - buf = ut_align(buf2, UNIV_PAGE_SIZE); - - memset(buf, '\0', 1024 * 1024); - - for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) { - - /* If we use native Windows aio, then also this write is - done using it */ - - success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, - node->name, node->handle, buf, - (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF, - node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), - 1024 * 1024, NULL, NULL); - - if (!success) { - break; - } - - node->size += ((1024 * 1024) / UNIV_PAGE_SIZE); - space->size += ((1024 * 1024) / UNIV_PAGE_SIZE); - - os_has_said_disk_full = FALSE; - } - - mem_free(buf2); - - fil_node_complete_io(node, system, OS_FILE_WRITE); - - mutex_exit(&(system->mutex)); - - *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE); - - fil_flush(0); - - srv_data_file_sizes[srv_n_data_files - 1] += *actual_increase; - - return(TRUE); -} /************************************************************************ Reads or writes data. This operation is asynchronous (aio). */ -void +ulint fil_io( /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, ORed to OS_FILE_LOG, if a log i/o and ORed to OS_AIO_SIMULATED_WAKE_LATER @@ -1081,17 +2843,15 @@ fil_io( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { + fil_system_t* system = fil_system; ulint mode; fil_space_t* space; fil_node_t* node; ulint offset_high; ulint offset_low; - fil_system_t* system; - os_event_t event; ibool ret; ulint is_log; ulint wake_later; - ulint count; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -1102,7 +2862,7 @@ fil_io( ut_ad(byte_offset < UNIV_PAGE_SIZE); ut_ad(buf); ut_ad(len > 0); - ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE); + ut_a((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE); ut_ad(fil_validate()); #ifndef UNIV_LOG_DEBUG /* ibuf bitmap pages must be read in the sync aio mode: */ @@ -1124,82 +2884,45 @@ fil_io( mode = OS_AIO_NORMAL; } - system = fil_system; + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ - count = 0; -loop: - count++; - - /* NOTE that there is a possibility of a hang here: - if the read i/o-handler thread needs to complete - a read by reading from the insert buffer, it may need to - post another read. But if the maximum number of files - are already open, it cannot proceed from here! */ - - mutex_enter(&(system->mutex)); + fil_mutex_enter_and_prepare_for_io(space_id); - if (count < 500 && !is_log && !ibuf_inside() - && system->n_open_pending >= (3 * system->max_n_open) / 4) { - - /* We are not doing an ibuf operation: leave a - safety margin of openable files for possible ibuf - merges needed in page read completion */ - - mutex_exit(&(system->mutex)); - - /* Wake the i/o-handler threads to make sure pending - i/o's are handled and eventually we can open the file */ - - os_aio_simulated_wake_handler_threads(); - - os_thread_sleep(100000); - - if (count > 50) { - fprintf(stderr, - "InnoDB: Warning: waiting for file closes to proceed\n" - "InnoDB: round %lu\n", count); - } - - goto loop; - } - - if (system->n_open_pending == system->max_n_open) { - - /* It is not sure we can open the file if it is closed: wait */ - - event = system->can_open; - os_event_reset(event); - + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (!space) { mutex_exit(&(system->mutex)); - /* Wake the i/o-handler threads to make sure pending - i/o's are handled and eventually we can open the file */ - - os_aio_simulated_wake_handler_threads(); - + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Warning: max allowed number of files is open\n"); - - os_event_wait(event); +" InnoDB: Error: trying to do i/o to a tablespace which does not exist.\n" +"InnoDB: i/o type %lu, space id %lu, page no. %lu, i/o length %lu bytes\n", + type, space_id, block_offset, len); - goto loop; - } - - HASH_SEARCH(hash, system->spaces, space_id, space, - space->id == space_id); - ut_a(space); + return(DB_TABLESPACE_DELETED); + } ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE)); node = UT_LIST_GET_FIRST(space->chain); for (;;) { + if (space->id != 0 && node->size == 0) { + /* We do not know the size of a single-table tablespace + before we open the file */ + + break; + } + if (node == NULL) { fprintf(stderr, - "InnoDB: Error: trying to access page number %lu in space %lu\n" + "InnoDB: Error: trying to access page number %lu in space %lu,\n" + "InnoDB: space name %s,\n" "InnoDB: which is outside the tablespace bounds.\n" "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n", - block_offset, space_id, byte_offset, len, type); + block_offset, space_id, space->name, byte_offset, len, + type); ut_a(0); } @@ -1216,13 +2939,28 @@ loop: /* Open file if closed */ fil_node_prepare_for_io(node, system, space); + /* Check that at least the start offset is within the bounds of a + single-table tablespace */ + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && node->size <= block_offset) { + + fprintf(stderr, + "InnoDB: Error: trying to access page number %lu in space %lu,\n" + "InnoDB: space name %s,\n" + "InnoDB: which is outside the tablespace bounds.\n" + "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n", + block_offset, space_id, space->name, byte_offset, len, + type); + ut_a(0); + } + /* Now we have made the changes in the data structures of system */ mutex_exit(&(system->mutex)); /* Calculate the low 32 bits and the high 32 bits of the file offset */ offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT)); - offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF) + offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL) + byte_offset; ut_a(node->size - block_offset >= @@ -1250,6 +2988,8 @@ loop: ut_ad(fil_validate()); } + + return(DB_SUCCESS); } /************************************************************************ @@ -1257,9 +2997,12 @@ Reads data from a space to a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. */ -void +ulint fil_read( /*=====*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -1273,8 +3016,8 @@ fil_read( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { - fil_io(OS_FILE_READ, sync, space_id, block_offset, byte_offset, len, - buf, message); + return(fil_io(OS_FILE_READ, sync, space_id, block_offset, + byte_offset, len, buf, message)); } /************************************************************************ @@ -1282,9 +3025,12 @@ Writes data to a space from a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. */ -void +ulint fil_write( /*======*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -1298,8 +3044,8 @@ fil_write( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { - fil_io(OS_FILE_WRITE, sync, space_id, block_offset, byte_offset, len, - buf, message); + return(fil_io(OS_FILE_WRITE, sync, space_id, block_offset, + byte_offset, len, buf, message)); } /************************************************************************** @@ -1314,16 +3060,16 @@ fil_aio_wait( ulint segment) /* in: the number of the segment in the aio array to wait for */ { + fil_system_t* system = fil_system; ibool ret; fil_node_t* fil_node; - fil_system_t* system = fil_system; void* message; ulint type; ut_ad(fil_validate()); if (os_aio_use_native_aio) { - srv_io_thread_op_info[segment] = (char *) "native aio handle"; + srv_io_thread_op_info[segment] = (char *) "handle native aio"; #ifdef WIN_ASYNC_IO ret = os_aio_windows_handle(segment, 0, &fil_node, &message, &type); @@ -1334,7 +3080,7 @@ fil_aio_wait( ut_a(0); #endif } else { - srv_io_thread_op_info[segment] =(char *)"simulated aio handle"; + srv_io_thread_op_info[segment] =(char *)"handle simulated aio"; ret = os_aio_simulated_handle(segment, (void**) &fil_node, &message, &type); @@ -1353,6 +3099,10 @@ fil_aio_wait( ut_ad(fil_validate()); /* Do the i/o handling */ + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in tablespace 0, you have to be very careful not to introduce + deadlocks in the i/o system. We keep tablespace 0 data files always + open, and use a special i/o thread to serve insert buffer requests. */ if (buf_pool_is_block(message)) { srv_io_thread_op_info[segment] = @@ -1365,7 +3115,8 @@ fil_aio_wait( } /************************************************************************** -Flushes to disk possible writes cached by the OS. */ +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ void fil_flush( @@ -1377,41 +3128,79 @@ fil_flush( fil_space_t* space; fil_node_t* node; os_file_t file; + ib_longlong old_mod_counter; mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, space_id, space, - space->id == space_id); - ut_a(space); + space->id == space_id); + if (!space || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return; + } + space->n_pending_flushes++; /* prevent dropping of the space while + we are flushing */ node = UT_LIST_GET_FIRST(space->chain); while (node) { - if (node->open && node->is_modified) { - file = node->handle; + if (node->modification_counter > node->flush_counter) { + ut_a(node->open); + + /* We want to flush the changes at least up to + old_mod_counter */ + old_mod_counter = node->modification_counter; - node->is_modified = FALSE; - if (space->purpose == FIL_TABLESPACE) { fil_n_pending_tablespace_flushes++; } else { fil_n_pending_log_flushes++; } +#ifdef __WIN__ + if (node->is_raw_disk) { - mutex_exit(&(system->mutex)); + goto skip_flush; + } +#endif +retry: + if (node->n_pending_flushes > 0) { + /* We want to avoid calling os_file_flush() on + the file twice at the same time, because we do + not know what bugs OS's may contain in file + i/o; sleep for a while */ - /* Note that it is not certain, when we have - released the mutex above, that the file of the - handle is still open: we assume that the OS - will not crash or trap even if we pass a handle - to a closed file below in os_file_flush! */ + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + mutex_enter(&(system->mutex)); + + if (node->flush_counter >= old_mod_counter) { + + goto skip_flush; + } + + goto retry; + } + + ut_a(node->open); + file = node->handle; + node->n_pending_flushes++; + + mutex_exit(&(system->mutex)); /* printf("Flushing to file %s\n", node->name); */ - - os_file_flush(file); - + os_file_flush(file); + mutex_enter(&(system->mutex)); + node->n_pending_flushes--; +skip_flush: + if (node->flush_counter < old_mod_counter) { + node->flush_counter = old_mod_counter; + } + if (space->purpose == FIL_TABLESPACE) { fil_n_pending_tablespace_flushes--; } else { @@ -1422,11 +3211,13 @@ fil_flush( node = UT_LIST_GET_NEXT(chain, node); } + space->n_pending_flushes--; + mutex_exit(&(system->mutex)); } /************************************************************************** -Flushes to disk writes in file spaces of the given type possibly cached by +Flushes to disk the writes in file spaces of the given type possibly cached by the OS. */ void @@ -1443,13 +3234,17 @@ fil_flush_file_spaces( while (space) { if (space->purpose == purpose) { + space->n_pending_flushes++; /* prevent dropping of the + space while we are + flushing */ mutex_exit(&(system->mutex)); fil_flush(space->id); mutex_enter(&(system->mutex)); - } + space->n_pending_flushes--; + } space = UT_LIST_GET_NEXT(space_list, space); } @@ -1457,20 +3252,18 @@ fil_flush_file_spaces( } /********************************************************************** -Checks the consistency of the file system. */ +Checks the consistency of the tablespace cache. */ ibool fil_validate(void) /*==============*/ /* out: TRUE if ok */ { + fil_system_t* system = fil_system; fil_space_t* space; fil_node_t* fil_node; - ulint pending_count = 0; - fil_system_t* system; + ulint n_open = 0; ulint i; - - system = fil_system; mutex_enter(&(system->mutex)); @@ -1481,36 +3274,35 @@ fil_validate(void) space = HASH_GET_FIRST(system->spaces, i); while (space != NULL) { - UT_LIST_VALIDATE(chain, fil_node_t, space->chain); fil_node = UT_LIST_GET_FIRST(space->chain); while (fil_node != NULL) { - if (fil_node->n_pending > 0) { - - pending_count++; ut_a(fil_node->open); } + if (fil_node->open) { + n_open++; + } fil_node = UT_LIST_GET_NEXT(chain, fil_node); } - space = HASH_GET_NEXT(hash, space); } } - ut_a(pending_count == system->n_open_pending); + ut_a(system->n_open == n_open); UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU); fil_node = UT_LIST_GET_FIRST(system->LRU); while (fil_node != NULL) { - ut_a(fil_node->n_pending == 0); ut_a(fil_node->open); + ut_a(fil_node->space->purpose == FIL_TABLESPACE); + ut_a(fil_node->space->id != 0); fil_node = UT_LIST_GET_NEXT(LRU, fil_node); } @@ -1578,4 +3370,4 @@ fil_page_get_type( ut_ad(page); return(mach_read_from_2(page + FIL_PAGE_TYPE)); -} +} |