diff options
Diffstat (limited to 'storage/innobase/os')
-rw-r--r-- | storage/innobase/os/os0file.cc | 1238 | ||||
-rw-r--r-- | storage/innobase/os/os0sync.cc | 935 | ||||
-rw-r--r-- | storage/innobase/os/os0thread.cc | 15 |
3 files changed, 1088 insertions, 1100 deletions
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 0d63b6c091c..047eec7949c 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -125,6 +125,30 @@ static const ulint IO_LOG_SEGMENT = 1; /** Number of retries for partial I/O's */ static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10; +/** Blocks for doing IO, used in the transparent compression +and encryption code. */ +struct Block { + /** Default constructor */ + Block() : m_ptr(), m_in_use() { } + + byte* m_ptr; + + byte pad[CACHE_LINE_SIZE - sizeof(ulint)]; + lock_word_t m_in_use; +}; + +/** For storing the allocated blocks */ +typedef std::vector<Block> Blocks; + +/** Block collection */ +static Blocks* block_cache; + +/** Number of blocks to allocate for sync read/writes */ +static const size_t MAX_BLOCKS = 128; + +/** Block buffer size */ +#define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3)) + /* This specifies the file permissions InnoDB uses when it creates files in Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to my_umask */ @@ -259,7 +283,8 @@ struct Slot { to the caller of os_aio_simulated_handle */ bool io_already_done; - ulint file_block_size;/*!< file block size */ + /*!< file block size */ + ulint file_block_size; /** The file node for which the IO is requested. */ fil_node_t* m1; @@ -307,6 +332,9 @@ struct Slot { /** Length of the block before it was compressed */ uint32 original_len; + /** Buffer block for compressed pages or encrypted pages */ + Block* buf_block; + /** Unaligned buffer for compressed pages */ byte* compressed_ptr; @@ -360,7 +388,7 @@ public: os_offset_t offset, ulint len, ulint* write_size) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** @return number of reserved slots */ ulint pending_io_count() const; @@ -369,7 +397,7 @@ public: @param[in] index Index of the slot in the array @return pointer to slot */ const Slot* at(ulint i) const - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_a(i < m_slots.size()); @@ -378,7 +406,7 @@ public: /** Non const version */ Slot* at(ulint i) - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_a(i < m_slots.size()); @@ -399,14 +427,14 @@ public: /** @return the number of slots per segment */ ulint slots_per_segment() const - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { return(m_slots.size() / m_n_segments); } /** @return accessor for n_segments */ ulint get_n_segments() const - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { return(m_n_segments); } @@ -414,7 +442,7 @@ public: #ifdef UNIV_DEBUG /** @return true if the thread owns the mutex */ bool is_mutex_owned() const - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { return(mutex_own(&m_mutex)); } @@ -441,13 +469,13 @@ public: @param[in,out] slot an already reserved slot @return true on success. */ bool linux_dispatch(Slot* slot) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Accessor for an AIO event @param[in] index Index into the array @return the event at the index */ io_event* io_events(ulint index) - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_a(index < m_events.size()); @@ -458,7 +486,7 @@ public: @param[in] segment Segment for which to get the context @return the AIO context for the segment */ io_context* io_ctx(ulint segment) - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_ad(segment < get_n_segments()); @@ -470,7 +498,7 @@ public: @param[out] io_ctx io_ctx to initialize. @return true on success. */ static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Checks if the system supports native linux aio. On some kernel versions where native aio is supported it won't work on tmpfs. In such @@ -478,7 +506,7 @@ public: and native aio. @return true if supported, false otherwise. */ static bool is_linux_native_aio_supported() - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); #endif /* LINUX_NATIVE_AIO */ #ifdef WIN_ASYNC_IO @@ -520,7 +548,7 @@ public: /** The non asynchronous IO array. @return the synchronous AIO array instance. */ static AIO* sync_array() - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { return(s_sync); } @@ -530,7 +558,7 @@ public: @param[in] segment The local segment. @return the handles for the segment. */ HANDLE* handles(ulint segment) - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_ad(segment < m_handles->size() / slots_per_segment()); @@ -539,7 +567,7 @@ public: /** @return true if no slots are reserved */ bool is_empty() const - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_ad(is_mutex_owned()); return(m_n_reserved == 0); @@ -555,7 +583,7 @@ public: latch_id_t id, ulint n_slots, ulint segments) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Initializes the asynchronous io system. Creates one array each for ibuf and log I/O. Also creates one array each for read and write @@ -574,7 +602,7 @@ public: ulint n_readers, ulint n_writers, ulint n_slots_sync) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Free the AIO arrays */ static void shutdown(); @@ -591,7 +619,7 @@ public: static ulint get_array_and_local_segment( AIO** array, ulint segment) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Select the IO slot array @param[in] type Type of IO, READ or WRITE @@ -602,7 +630,7 @@ public: IORequest& type, bool read_only, ulint mode) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Calculates segment number for a slot. @param[in] array AIO wait array @@ -612,7 +640,7 @@ public: static ulint get_segment_no_from_slot( const AIO* array, const Slot* slot) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Wakes up a simulated AIO I/O-handler thread if it has something to do. @@ -624,7 +652,7 @@ public: @param[in] aio The AIO instance to check @return true if the AIO instance is for reading. */ static bool is_read(const AIO* aio) - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { return(s_reads == aio); } @@ -648,7 +676,7 @@ private: /** Initialise the slots @return DB_SUCCESS or error code */ dberr_t init_slots() - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); /** Wakes up a simulated AIO I/O-handler thread if it has something to do for a local segment in the AIO array. @@ -671,7 +699,7 @@ private: /** Initialise the Linux native AIO data structures @return DB_SUCCESS or error code */ dberr_t init_linux_native_aio() - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); #endif /* LINUX_NATIVE_AIO */ private: @@ -898,6 +926,77 @@ os_aio_windows_handler( IORequest* type); #endif /* WIN_ASYNC_IO */ +#ifdef MYSQL_COMPRESSION_ENCRYPTION +/** Allocate a page for sync IO +@return pointer to page */ +static +Block* +os_alloc_block() +{ + size_t pos; + Blocks& blocks = *block_cache; + size_t i = static_cast<size_t>(my_timer_cycles()); + const size_t size = blocks.size(); + ulint retry = 0; + Block* block; + + DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;); + + for (;;) { + + /* After go through the block cache for 3 times, + allocate a new temporary block. */ + if (retry == MAX_BLOCKS * 3) { + byte* ptr; + + ptr = static_cast<byte*>( + ut_malloc_nokey(sizeof(*block) + + BUFFER_BLOCK_SIZE)); + + block = new (ptr) Block(); + block->m_ptr = static_cast<byte*>( + ptr + sizeof(*block)); + block->m_in_use = 1; + + break; + } + + pos = i++ % size; + + if (TAS(&blocks[pos].m_in_use, 1) == 0) { + block = &blocks[pos]; + break; + } + + os_thread_yield(); + + ++retry; + } + + ut_a(block->m_in_use != 0); + + return(block); +} + +/** Free a page after sync IO +@param[in,own] block The block to free/release */ +static +void +os_free_block(Block* block) +{ + ut_ad(block->m_in_use == 1); + + TAS(&block->m_in_use, 0); + + /* When this block is not in the block cache, and it's + a temporary block, we need to free it directly. */ + if (std::less<Block*>()(block, &block_cache->front()) + || std::greater<Block*>()(block, &block_cache->back())) { + ut_free(block); + } +} +#endif /* MYSQL_COMPRESSION_ENCRYPTION */ + /** Generic AIO Handler methods. Currently handles IO post processing. */ class AIOHandler { public: @@ -920,6 +1019,18 @@ public: } private: + /** Check whether the page was encrypted. + @param[in] slot The slot that contains the IO request + @return true if it was an encyrpted page */ + static bool is_encrypted_page(const Slot* slot) + { +#ifdef MYSQL_ENCRYPTION + return(Encryption::is_encrypted_page(slot->buf)); +#else + return (false); +#endif + } + /** Check whether the page was compressed. @param[in] slot The slot that contains the IO request @return true if it was a compressed page */ @@ -994,14 +1105,6 @@ public: m_offset(offset) { ut_ad(m_n > 0); - - /* If off_t is > 4 bytes in size, then we assume we can pass a - 64-bit address */ - off_t offs = static_cast<off_t>(m_offset); - - if (sizeof(off_t) <= 4 && m_offset != (os_offset_t) offs) { - ib::error() << "file write at offset > 4 GB."; - } } /** Destructor */ @@ -1092,7 +1195,7 @@ os_file_original_page_size(const byte* buf) dberr_t AIOHandler::check_read(Slot* slot, ulint n_bytes) { - dberr_t err; + dberr_t err=DB_SUCCESS; ut_ad(slot->type.is_read()); ut_ad(slot->original_len > slot->len); @@ -1119,10 +1222,29 @@ AIOHandler::check_read(Slot* slot, ulint n_bytes) err = DB_FAIL; } + } else if (is_encrypted_page(slot)) { + ut_a(slot->offset > 0); + + slot->len = slot->original_len; +#ifdef _WIN32 + slot->n_bytes = static_cast<DWORD>(n_bytes); +#else + slot->n_bytes = static_cast<ulint>(n_bytes); +#endif /* _WIN32 */ + + err = io_complete(slot); + ut_a(err == DB_SUCCESS); + } else { err = DB_FAIL; } +#ifdef MYSQL_COMPRESSION_ENCRYPTION + if (slot->buf_block != NULL) { + os_free_block(slot->buf_block); + slot->buf_block = NULL; + } +#endif return(err); } @@ -1131,7 +1253,7 @@ AIOHandler::check_read(Slot* slot, ulint n_bytes) dberr_t AIOHandler::post_io_processing(Slot* slot) { - dberr_t err; + dberr_t err=DB_SUCCESS; ut_ad(slot->is_reserved); @@ -1145,7 +1267,10 @@ AIOHandler::post_io_processing(Slot* slot) && slot->type.is_compressed() && slot->len == static_cast<ulint>(slot->n_bytes))) { - if (!slot->type.is_log() && is_compressed_page(slot)) { +#ifdef MYSQL_COMPRESSION + if (!slot->type.is_log() + && (is_compressed_page(slot) + || is_encrypted_page(slot))) { ut_a(slot->offset > 0); @@ -1170,6 +1295,11 @@ AIOHandler::post_io_processing(Slot* slot) err = DB_SUCCESS; } + if (slot->buf_block != NULL) { + os_free_block(slot->buf_block); + slot->buf_block = NULL; + } +#endif /* MYSQL_COMPRESSION */ } else if ((ulint) slot->n_bytes == (ulint) slot->len) { /* It *must* be a partial read. */ @@ -1220,6 +1350,7 @@ AIO::pending_io_count() const return(reserved); } +#ifdef MYSQL_COMPRESSION /** Compress a data page #param[in] block_size File system block size @param[in] src Source contents to compress @@ -1241,7 +1372,7 @@ os_file_compress_page( ulint compression_level = page_zip_level; ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE); - /* Must be divisible by the file system block size. */ + /* The page size must be a multiple of the OS punch hole size. */ ut_ad(!(src_len % block_size)); /* Shouldn't compress an already compressed page. */ @@ -1255,7 +1386,6 @@ os_file_compress_page( if (page_type == FIL_PAGE_RTREE || block_size == ULINT_UNDEFINED || compression.m_type == Compression::NONE - || block_size >= src_len || src_len < block_size * 2) { *dst_len = src_len; @@ -1367,6 +1497,7 @@ os_file_compress_page( return(dst); } +#endif /* MYSQL_COMPRESSION */ #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP @@ -1547,12 +1678,17 @@ AIO::release_with_mutex(Slot* slot) } /** Creates a temporary file. This function is like tmpfile(3), but -@return temporary file handle, or NULL on error */ +the temporary file is created in the given parameter path. If the path +is NULL then it will create the file in the MySQL server configuration +parameter (--tmpdir). +@param[in] path location for creating temporary file +@@return temporary file handle, or NULL on error */ FILE* -os_file_create_tmpfile() +os_file_create_tmpfile( + const char* path) { FILE* file = NULL; - int fd = innobase_mysql_tmpfile(); + int fd = innobase_mysql_tmpfile(path); if (fd >= 0) { file = fdopen(fd, "w+b"); @@ -1613,6 +1749,7 @@ os_file_io_complete( ulint offset, ulint len) { +#ifdef MYSQL_ENCRYPTION_COMPRESSION /* We never compress/decompress the first page */ ut_a(offset > 0); ut_ad(type.validate()); @@ -1622,11 +1759,19 @@ os_file_io_complete( return(DB_SUCCESS); } else if (type.is_read()) { + dberr_t ret = DB_SUCCESS; + Encryption encryption(type.encryption_algorithm()); ut_ad(!type.is_log()); - return(os_file_decompress_page( - type.is_dblwr_recover(), buf, scratch, len)); + ret = encryption.decrypt(type, buf, src_len, scratch, len); + if (ret == DB_SUCCESS) { + return(os_file_decompress_page( + type.is_dblwr_recover(), + buf, scratch, len)); + } else { + return(ret); + } } else if (type.punch_hole()) { @@ -1660,6 +1805,7 @@ os_file_io_complete( } ut_ad(!type.is_log()); +#endif /* MYSQL_ENCRYPTION_COMPRESSION */ return(DB_SUCCESS); } @@ -1961,6 +2107,7 @@ os_file_create_subdirs_if_needed( return(success ? DB_SUCCESS : DB_ERROR); } +#ifdef MYSQL_COMPRESSION /** Allocate the buffer for IO on a transparently compressed table. @param[in] type IO flags @param[out] buf buffer to read or write @@ -1969,7 +2116,7 @@ os_file_create_subdirs_if_needed( @return pointer to allocated page, compressed data is written to the offset that is aligned on UNIV_SECTOR_SIZE of Block.m_ptr */ static -byte* +Block* os_file_compress_page( IORequest& type, void*& buf, @@ -1981,13 +2128,13 @@ os_file_compress_page( ulint n_alloc = *n * 2; - ut_a(n_alloc < UNIV_PAGE_SIZE_MAX * 2); + ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2); #ifdef HAVE_LZ4 ut_a(type.compression_algorithm().m_type != Compression::LZ4 || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc); #endif - byte* ptr = reinterpret_cast<byte*>(ut_malloc_nokey(n_alloc)); + Block* ptr = reinterpret_cast<Block*>(ut_malloc_nokey(n_alloc)); if (ptr == NULL) { return(NULL); @@ -2009,7 +2156,7 @@ os_file_compress_page( byte* compressed_page; compressed_page = static_cast<byte*>( - ut_align(ptr, UNIV_SECTOR_SIZE)); + ut_align(block->m_ptr, UNIV_SECTOR_SIZE)); byte* buf_ptr; @@ -2037,8 +2184,54 @@ os_file_compress_page( } } - return(ptr); + return(block); } +#endif /* MYSQL_COMPRESSION */ + +#ifdef MYSQL_ENCRYPTION +/** Encrypt a page content when write it to disk. +@param[in] type IO flags +@param[out] buf buffer to read or write +@param[in,out] n number of bytes to read/write, starting from + offset +@return pointer to the encrypted page */ +static +Block* +os_file_encrypt_page( + const IORequest& type, + void*& buf, + ulint* n) +{ + + byte* encrypted_page; + ulint encrypted_len = *n; + byte* buf_ptr; + Encryption encryption(type.encryption_algorithm()); + + ut_ad(!type.is_log()); + ut_ad(type.is_write()); + ut_ad(type.is_encrypted()); + + Block* block = os_alloc_block(); + + encrypted_page = static_cast<byte*>( + ut_align(block->m_ptr, UNIV_SECTOR_SIZE)); + + buf_ptr = encryption.encrypt(type, + reinterpret_cast<byte*>(buf), *n, + encrypted_page, &encrypted_len); + + bool encrypted = buf_ptr != buf; + + if (encrypted) { + + buf = buf_ptr; + *n = encrypted_len; + } + + return(block); +} +#endif /* MYSQL_ENCRYPTION */ #ifndef _WIN32 @@ -2272,7 +2465,7 @@ LinuxAIOHandler::check_state(Slot* slot) ut_ad(slot->io_already_done); - dberr_t err; + dberr_t err = DB_SUCCESS; if (slot->ret == 0) { @@ -2469,7 +2662,7 @@ LinuxAIOHandler::collect() dberr_t LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request) { - dberr_t err; + dberr_t err = DB_SUCCESS; Slot* slot; /* Loop until we have found a completed request. */ @@ -2722,7 +2915,7 @@ AIO::is_linux_native_aio_supported() } else if (!srv_read_only_mode) { /* Now check if tmpdir supports native aio ops. */ - fd = innobase_mysql_tmpfile(); + fd = innobase_mysql_tmpfile(NULL); if (fd < 0) { ib::warn() @@ -2733,7 +2926,7 @@ AIO::is_linux_native_aio_supported() } } else { - os_normalize_path_for_win(srv_log_group_home_dir); + os_normalize_path(srv_log_group_home_dir); ulint dirnamelen = strlen(srv_log_group_home_dir); @@ -3522,11 +3715,11 @@ os_file_create_func( } while (retry); /* We disable OS caching (O_DIRECT) only on data files */ - if (!srv_read_only_mode - && *success - && type != OS_LOG_FILE - && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT - || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { + if (!read_only + && *success + && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE) + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { os_file_set_nocache(file, name, mode_str); } @@ -5300,6 +5493,7 @@ AIO::simulated_put_read_threads_to_sleep() #endif /* !_WIN32*/ +#ifdef MYSQL_COMPRESSION /** Validate the type, offset and number of bytes to read * @param[in] type IO flags @param[in] offset Offset from start of the file @@ -5321,6 +5515,7 @@ os_file_check_args(const IORequest& type, os_offset_t offset, ulint n) ib::error() << "file write at offset > 4 GB."; } } +#endif /* MYSQL_COMPRESSION */ /** Does a syncronous read or write depending upon the type specified In case of partial reads/writes the function tries @@ -5332,7 +5527,7 @@ NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data. @param[in] n number of bytes to read, starting from offset @param[out] err DB_SUCCESS or error code @return number of bytes read/written, -1 if error */ -static __attribute__((warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) ssize_t os_file_io( const IORequest&in_type, @@ -5342,26 +5537,45 @@ os_file_io( os_offset_t offset, dberr_t* err) { - byte* ptr; ulint original_n = n; IORequest type = in_type; - byte* compressed_page; + byte* compressed_page=NULL; ssize_t bytes_returned = 0; +#ifdef MYSQL_COMPRESSION + Block* block=NULL; if (type.is_compressed()) { /* We don't compress the first page of any file. */ ut_ad(offset > 0); - ptr = os_file_compress_page(type, buf, &n); + block = os_file_compress_page(type, buf, &n); compressed_page = static_cast<byte*>( - ut_align(ptr, UNIV_SECTOR_SIZE)); + ut_align(block->m_ptr, UNIV_SECTOR_SIZE)); } else { - ptr = NULL; + block = NULL; compressed_page = NULL; } +#endif /* MYSQL_COMPRESSION */ + +#ifdef MYSQL_ENCRYPTION + /* We do encryption after compression, since if we do encryption + before compression, the encrypted data will cause compression fail + or low compression rate. */ + if (type.is_encrypted() && type.is_write()) { + /* We don't encrypt the first page of any file. */ + Block* compressed_block = block; + ut_ad(offset > 0); + + block = os_file_encrypt_page(type, buf, &n); + + if (compressed_block != NULL) { + os_free_block(compressed_block); + } + } +#endif /* MYSQL_ENCRYPTION */ SyncFileIO sync_file_io(file, buf, n, offset); @@ -5387,14 +5601,15 @@ os_file_io( compressed_page, original_n, static_cast<ulint>(offset), n); - if (ptr != NULL) { - ut_free(ptr); - } - } else { *err = DB_SUCCESS; } +#ifdef MYSQL_COMPRESSION + if (block != NULL) { + os_free_block(block); + } +#endif return(original_n); } @@ -5422,9 +5637,11 @@ os_file_io( sync_file_io.advance(n_bytes); } - if (ptr != NULL) { - ut_free(ptr); +#ifdef MYSQL_COMPRESSION + if (block != NULL) { + os_free_block(block); } +#endif *err = DB_IO_ERROR; @@ -5446,7 +5663,7 @@ os_file_io( @param[in] offset file offset from the start where to read @param[out] err DB_SUCCESS or error code @return number of bytes written, -1 if error */ -static __attribute__((warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) ssize_t os_file_pwrite( IORequest& type, @@ -5478,7 +5695,7 @@ os_file_pwrite( @param[in] offset file offset from the start where to read @param[in] n number of bytes to read, starting from offset @return DB_SUCCESS if request was successful, false if fail */ -static __attribute__((warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) dberr_t os_file_write_page( IORequest& type, @@ -5490,8 +5707,9 @@ os_file_write_page( { dberr_t err; - os_file_check_args(type, offset, n); - + ut_ad(type.validate()); + ut_ad(n > 0); + ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err); if ((ulint) n_bytes != n && !os_has_said_disk_full) { @@ -5530,7 +5748,7 @@ os_file_write_page( @param[in] n number of bytes to read, starting from offset @param[out] err DB_SUCCESS or error code @return number of bytes read, -1 if error */ -static __attribute__((warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) ssize_t os_file_pread( IORequest& type, @@ -5563,7 +5781,7 @@ os_file_pread( @param[out] o number of bytes actually read @param[in] exit_on_err if true then exit on error @return DB_SUCCESS or error code */ -static __attribute__((warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) dberr_t os_file_read_page( IORequest& type, @@ -5578,7 +5796,8 @@ os_file_read_page( os_bytes_read_since_printout += n; - os_file_check_args(type, offset, n); + ut_ad(type.validate()); + ut_ad(n > 0); for (;;) { ssize_t n_bytes; @@ -5595,6 +5814,7 @@ os_file_read_page( } else if ((ulint) n_bytes == n) { +#ifdef MYSQL_COMPRESSION /** The read will succeed but decompress can fail for various reasons. */ @@ -5607,11 +5827,14 @@ os_file_read_page( } else { return(err); } +#else + return(DB_SUCCESS); +#endif /* MYSQL_COMPRESSION */ } - ib::error() - << "Tried to read " << n << " bytes at offset " - << offset << " was only able to read" << n_bytes; + ib::error() << "Tried to read " << n + << " bytes at offset " << offset + << ", but was only able to read " << n_bytes; if (exit_on_err) { @@ -5664,7 +5887,7 @@ and the error type, if should_exit is true then on_error_silent is ignored. @param[in] on_error_silent if true then don't print any message to the log iff it is an unknown non-fatal error @return true if we should retry the operation */ -static __attribute__((warn_unused_result)) +static MY_ATTRIBUTE((warn_unused_result)) bool os_file_handle_error_cond_exit( const char* name, @@ -5785,9 +6008,9 @@ os_file_handle_error_no_exit( message */ void os_file_set_nocache( - os_file_t fd __attribute__((unused)), - const char* file_name __attribute__((unused)), - const char* operation_name __attribute__((unused))) + os_file_t fd MY_ATTRIBUTE((unused)), + const char* file_name MY_ATTRIBUTE((unused)), + const char* operation_name MY_ATTRIBUTE((unused))) { /* some versions of Solaris may not have DIRECTIO_ON */ #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) @@ -6064,6 +6287,13 @@ os_file_punch_hole( os_offset_t off, os_offset_t len) { + /* In this debugging mode, we act as if punch hole is supported, + and then skip any calls to actually punch a hole here. + In this way, Transparent Page Compression is still being tested. */ + DBUG_EXECUTE_IF("ignore_punch_hole", + return(DB_SUCCESS); + ); + #ifdef _WIN32 return(os_file_punch_hole_win32(fh, off, len)); #else @@ -6085,6 +6315,13 @@ Note: On Windows we use the name and on Unices we use the file handle. bool os_is_sparse_file_supported(const char* path, os_file_t fh) { + /* In this debugging mode, we act as if punch hole is supported, + then we skip any calls to actually punch a hole. In this way, + Transparent Page Compression is still being tested. */ + DBUG_EXECUTE_IF("ignore_punch_hole", + return(true); + ); + #ifdef _WIN32 return(os_is_sparse_file_supported_win32(path)); #else @@ -6575,6 +6812,26 @@ os_aio_init( } #endif /* _WIN32 */ + ut_a(block_cache == NULL); + + block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS)); + + for (Blocks::iterator it = block_cache->begin(); + it != block_cache->end(); + ++it) { + + ut_a(it->m_in_use == 0); + ut_a(it->m_ptr == NULL); + + /* Allocate double of max page size memory, since + compress could generate more bytes than orgininal + data. */ + it->m_ptr = static_cast<byte*>( + ut_malloc_nokey(BUFFER_BLOCK_SIZE)); + + ut_a(it->m_ptr != NULL); + } + return(AIO::start(limit, n_readers, n_writers, n_slots_sync)); } @@ -6591,6 +6848,18 @@ os_aio_free() ut_free(os_aio_segment_wait_events); os_aio_segment_wait_events = 0; os_aio_n_segments = 0; + + for (Blocks::iterator it = block_cache->begin(); + it != block_cache->end(); + ++it) { + + ut_a(it->m_in_use == 0); + ut_free(it->m_ptr); + } + + UT_DELETE(block_cache); + + block_cache = NULL; } /** Wakes up all async i/o threads so that they know to exit themselves in @@ -6796,70 +7065,78 @@ AIO::reserve_slot( slot->is_log = type.is_log(); slot->original_len = static_cast<uint32>(len); slot->io_already_done = false; + slot->buf_block = NULL; slot->buf = static_cast<byte*>(buf); +#ifdef MYSQL_COMPRESSION if (srv_use_native_aio && offset > 0 && type.is_write() && type.is_compressed()) { + ulint compressed_len = len; ut_ad(!type.is_log()); release(); - ulint compressed_len = len; - - ulint old_compressed_len; - - old_compressed_len = mach_read_from_2( - slot->buf + FIL_PAGE_COMPRESS_SIZE_V1); + void* src_buf = slot->buf; - if (old_compressed_len > 0) { - old_compressed_len = ut_calc_align( - old_compressed_len + FIL_PAGE_DATA, - slot->type.block_size()); - } - - byte* ptr; - - ptr = os_file_compress_page( - slot->type.compression_algorithm(), - slot->type.block_size(), - slot->buf, - slot->len, - slot->compressed_page, + slot->buf_block = os_file_compress_page( + type, + src_buf, &compressed_len); - if (ptr != buf) { - /* Set new compressed size to uncompressed page. */ - memcpy(slot->buf + FIL_PAGE_COMPRESS_SIZE_V1, - slot->compressed_page - + FIL_PAGE_COMPRESS_SIZE_V1, 2); + slot->buf = static_cast<byte*>(src_buf); + slot->ptr = slot->buf; #ifdef _WIN32 - slot->len = static_cast<DWORD>(compressed_len); + slot->len = static_cast<DWORD>(compressed_len); #else - slot->len = static_cast<ulint>(compressed_len); + slot->len = static_cast<ulint>(compressed_len); #endif /* _WIN32 */ - slot->buf = slot->compressed_page; - slot->ptr = slot->buf; + slot->skip_punch_hole = type.punch_hole(); - if (old_compressed_len > 0 - && compressed_len >= old_compressed_len) { + acquire(); + } +#endif /* MYSQL_COMPRESSION */ - ut_ad(old_compressed_len <= UNIV_PAGE_SIZE); +#ifdef MYSQL_ENCRYPTION + /* We do encryption after compression, since if we do encryption + before compression, the encrypted data will cause compression fail + or low compression rate. */ + if (srv_use_native_aio + && offset > 0 + && type.is_write() + && type.is_encrypted()) { + ulint encrypted_len = slot->len; + Block* encrypted_block; - slot->skip_punch_hole = true; + ut_ad(!type.is_log()); - } else { - slot->skip_punch_hole = false; - } + release(); - } else { - slot->skip_punch_hole = false; + void* src_buf = slot->buf; + encrypted_block = os_file_encrypt_page( + type, + src_buf, + &encrypted_len); + + if (slot->buf_block != NULL) { + os_free_block(slot->buf_block); } + slot->buf_block = encrypted_block; + slot->buf = static_cast<byte*>(src_buf); + slot->ptr = slot->buf; + +#ifdef _WIN32 + slot->len = static_cast<DWORD>(encrypted_len); +#else + slot->len = static_cast<ulint>(encrypted_len); +#endif /* _WIN32 */ + acquire(); - } + } +#endif /* MYSQL_ENCRYPTION */ #ifdef WIN_ASYNC_IO { @@ -7501,7 +7778,7 @@ public: all data, and perform the I/O @return the length of the buffer */ ulint allocate_buffer() - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ulint len; Slot* slot = first_slot(); @@ -7591,7 +7868,7 @@ public: /** @return the first slot in the consecutive array */ Slot* first_slot() - __attribute__((warn_unused_result)) + MY_ATTRIBUTE((warn_unused_result)) { ut_a(m_n_elems > 0); @@ -7605,7 +7882,7 @@ public: ulint check_pending( ulint global_segment, os_event_t event) - __attribute__((warn_unused_result)); + MY_ATTRIBUTE((warn_unused_result)); private: /** Do the file read @@ -8095,7 +8372,7 @@ os_aio_print(FILE* file) for (ulint i = 0; i < srv_n_file_io_threads; ++i) { fprintf(file, "I/O thread %lu state: %s (%s)", - (ulong) i, + (ulint) i, srv_io_thread_op_info[i], srv_io_thread_function[i]); @@ -8119,17 +8396,17 @@ os_aio_print(FILE* file) fprintf(file, "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", - (ulong) fil_n_pending_log_flushes, - (ulong) fil_n_pending_tablespace_flushes, - (ulong) os_n_file_reads, - (ulong) os_n_file_writes, - (ulong) os_n_fsyncs); + (ulint) fil_n_pending_log_flushes, + (ulint) fil_n_pending_tablespace_flushes, + (ulint) os_n_file_reads, + (ulint) os_n_file_writes, + (ulint) os_n_fsyncs); if (os_n_pending_writes != 0 || os_n_pending_reads != 0) { fprintf(file, "%lu pending preads, %lu pending pwrites\n", (ulint) os_n_pending_reads, - (ulong) os_n_pending_writes); + (ulint) os_n_pending_writes); } if (os_n_file_reads == os_n_file_reads_old) { @@ -8144,7 +8421,7 @@ os_aio_print(FILE* file) " %.2f writes/s, %.2f fsyncs/s\n", (os_n_file_reads - os_n_file_reads_old) / time_elapsed, - (ulong) avg_bytes_read, + (ulint) avg_bytes_read, (os_n_file_writes - os_n_file_writes_old) / time_elapsed, (os_n_fsyncs - os_n_fsyncs_old) @@ -8276,9 +8553,38 @@ os_file_set_umask(ulint umask) #endif #include <zlib.h> +#ifndef UNIV_INNOCHECKSUM +#include <my_aes.h> +#include <my_rnd.h> +#include <mysqld.h> +#include <mysql/service_mysql_keyring.h> +#endif +typedef byte Block; + +#ifdef MYSQL_COMPRESSION +/** Allocate a page for sync IO +@return pointer to page */ +static +Block* +os_alloc_block() +{ + return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2))); +} + +/** Free a page after sync IO +@param[in,own] block The block to free/release */ +static +void +os_free_block(Block* block) +{ + ut_free(block); +} +#endif #endif /* !UNIV_INNOCHECKSUM */ +#ifdef MYSQL_COMPRESSION + /** @param[in] type The compression type @return the string representation */ @@ -8388,32 +8694,24 @@ Compression::deserialize( return(DB_CORRUPTION); } - // FIXME: We should use TLS for this and reduce the malloc/free - bool allocated; + Block* block; /* The caller doesn't know what to expect */ if (dst == NULL) { - /* Add a safety margin of an additional 50% */ - ulint n_bytes = header.m_original_size - + (header.m_original_size / 2); + block = os_alloc_block(); -#ifndef UNIV_INNOCHECKSUM - dst = reinterpret_cast<byte*>(ut_malloc_nokey(n_bytes)); +#ifdef UNIV_INNOCHECKSUM + dst = block; #else - dst = reinterpret_cast<byte*>(malloc(n_bytes)); -#endif /* !UNIV_INNOCHECKSUM */ - - if (dst == NULL) { - - return(DB_OUT_OF_MEMORY); - } + dst = block->m_ptr; +#endif /* UNIV_INNOCHECKSUM */ - allocated = true; } else { - allocated = false; + block = NULL; } + int ret; Compression compression; ulint len = header.m_original_size; @@ -8427,8 +8725,8 @@ Compression::deserialize( if (uncompress(dst, &zlen, ptr, header.m_compressed_size) != Z_OK) { - if (allocated) { - ut_free(dst); + if (block != NULL) { + os_free_block(block); } return(DB_IO_DECOMPRESS_FAIL); @@ -8467,8 +8765,8 @@ Compression::deserialize( if (ret < 0) { - if (allocated) { - ut_free(dst); + if (block != NULL) { + os_free_block(block); } return(DB_IO_DECOMPRESS_FAIL); @@ -8487,8 +8785,8 @@ Compression::deserialize( Compression::to_string(compression.m_type)); #endif /* !UNIV_INNOCHECKSUM */ - if (allocated) { - ut_free(dst); + if (block != NULL) { + os_free_block(block); } return(DB_UNSUPPORTED); @@ -8503,9 +8801,10 @@ Compression::deserialize( src + (header.m_original_size + FIL_PAGE_DATA) - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0); - if (allocated) { - ut_free(dst); + if (block != NULL) { + os_free_block(block); } + return(DB_SUCCESS); } @@ -8526,6 +8825,635 @@ os_file_decompress_page( { return(Compression::deserialize(dblwr_recover, src, dst, dst_len)); } +#endif /* MYSQL_COMPRESSION */ + +#ifdef MYSQL_ENCRYPTION + +/** +@param[in] type The encryption type +@return the string representation */ +const char* +Encryption::to_string(Type type) +{ + switch(type) { + case NONE: + return("N"); + case AES: + return("Y"); + } + + ut_ad(0); + + return("<UNKNOWN>"); +} + +/** Generate random encryption value for key and iv. +@param[in,out] value Encryption value */ +void Encryption::random_value(byte* value) +{ + ut_ad(value != NULL); + + my_rand_buffer(value, ENCRYPTION_KEY_LEN); +} + +/** Create new master key for key rotation. +@param[in,out] master_key master key */ +void +Encryption::create_master_key(byte** master_key) +{ +#ifndef UNIV_INNOCHECKSUM + char* key_type = NULL; + size_t key_len; + char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN]; + int ret; + + /* If uuid does not match with current server uuid, + set uuid as current server uuid. */ + if (strcmp(uuid, server_uuid) != 0) { + memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN); + } + memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN); + + /* Generate new master key */ + ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN, + "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX, + uuid, master_key_id + 1); + + /* We call key ring API to generate master key here. */ + ret = my_key_generate(key_name, "AES", + NULL, ENCRYPTION_KEY_LEN); + + /* We call key ring API to get master key here. */ + ret = my_key_fetch(key_name, &key_type, NULL, + reinterpret_cast<void**>(master_key), + &key_len); + + if (ret || *master_key == NULL) { + ib::error() << "Encryption can't find master key, please check" + " the keyring plugin is loaded."; + *master_key = NULL; + } else { + master_key_id++; + } + + if (key_type) { + my_free(key_type); + } +#endif +} + +/** Get master key by key id. +@param[in] master_key_id master key id +@param[in] srv_uuid uuid of server instance +@param[in,out] master_key master key */ +void +Encryption::get_master_key(ulint master_key_id, + char* srv_uuid, + byte** master_key) +{ +#ifndef UNIV_INNOCHECKSUM + char* key_type = NULL; + size_t key_len; + char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN]; + int ret; + + memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN); + + if (srv_uuid != NULL) { + ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN, + "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX, + srv_uuid, master_key_id); + } else { + /* For compitable with 5.7.11, we need to get master key with + server id. */ + memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN); + ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN, + "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX, + server_id, master_key_id); + } + + /* We call key ring API to get master key here. */ + ret = my_key_fetch(key_name, &key_type, NULL, + reinterpret_cast<void**>(master_key), &key_len); + + if (key_type) { + my_free(key_type); + } + + if (ret) { + *master_key = NULL; + ib::error() << "Encryption can't find master key, please check" + " the keyring plugin is loaded."; + } + +#ifdef UNIV_ENCRYPT_DEBUG + if (!ret && *master_key) { + fprintf(stderr, "Fetched master key:%lu ", master_key_id); + ut_print_buf(stderr, *master_key, key_len); + fprintf(stderr, "\n"); + } +#endif /* DEBUG_TDE */ + +#endif +} + +/** Current master key id */ +ulint Encryption::master_key_id = 0; + +/** Current uuid of server instance */ +char Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0}; + +/** Get current master key and master key id +@param[in,out] master_key_id master key id +@param[in,out] master_key master key +@param[in,out] version encryption information version */ +void +Encryption::get_master_key(ulint* master_key_id, + byte** master_key, + Encryption::Version* version) +{ +#ifndef UNIV_INNOCHECKSUM + char* key_type = NULL; + size_t key_len; + char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN]; + int ret; + + memset(key_name, 0, ENCRYPTION_KEY_LEN); + *version = Encryption::ENCRYPTION_VERSION_2; + + if (Encryption::master_key_id == 0) { + /* If m_master_key is 0, means there's no encrypted + tablespace, we need to generate the first master key, + and store it to key ring. */ + memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1); + memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN); + + /* Prepare the server uuid. */ + ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN, + "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX, + uuid); + + /* We call key ring API to generate master key here. */ + ret = my_key_generate(key_name, "AES", + NULL, ENCRYPTION_KEY_LEN); + + /* We call key ring API to get master key here. */ + ret = my_key_fetch(key_name, &key_type, NULL, + reinterpret_cast<void**>(master_key), + &key_len); + + if (!ret && *master_key != NULL) { + Encryption::master_key_id++; + *master_key_id = Encryption::master_key_id; + } +#ifdef UNIV_ENCRYPT_DEBUG + if (!ret && *master_key) { + fprintf(stderr, "Generated new master key:"); + ut_print_buf(stderr, *master_key, key_len); + fprintf(stderr, "\n"); + } +#endif + } else { + *master_key_id = Encryption::master_key_id; + + ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN, + "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX, + uuid, *master_key_id); + + /* We call key ring API to get master key here. */ + ret = my_key_fetch(key_name, &key_type, NULL, + reinterpret_cast<void**>(master_key), + &key_len); + + /* For compitable with 5.7.11, we need to try to get master key with + server id when get master key with server uuid failure. */ + if (ret || *master_key == NULL) { + if (key_type) { + my_free(key_type); + } + + memset(key_name, 0, + ENCRYPTION_MASTER_KEY_NAME_MAX_LEN); + ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN, + "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX, + server_id, *master_key_id); + + ret = my_key_fetch(key_name, &key_type, NULL, + reinterpret_cast<void**>(master_key), + &key_len); + *version = Encryption::ENCRYPTION_VERSION_1; + } +#ifdef UNIV_ENCRYPT_DEBUG + if (!ret && *master_key) { + fprintf(stderr, "Fetched master key:%lu ", + *master_key_id); + ut_print_buf(stderr, *master_key, key_len); + fprintf(stderr, "\n"); + } +#endif + } + + if (ret) { + *master_key = NULL; + ib::error() << "Encryption can't find master key, please check" + " the keyring plugin is loaded."; + } + + if (key_type) { + my_free(key_type); + } +#endif +} + +/** Check if page is encrypted page or not +@param[in] page page which need to check +@return true if it is a encrypted page */ +bool +Encryption::is_encrypted_page(const byte* page) +{ + ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE); + + return(page_type == FIL_PAGE_ENCRYPTED + || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED + || page_type == FIL_PAGE_ENCRYPTED_RTREE); +} + +/** Encrypt the page data contents. Page type can't be +FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED, +FIL_PAGE_ENCRYPTED_RTREE. +@param[in] type IORequest +@param[in,out] src page data which need to encrypt +@param[in] src_len Size of the source in bytes +@param[in,out] dst destination area +@param[in,out] dst_len Size of the destination in bytes +@return buffer data, dst_len will have the length of the data */ +byte* +Encryption::encrypt( + const IORequest& type, + byte* src, + ulint src_len, + byte* dst, + ulint* dst_len) +{ + ulint len = 0; + ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE); + ulint data_len; + ulint main_len; + ulint remain_len; + byte remain_buf[MY_AES_BLOCK_SIZE * 2]; + +#ifdef UNIV_ENCRYPT_DEBUG + ulint space_id = + mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET); + + fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n", + space_id, page_no, src_len); +#endif + + /* Shouldn't encrypte an already encrypted page. */ + ut_ad(page_type != FIL_PAGE_ENCRYPTED + && page_type != FIL_PAGE_COMPRESSED_AND_ENCRYPTED + && page_type != FIL_PAGE_ENCRYPTED_RTREE); + + ut_ad(m_type != Encryption::NONE); + + /* This is data size which need to encrypt. */ + data_len = src_len - FIL_PAGE_DATA; + main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE; + remain_len = data_len - main_len; + + /* Only encrypt the data + trailer, leave the header alone */ + + switch (m_type) { + case Encryption::NONE: + ut_error; + + case Encryption::AES: { + lint elen; + + ut_ad(m_klen == ENCRYPTION_KEY_LEN); + + elen = my_aes_encrypt( + src + FIL_PAGE_DATA, + static_cast<uint32>(main_len), + dst + FIL_PAGE_DATA, + reinterpret_cast<unsigned char*>(m_key), + static_cast<uint32>(m_klen), + my_aes_256_cbc, + reinterpret_cast<unsigned char*>(m_iv), + false); + + if (elen == MY_AES_BAD_DATA) { + ulint page_no =mach_read_from_4( + src + FIL_PAGE_OFFSET); + ulint space_id = mach_read_from_4( + src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + *dst_len = src_len; +#ifndef UNIV_INNOCHECKSUM + ib::warn() + << " Can't encrypt data of page," + << " page no:" << page_no + << " space id:" << space_id; +#else + fprintf(stderr, " Can't encrypt data of page," + " page no:" ULINTPF + " space id:" ULINTPF, + page_no, space_id); +#endif /* !UNIV_INNOCHECKSUM */ + return(src); + } + + len = static_cast<ulint>(elen); + ut_ad(len == main_len); + + /* Copy remain bytes and page tailer. */ + memcpy(dst + FIL_PAGE_DATA + len, + src + FIL_PAGE_DATA + len, + src_len - FIL_PAGE_DATA - len); + + /* Encrypt the remain bytes. */ + if (remain_len != 0) { + remain_len = MY_AES_BLOCK_SIZE * 2; + + elen = my_aes_encrypt( + dst + FIL_PAGE_DATA + data_len - remain_len, + static_cast<uint32>(remain_len), + remain_buf, + reinterpret_cast<unsigned char*>(m_key), + static_cast<uint32>(m_klen), + my_aes_256_cbc, + reinterpret_cast<unsigned char*>(m_iv), + false); + + if (elen == MY_AES_BAD_DATA) { + ulint page_no =mach_read_from_4( + src + FIL_PAGE_OFFSET); + ulint space_id = mach_read_from_4( + src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); +#ifndef UNIV_INNOCHECKSUM + ib::warn() + << " Can't encrypt data of page," + << " page no:" << page_no + << " space id:" << space_id; +#else + fprintf(stderr, " Can't encrypt data of page," + " page no:" ULINTPF + " space id:" ULINTPF, + page_no, space_id); +#endif /* !UNIV_INNOCHECKSUM */ + *dst_len = src_len; + return(src); + } + + memcpy(dst + FIL_PAGE_DATA + data_len - remain_len, + remain_buf, remain_len); + } + + + break; + } + + default: + ut_error; + } + + /* Copy the header as is. */ + memmove(dst, src, FIL_PAGE_DATA); + ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0); + + /* Add encryption control information. Required for decrypting. */ + if (page_type == FIL_PAGE_COMPRESSED) { + /* If the page is compressed, we don't need to save the + original type, since it is done in compression already. */ + mach_write_to_2(dst + FIL_PAGE_TYPE, + FIL_PAGE_COMPRESSED_AND_ENCRYPTED); + ut_ad(memcmp(src+FIL_PAGE_TYPE+2, + dst+FIL_PAGE_TYPE+2, + FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0); + } else if (page_type == FIL_PAGE_RTREE) { + /* If the page is R-tree page, we need to save original + type. */ + mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE); + } else{ + mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED); + mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type); + } + +#ifdef UNIV_ENCRYPT_DEBUG +#ifndef UNIV_INNOCHECKSUM +#if 0 + byte* check_buf = static_cast<byte*>(ut_malloc_nokey(src_len)); + byte* buf2 = static_cast<byte*>(ut_malloc_nokey(src_len)); + + memcpy(check_buf, dst, src_len); + + dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len); + if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA, + check_buf + FIL_PAGE_DATA, + src_len - FIL_PAGE_DATA) != 0) { + ut_print_buf(stderr, src, src_len); + ut_print_buf(stderr, check_buf, src_len); + ut_ad(0); + } + ut_free(buf2); + ut_free(check_buf); +#endif + fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no); +#endif +#endif + *dst_len = src_len; + + + return(dst); +} + +/** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED, +if not then the source contents are left unchanged and DB_SUCCESS is returned. +@param[in] type IORequest +@param[in,out] src Data read from disk, decrypted data will be + copied to this page +@param[in] src_len source data length +@param[in,out] dst Scratch area to use for decryption +@param[in] dst_len Size of the scratch area in bytes +@return DB_SUCCESS or error code */ +dberr_t +Encryption::decrypt( + const IORequest& type, + byte* src, + ulint src_len, + byte* dst, + ulint dst_len) +{ + ulint data_len; + ulint main_len; + ulint remain_len; + ulint original_type; + ulint page_type; + byte remain_buf[MY_AES_BLOCK_SIZE * 2]; + Block* block; + + /* Do nothing if it's not an encrypted table. */ + if (!is_encrypted_page(src)) { + return(DB_SUCCESS); + } + + /* For compressed page, we need to get the compressed size + for decryption */ + page_type = mach_read_from_2(src + FIL_PAGE_TYPE); + if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) { + src_len = static_cast<uint16_t>( + mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1)) + + FIL_PAGE_DATA; +#ifndef UNIV_INNOCHECKSUM + src_len = ut_calc_align(src_len, type.block_size()); +#endif + } +#ifdef UNIV_ENCRYPT_DEBUG + ulint space_id = + mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET); + + fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n", + space_id, page_no, src_len); +#endif + + original_type = static_cast<uint16_t>( + mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1)); + + byte* ptr = src + FIL_PAGE_DATA; + + /* The caller doesn't know what to expect */ + if (dst == NULL) { + + block = os_alloc_block(); +#ifdef UNIV_INNOCHECKSUM + dst = block; +#else + dst = block->m_ptr; +#endif /* UNIV_INNOCHECKSUM */ + + } else { + block = NULL; + } + + data_len = src_len - FIL_PAGE_DATA; + main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE; + remain_len = data_len - main_len; + + switch(m_type) { + case Encryption::AES: { + lint elen; + + /* First decrypt the last 2 blocks data of data, since + data is no block aligned. */ + if (remain_len != 0) { + ut_ad(m_klen == ENCRYPTION_KEY_LEN); + + remain_len = MY_AES_BLOCK_SIZE * 2; + + /* Copy the last 2 blocks. */ + memcpy(remain_buf, + ptr + data_len - remain_len, + remain_len); + + elen = my_aes_decrypt( + remain_buf, + static_cast<uint32>(remain_len), + dst + data_len - remain_len, + reinterpret_cast<unsigned char*>(m_key), + static_cast<uint32>(m_klen), + my_aes_256_cbc, + reinterpret_cast<unsigned char*>(m_iv), + false); + if (elen == MY_AES_BAD_DATA) { + if (block != NULL) { + os_free_block(block); + } + + return(DB_IO_DECRYPT_FAIL); + } + + /* Copy the other data bytes to temp area. */ + memcpy(dst, ptr, data_len - remain_len); + } else { + ut_ad(data_len == main_len); + + /* Copy the data bytes to temp area. */ + memcpy(dst, ptr, data_len); + } + + /* Then decrypt the main data */ + elen = my_aes_decrypt( + dst, + static_cast<uint32>(main_len), + ptr, + reinterpret_cast<unsigned char*>(m_key), + static_cast<uint32>(m_klen), + my_aes_256_cbc, + reinterpret_cast<unsigned char*>(m_iv), + false); + if (elen == MY_AES_BAD_DATA) { + + if (block != NULL) { + os_free_block(block); + } + + return(DB_IO_DECRYPT_FAIL); + } + + ut_ad(static_cast<ulint>(elen) == main_len); + + /* Copy the remain bytes. */ + memcpy(ptr + main_len, dst + main_len, data_len - main_len); + + break; + } + + default: +#if !defined(UNIV_INNOCHECKSUM) + ib::error() + << "Encryption algorithm support missing: " + << Encryption::to_string(m_type); +#else + fprintf(stderr, "Encryption algorithm support missing: %s\n", + Encryption::to_string(m_type)); +#endif /* !UNIV_INNOCHECKSUM */ + + if (block != NULL) { + os_free_block(block); + } + + return(DB_UNSUPPORTED); + } + + /* Restore the original page type. If it's a compressed and + encrypted page, just reset it as compressed page type, since + we will do uncompress later. */ + + if (page_type == FIL_PAGE_ENCRYPTED) { + mach_write_to_2(src + FIL_PAGE_TYPE, original_type); + mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, 0); + } else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) { + mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE); + } else { + ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED); + mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED); + } + + if (block != NULL) { + os_free_block(block); + } + +#ifdef UNIV_ENCRYPT_DEBUG + fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no); +#endif + + DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE();); + + return(DB_SUCCESS); +} +#endif /* MYSQL_ENCRYPTION */ /** Normalizes a directory path for the current OS: On Windows, we convert '/' to '\', else we convert '\' to '/'. diff --git a/storage/innobase/os/os0sync.cc b/storage/innobase/os/os0sync.cc deleted file mode 100644 index 03c53848832..00000000000 --- a/storage/innobase/os/os0sync.cc +++ /dev/null @@ -1,935 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file os/os0sync.cc -The interface to the operating system -synchronization primitives. - -Created 9/6/1995 Heikki Tuuri -*******************************************************/ - -#include "os0sync.h" -#ifdef UNIV_NONINL -#include "os0sync.ic" -#endif - -#ifdef __WIN__ -#include <windows.h> -#endif - -#include "ut0mem.h" -#include "srv0start.h" -#include "srv0srv.h" - -/* Type definition for an operating system mutex struct */ -struct os_mutex_t{ - os_event_t event; /*!< Used by sync0arr.cc for queing threads */ - void* handle; /*!< OS handle to mutex */ - ulint count; /*!< we use this counter to check - that the same thread does not - recursively lock the mutex: we - do not assume that the OS mutex - supports recursive locking, though - NT seems to do that */ - UT_LIST_NODE_T(os_mutex_t) os_mutex_list; - /* list of all 'slow' OS mutexes created */ -}; - -/** Mutex protecting counts and the lists of OS mutexes and events */ -UNIV_INTERN os_ib_mutex_t os_sync_mutex; -/** TRUE if os_sync_mutex has been initialized */ -static ibool os_sync_mutex_inited = FALSE; -/** TRUE when os_sync_free() is being executed */ -static ibool os_sync_free_called = FALSE; - -/** This is incremented by 1 in os_thread_create and decremented by 1 in -os_thread_exit */ -UNIV_INTERN ulint os_thread_count = 0; - -/** The list of all events created */ -static UT_LIST_BASE_NODE_T(os_event) os_event_list; - -/** The list of all OS 'slow' mutexes */ -static UT_LIST_BASE_NODE_T(os_mutex_t) os_mutex_list; - -UNIV_INTERN ulint os_event_count = 0; -UNIV_INTERN ulint os_mutex_count = 0; -UNIV_INTERN ulint os_fast_mutex_count = 0; - -/* The number of microsecnds in a second. */ -static const ulint MICROSECS_IN_A_SECOND = 1000000; - -#ifdef UNIV_PFS_MUTEX -UNIV_INTERN mysql_pfs_key_t event_os_mutex_key; -UNIV_INTERN mysql_pfs_key_t os_mutex_key; -#endif - -/* Because a mutex is embedded inside an event and there is an -event embedded inside a mutex, on free, this generates a recursive call. -This version of the free event function doesn't acquire the global lock */ -static void os_event_free_internal(os_event_t event); - -/* On Windows (Vista and later), load function pointers for condition -variable handling. Those functions are not available in prior versions, -so we have to use them via runtime loading, as long as we support XP. */ -static void os_cond_module_init(void); - -#ifdef __WIN__ -/* Prototypes and function pointers for condition variable functions */ -typedef VOID (WINAPI* InitializeConditionVariableProc) - (PCONDITION_VARIABLE ConditionVariable); -static InitializeConditionVariableProc initialize_condition_variable; - -typedef BOOL (WINAPI* SleepConditionVariableCSProc) - (PCONDITION_VARIABLE ConditionVariable, - PCRITICAL_SECTION CriticalSection, - DWORD dwMilliseconds); -static SleepConditionVariableCSProc sleep_condition_variable; - -typedef VOID (WINAPI* WakeAllConditionVariableProc) - (PCONDITION_VARIABLE ConditionVariable); -static WakeAllConditionVariableProc wake_all_condition_variable; - -typedef VOID (WINAPI* WakeConditionVariableProc) - (PCONDITION_VARIABLE ConditionVariable); -static WakeConditionVariableProc wake_condition_variable; -#endif - -/*********************************************************//** -Initialitze condition variable */ -UNIV_INLINE -void -os_cond_init( -/*=========*/ - os_cond_t* cond) /*!< in: condition variable. */ -{ - ut_a(cond); - -#ifdef __WIN__ - ut_a(initialize_condition_variable != NULL); - initialize_condition_variable(cond); -#else - ut_a(pthread_cond_init(cond, NULL) == 0); -#endif -} - -/*********************************************************//** -Do a timed wait on condition variable. -@return TRUE if timed out, FALSE otherwise */ -UNIV_INLINE -ibool -os_cond_wait_timed( -/*===============*/ - os_cond_t* cond, /*!< in: condition variable. */ - os_fast_mutex_t* fast_mutex, /*!< in: fast mutex */ -#ifndef __WIN__ - const struct timespec* abstime /*!< in: timeout */ -#else - DWORD time_in_ms /*!< in: timeout in - milliseconds*/ -#endif /* !__WIN__ */ -) -{ - fast_mutex_t* mutex = &fast_mutex->mutex; -#ifdef __WIN__ - BOOL ret; - DWORD err; - - ut_a(sleep_condition_variable != NULL); - - ret = sleep_condition_variable(cond, mutex, time_in_ms); - - if (!ret) { - err = GetLastError(); - /* From http://msdn.microsoft.com/en-us/library/ms686301%28VS.85%29.aspx, - "Condition variables are subject to spurious wakeups - (those not associated with an explicit wake) and stolen wakeups - (another thread manages to run before the woken thread)." - Check for both types of timeouts. - Conditions are checked by the caller.*/ - if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) { - return(TRUE); - } - } - - ut_a(ret); - - return(FALSE); -#else - int ret; - - ret = pthread_cond_timedwait(cond, mutex, abstime); - - switch (ret) { - case 0: - case ETIMEDOUT: - /* We play it safe by checking for EINTR even though - according to the POSIX documentation it can't return EINTR. */ - case EINTR: - break; - - default: - fprintf(stderr, " InnoDB: pthread_cond_timedwait() returned: " - "%d: abstime={%lu,%lu}\n", - ret, (ulong) abstime->tv_sec, (ulong) abstime->tv_nsec); - ut_error; - } - - return(ret == ETIMEDOUT); -#endif -} -/*********************************************************//** -Wait on condition variable */ -UNIV_INLINE -void -os_cond_wait( -/*=========*/ - os_cond_t* cond, /*!< in: condition variable. */ - os_fast_mutex_t* fast_mutex)/*!< in: fast mutex */ -{ - fast_mutex_t* mutex = &fast_mutex->mutex; - ut_a(cond); - ut_a(mutex); - -#ifdef __WIN__ - ut_a(sleep_condition_variable != NULL); - ut_a(sleep_condition_variable(cond, mutex, INFINITE)); -#else - ut_a(pthread_cond_wait(cond, mutex) == 0); -#endif -} - -/*********************************************************//** -Wakes all threads waiting for condition variable */ -UNIV_INLINE -void -os_cond_broadcast( -/*==============*/ - os_cond_t* cond) /*!< in: condition variable. */ -{ - ut_a(cond); - -#ifdef __WIN__ - ut_a(wake_all_condition_variable != NULL); - wake_all_condition_variable(cond); -#else - ut_a(pthread_cond_broadcast(cond) == 0); -#endif -} - -/*********************************************************//** -Destroys condition variable */ -UNIV_INLINE -void -os_cond_destroy( -/*============*/ - os_cond_t* cond) /*!< in: condition variable. */ -{ -#ifdef __WIN__ - /* Do nothing */ -#else - ut_a(pthread_cond_destroy(cond) == 0); -#endif -} - -/*********************************************************//** -On Windows (Vista and later), load function pointers for condition variable -handling. Those functions are not available in prior versions, so we have to -use them via runtime loading, as long as we support XP. */ -static -void -os_cond_module_init(void) -/*=====================*/ -{ -#ifdef __WIN__ - HMODULE h_dll; - - if (!srv_use_native_conditions) - return; - - h_dll = GetModuleHandle("kernel32"); - - initialize_condition_variable = (InitializeConditionVariableProc) - GetProcAddress(h_dll, "InitializeConditionVariable"); - sleep_condition_variable = (SleepConditionVariableCSProc) - GetProcAddress(h_dll, "SleepConditionVariableCS"); - wake_all_condition_variable = (WakeAllConditionVariableProc) - GetProcAddress(h_dll, "WakeAllConditionVariable"); - wake_condition_variable = (WakeConditionVariableProc) - GetProcAddress(h_dll, "WakeConditionVariable"); - - /* When using native condition variables, check function pointers */ - ut_a(initialize_condition_variable); - ut_a(sleep_condition_variable); - ut_a(wake_all_condition_variable); - ut_a(wake_condition_variable); -#endif -} - -/*********************************************************//** -Initializes global event and OS 'slow' mutex lists. */ -UNIV_INTERN -void -os_sync_init(void) -/*==============*/ -{ - UT_LIST_INIT(os_event_list); - UT_LIST_INIT(os_mutex_list); - - os_sync_mutex = NULL; - os_sync_mutex_inited = FALSE; - - /* Now for Windows only */ - os_cond_module_init(); - - os_sync_mutex = os_mutex_create(); - - os_sync_mutex_inited = TRUE; -} - -/*********************************************************//** -Frees created events and OS 'slow' mutexes. */ -UNIV_INTERN -void -os_sync_free(void) -/*==============*/ -{ - os_event_t event; - os_ib_mutex_t mutex; - - os_sync_free_called = TRUE; - event = UT_LIST_GET_FIRST(os_event_list); - - while (event) { - - os_event_free(event); - - event = UT_LIST_GET_FIRST(os_event_list); - } - - mutex = UT_LIST_GET_FIRST(os_mutex_list); - - while (mutex) { - if (mutex == os_sync_mutex) { - /* Set the flag to FALSE so that we do not try to - reserve os_sync_mutex any more in remaining freeing - operations in shutdown */ - os_sync_mutex_inited = FALSE; - } - - os_mutex_free(mutex); - - mutex = UT_LIST_GET_FIRST(os_mutex_list); - } - os_sync_free_called = FALSE; -} - -/*********************************************************//** -Creates an event semaphore, i.e., a semaphore which may just have two -states: signaled and nonsignaled. The created event is manual reset: it -must be reset explicitly by calling sync_os_reset_event. -@return the event handle */ -UNIV_INTERN -os_event_t -os_event_create(void) -/*==================*/ -{ - os_event_t event; - -#ifdef __WIN__ - if(!srv_use_native_conditions) { - - event = static_cast<os_event_t>(ut_malloc(sizeof(*event))); - - event->handle = CreateEvent(NULL, TRUE, FALSE, NULL); - if (!event->handle) { - fprintf(stderr, - "InnoDB: Could not create a Windows event" - " semaphore; Windows error %lu\n", - (ulong) GetLastError()); - } - } else /* Windows with condition variables */ -#endif - { - event = static_cast<os_event_t>(ut_malloc(sizeof *event)); - -#ifndef PFS_SKIP_EVENT_MUTEX - os_fast_mutex_init(event_os_mutex_key, &event->os_mutex); -#else - os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &event->os_mutex); -#endif - - os_cond_init(&(event->cond_var)); - - event->is_set = FALSE; - - /* We return this value in os_event_reset(), which can then be - be used to pass to the os_event_wait_low(). The value of zero - is reserved in os_event_wait_low() for the case when the - caller does not want to pass any signal_count value. To - distinguish between the two cases we initialize signal_count - to 1 here. */ - event->signal_count = 1; - } - - /* The os_sync_mutex can be NULL because during startup an event - can be created [ because it's embedded in the mutex/rwlock ] before - this module has been initialized */ - if (os_sync_mutex != NULL) { - os_mutex_enter(os_sync_mutex); - } - - /* Put to the list of events */ - UT_LIST_ADD_FIRST(os_event_list, os_event_list, event); - - os_event_count++; - - if (os_sync_mutex != NULL) { - os_mutex_exit(os_sync_mutex); - } - - return(event); -} - -/**********************************************************//** -Sets an event semaphore to the signaled state: lets waiting threads -proceed. */ -UNIV_INTERN -void -os_event_set( -/*=========*/ - os_event_t event) /*!< in: event to set */ -{ - ut_a(event); - -#ifdef __WIN__ - if (!srv_use_native_conditions) { - ut_a(SetEvent(event->handle)); - return; - } -#endif - - os_fast_mutex_lock(&(event->os_mutex)); - - if (event->is_set) { - /* Do nothing */ - } else { - event->is_set = TRUE; - event->signal_count += 1; - os_cond_broadcast(&(event->cond_var)); - } - - os_fast_mutex_unlock(&(event->os_mutex)); -} - -/**********************************************************//** -Resets an event semaphore to the nonsignaled state. Waiting threads will -stop to wait for the event. -The return value should be passed to os_even_wait_low() if it is desired -that this thread should not wait in case of an intervening call to -os_event_set() between this os_event_reset() and the -os_event_wait_low() call. See comments for os_event_wait_low(). -@return current signal_count. */ -UNIV_INTERN -ib_int64_t -os_event_reset( -/*===========*/ - os_event_t event) /*!< in: event to reset */ -{ - ib_int64_t ret = 0; - - ut_a(event); - -#ifdef __WIN__ - if(!srv_use_native_conditions) { - ut_a(ResetEvent(event->handle)); - return(0); - } -#endif - - os_fast_mutex_lock(&(event->os_mutex)); - - if (!event->is_set) { - /* Do nothing */ - } else { - event->is_set = FALSE; - } - ret = event->signal_count; - - os_fast_mutex_unlock(&(event->os_mutex)); - return(ret); -} - -/**********************************************************//** -Frees an event object, without acquiring the global lock. */ -static -void -os_event_free_internal( -/*===================*/ - os_event_t event) /*!< in: event to free */ -{ -#ifdef __WIN__ - if(!srv_use_native_conditions) { - ut_a(event); - ut_a(CloseHandle(event->handle)); - } else -#endif - { - ut_a(event); - - /* This is to avoid freeing the mutex twice */ - os_fast_mutex_free(&(event->os_mutex)); - - os_cond_destroy(&(event->cond_var)); - } - - /* Remove from the list of events */ - UT_LIST_REMOVE(os_event_list, os_event_list, event); - - os_event_count--; - - ut_free(event); -} - -/**********************************************************//** -Frees an event object. */ -UNIV_INTERN -void -os_event_free( -/*==========*/ - os_event_t event) /*!< in: event to free */ - -{ - ut_a(event); -#ifdef __WIN__ - if(!srv_use_native_conditions){ - ut_a(CloseHandle(event->handle)); - } else /*Windows with condition variables */ -#endif - { - os_fast_mutex_free(&(event->os_mutex)); - - os_cond_destroy(&(event->cond_var)); - } - - /* Remove from the list of events */ - os_mutex_enter(os_sync_mutex); - - UT_LIST_REMOVE(os_event_list, os_event_list, event); - - os_event_count--; - - os_mutex_exit(os_sync_mutex); - - ut_free(event); -} - -/**********************************************************//** -Waits for an event object until it is in the signaled state. - -Typically, if the event has been signalled after the os_event_reset() -we'll return immediately because event->is_set == TRUE. -There are, however, situations (e.g.: sync_array code) where we may -lose this information. For example: - -thread A calls os_event_reset() -thread B calls os_event_set() [event->is_set == TRUE] -thread C calls os_event_reset() [event->is_set == FALSE] -thread A calls os_event_wait() [infinite wait!] -thread C calls os_event_wait() [infinite wait!] - -Where such a scenario is possible, to avoid infinite wait, the -value returned by os_event_reset() should be passed in as -reset_sig_count. */ -UNIV_INTERN -void -os_event_wait_low( -/*==============*/ - os_event_t event, /*!< in: event to wait */ - ib_int64_t reset_sig_count)/*!< in: zero or the value - returned by previous call of - os_event_reset(). */ -{ -#ifdef __WIN__ - if(!srv_use_native_conditions) { - DWORD err; - - ut_a(event); - - UT_NOT_USED(reset_sig_count); - - /* Specify an infinite wait */ - err = WaitForSingleObject(event->handle, INFINITE); - - ut_a(err == WAIT_OBJECT_0); - return; - } -#endif - - os_fast_mutex_lock(&event->os_mutex); - - if (!reset_sig_count) { - reset_sig_count = event->signal_count; - } - - while (!event->is_set && event->signal_count == reset_sig_count) { - os_cond_wait(&(event->cond_var), &(event->os_mutex)); - - /* Solaris manual said that spurious wakeups may occur: we - have to check if the event really has been signaled after - we came here to wait */ - } - - os_fast_mutex_unlock(&event->os_mutex); -} - -/**********************************************************//** -Waits for an event object until it is in the signaled state or -a timeout is exceeded. -@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ -UNIV_INTERN -ulint -os_event_wait_time_low( -/*===================*/ - os_event_t event, /*!< in: event to wait */ - ulint time_in_usec, /*!< in: timeout in - microseconds, or - OS_SYNC_INFINITE_TIME */ - ib_int64_t reset_sig_count) /*!< in: zero or the value - returned by previous call of - os_event_reset(). */ -{ - ibool timed_out = FALSE; - -#ifdef __WIN__ - DWORD time_in_ms; - - if (!srv_use_native_conditions) { - DWORD err; - - ut_a(event); - - if (time_in_usec != OS_SYNC_INFINITE_TIME) { - time_in_ms = static_cast<DWORD>(time_in_usec / 1000); - err = WaitForSingleObject(event->handle, time_in_ms); - } else { - err = WaitForSingleObject(event->handle, INFINITE); - } - - if (err == WAIT_OBJECT_0) { - return(0); - } else if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) { - return(OS_SYNC_TIME_EXCEEDED); - } - - ut_error; - /* Dummy value to eliminate compiler warning. */ - return(42); - } else { - ut_a(sleep_condition_variable != NULL); - - if (time_in_usec != OS_SYNC_INFINITE_TIME) { - time_in_ms = static_cast<DWORD>(time_in_usec / 1000); - } else { - time_in_ms = INFINITE; - } - } -#else - struct timespec abstime; - - if (time_in_usec != OS_SYNC_INFINITE_TIME) { - struct timeval tv; - int ret; - ulint sec; - ulint usec; - - ret = ut_usectime(&sec, &usec); - ut_a(ret == 0); - - tv.tv_sec = sec; - tv.tv_usec = usec; - - tv.tv_usec += time_in_usec; - - if ((ulint) tv.tv_usec >= MICROSECS_IN_A_SECOND) { - tv.tv_sec += tv.tv_usec / MICROSECS_IN_A_SECOND; - tv.tv_usec %= MICROSECS_IN_A_SECOND; - } - - abstime.tv_sec = tv.tv_sec; - abstime.tv_nsec = tv.tv_usec * 1000; - } else { - abstime.tv_nsec = 999999999; - abstime.tv_sec = (time_t) ULINT_MAX; - } - - ut_a(abstime.tv_nsec <= 999999999); - -#endif /* __WIN__ */ - - os_fast_mutex_lock(&event->os_mutex); - - if (!reset_sig_count) { - reset_sig_count = event->signal_count; - } - - do { - if (event->is_set || event->signal_count != reset_sig_count) { - - break; - } - - timed_out = os_cond_wait_timed( - &event->cond_var, &event->os_mutex, -#ifndef __WIN__ - &abstime -#else - time_in_ms -#endif /* !__WIN__ */ - ); - - } while (!timed_out); - - os_fast_mutex_unlock(&event->os_mutex); - - return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0); -} - -/*********************************************************//** -Creates an operating system mutex semaphore. Because these are slow, the -mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible. -@return the mutex handle */ -UNIV_INTERN -os_ib_mutex_t -os_mutex_create(void) -/*=================*/ -{ - os_fast_mutex_t* mutex; - os_ib_mutex_t mutex_str; - - mutex = static_cast<os_fast_mutex_t*>( - ut_malloc(sizeof(os_fast_mutex_t))); - - os_fast_mutex_init(os_mutex_key, mutex); - - mutex_str = static_cast<os_ib_mutex_t>(ut_malloc(sizeof *mutex_str)); - - mutex_str->handle = mutex; - mutex_str->count = 0; - mutex_str->event = os_event_create(); - - if (UNIV_LIKELY(os_sync_mutex_inited)) { - /* When creating os_sync_mutex itself we cannot reserve it */ - os_mutex_enter(os_sync_mutex); - } - - UT_LIST_ADD_FIRST(os_mutex_list, os_mutex_list, mutex_str); - - os_mutex_count++; - - if (UNIV_LIKELY(os_sync_mutex_inited)) { - os_mutex_exit(os_sync_mutex); - } - - return(mutex_str); -} - -/**********************************************************//** -Acquires ownership of a mutex semaphore. */ -UNIV_INTERN -void -os_mutex_enter( -/*===========*/ - os_ib_mutex_t mutex) /*!< in: mutex to acquire */ -{ - os_fast_mutex_lock(static_cast<os_fast_mutex_t*>(mutex->handle)); - - (mutex->count)++; - - ut_a(mutex->count == 1); -} - -/**********************************************************//** -Releases ownership of a mutex. */ -UNIV_INTERN -void -os_mutex_exit( -/*==========*/ - os_ib_mutex_t mutex) /*!< in: mutex to release */ -{ - ut_a(mutex); - - ut_a(mutex->count == 1); - - (mutex->count)--; - os_fast_mutex_unlock(static_cast<os_fast_mutex_t*>(mutex->handle)); -} - -/**********************************************************//** -Frees a mutex object. */ -UNIV_INTERN -void -os_mutex_free( -/*==========*/ - os_ib_mutex_t mutex) /*!< in: mutex to free */ -{ - ut_a(mutex); - - if (UNIV_LIKELY(!os_sync_free_called)) { - os_event_free_internal(mutex->event); - } - - if (UNIV_LIKELY(os_sync_mutex_inited)) { - os_mutex_enter(os_sync_mutex); - } - - UT_LIST_REMOVE(os_mutex_list, os_mutex_list, mutex); - - os_mutex_count--; - - if (UNIV_LIKELY(os_sync_mutex_inited)) { - os_mutex_exit(os_sync_mutex); - } - - os_fast_mutex_free(static_cast<os_fast_mutex_t*>(mutex->handle)); - ut_free(mutex->handle); - ut_free(mutex); -} - -/*********************************************************//** -Initializes an operating system fast mutex semaphore. */ -UNIV_INTERN -void -os_fast_mutex_init_func( -/*====================*/ - fast_mutex_t* fast_mutex) /*!< in: fast mutex */ -{ -#ifdef __WIN__ - ut_a(fast_mutex); - - InitializeCriticalSection((LPCRITICAL_SECTION) fast_mutex); -#else - ut_a(0 == pthread_mutex_init(fast_mutex, MY_MUTEX_INIT_FAST)); -#endif - if (UNIV_LIKELY(os_sync_mutex_inited)) { - /* When creating os_sync_mutex itself (in Unix) we cannot - reserve it */ - - os_mutex_enter(os_sync_mutex); - } - - os_fast_mutex_count++; - - if (UNIV_LIKELY(os_sync_mutex_inited)) { - os_mutex_exit(os_sync_mutex); - } -} - -/**********************************************************//** -Acquires ownership of a fast mutex. */ -UNIV_INTERN -void -os_fast_mutex_lock_func( -/*====================*/ - fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ -{ -#ifdef __WIN__ - EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex); -#else - pthread_mutex_lock(fast_mutex); -#endif -} - -/**********************************************************//** -Releases ownership of a fast mutex. */ -UNIV_INTERN -void -os_fast_mutex_unlock_func( -/*======================*/ - fast_mutex_t* fast_mutex) /*!< in: mutex to release */ -{ -#ifdef __WIN__ - LeaveCriticalSection(fast_mutex); -#else - pthread_mutex_unlock(fast_mutex); -#endif -} - -/**********************************************************//** -Releases ownership of a fast mutex. Implies a full memory barrier even on -platforms such as PowerPC where this is not normally required. */ -UNIV_INTERN -void -os_fast_mutex_unlock_full_barrier( -/*=================*/ - os_fast_mutex_t* fast_mutex) /*!< in: mutex to release */ -{ -#ifdef __WIN__ - LeaveCriticalSection(&fast_mutex->mutex); -#else - pthread_mutex_unlock(&fast_mutex->mutex); -#ifdef __powerpc__ - os_mb; -#endif -#endif -} - -/**********************************************************//** -Frees a mutex object. */ -UNIV_INTERN -void -os_fast_mutex_free_func( -/*====================*/ - fast_mutex_t* fast_mutex) /*!< in: mutex to free */ -{ -#ifdef __WIN__ - ut_a(fast_mutex); - - DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex); -#else - int ret; - - ret = pthread_mutex_destroy(fast_mutex); - - if (UNIV_UNLIKELY(ret != 0)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: error: return value %lu when calling\n" - "InnoDB: pthread_mutex_destroy().\n", (ulint) ret); - fprintf(stderr, - "InnoDB: Byte contents of the pthread mutex at %p:\n", - (void*) fast_mutex); - ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t)); - putc('\n', stderr); - } -#endif - if (UNIV_LIKELY(os_sync_mutex_inited)) { - /* When freeing the last mutexes, we have - already freed os_sync_mutex */ - - os_mutex_enter(os_sync_mutex); - } - - ut_ad(os_fast_mutex_count > 0); - os_fast_mutex_count--; - - if (UNIV_LIKELY(os_sync_mutex_inited)) { - os_mutex_exit(os_sync_mutex); - } -} diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc index da2a2f59616..811bd87cef7 100644 --- a/storage/innobase/os/os0thread.cc +++ b/storage/innobase/os/os0thread.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -187,17 +187,12 @@ os_thread_create_func( if (thread_id != NULL) { *thread_id = new_thread_id; } - return((os_thread_t)new_thread_id); } -/*****************************************************************//** -Exits the current thread. */ +/** Exits the current thread. */ void -os_thread_exit( -/*===========*/ - void* exit_value) /*!< in: exit value; in Windows this void* - is cast as a DWORD */ +os_thread_exit() { #ifdef UNIV_DEBUG_THREAD_CREATION ib::info() << "Thread exits, id " @@ -221,11 +216,11 @@ os_thread_exit( mutex_exit(&thread_mutex); - ExitThread((DWORD) exit_value); + ExitThread(0); #else mutex_exit(&thread_mutex); pthread_detach(pthread_self()); - pthread_exit(exit_value); + pthread_exit(NULL); #endif } |