diff options
-rw-r--r-- | myisam/mi_packrec.c | 365 |
1 files changed, 328 insertions, 37 deletions
diff --git a/myisam/mi_packrec.c b/myisam/mi_packrec.c index 13898df0466..5a937c6e9d4 100644 --- a/myisam/mi_packrec.c +++ b/myisam/mi_packrec.c @@ -20,7 +20,10 @@ #define IS_CHAR ((uint) 32768) /* Bit if char (not offset) in tree */ -#if INT_MAX > 65536L +/* Some definitions to keep in sync with myisampack.c */ +#define HEAD_LENGTH 32 /* Length of fixed header */ + +#if INT_MAX > 32767 #define BITS_SAVED 32 #define MAX_QUICK_TABLE_BITS 9 /* Because we may shift in 24 bits */ #else @@ -42,6 +45,7 @@ { bits-=(bit+1); break; } \ pos+= *pos +/* Size in uint16 of a Huffman tree for byte compression of 256 byte values. */ #define OFFSET_TABLE_SIZE 512 static uint read_huff_table(MI_BIT_BUFF *bit_buff,MI_DECODE_TREE *decode_tree, @@ -134,7 +138,7 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) uint16 *decode_table,*tmp_buff; ulong elements,intervall_length; char *disk_cache,*intervall_buff; - uchar header[32]; + uchar header[HEAD_LENGTH]; MYISAM_SHARE *share=info->s; MI_BIT_BUFF bit_buff; DBUG_ENTER("_mi_read_pack_info"); @@ -152,12 +156,13 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) my_errno=HA_ERR_END_OF_FILE; goto err0; } + /* Only the first three bytes of magic number are independent of version. */ if (memcmp((byte*) header, (byte*) myisam_pack_file_magic, 3)) { my_errno=HA_ERR_WRONG_IN_RECORD; goto err0; } - share->pack.version= header[3]; + share->pack.version= header[3]; /* fourth byte of magic number */ share->pack.header_length= uint4korr(header+4); share->min_pack_length=(uint) uint4korr(header+8); share->max_pack_length=(uint) uint4korr(header+12); @@ -173,7 +178,22 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) share->base.min_block_length=share->min_pack_length+1; if (share->min_pack_length > 254) share->base.min_block_length+=2; - + DBUG_PRINT("info", ("fixed header length: %u", HEAD_LENGTH)); + DBUG_PRINT("info", ("total header length: %u", share->pack.header_length)); + DBUG_PRINT("info", ("pack file version: %u", share->pack.version)); + DBUG_PRINT("info", ("min pack length: %u", share->min_pack_length)); + DBUG_PRINT("info", ("max pack length: %u", share->max_pack_length)); + DBUG_PRINT("info", ("elements of all trees: %u", elements)); + DBUG_PRINT("info", ("distinct values bytes: %u", intervall_length)); + DBUG_PRINT("info", ("number of code trees: %u", trees)); + DBUG_PRINT("info", ("bytes for record lgt: %u", share->pack.ref_length)); + DBUG_PRINT("info", ("record pointer length: %u", rec_reflength)); + + /* + Memory segment #1: + - Decode tree heads + - Distinct column values + */ if (!(share->decode_trees=(MI_DECODE_TREE*) my_malloc((uint) (trees*sizeof(MI_DECODE_TREE)+ intervall_length*sizeof(byte)), @@ -181,11 +201,19 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) goto err0; intervall_buff=(byte*) (share->decode_trees+trees); + /* + Memory segment #2: + - Decode tables + - Quick decode tables + - Temporary decode table + - Compressed data file header cache + This segment will be reallocated after construction of the tables. + */ length=(uint) (elements*2+trees*(1 << myisam_quick_table_bits)); if (!(share->decode_tables=(uint16*) - my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+ - (uint) (share->pack.header_length+7), - MYF(MY_WME | MY_ZEROFILL)))) + my_malloc((length + OFFSET_TABLE_SIZE) * sizeof(uint16) + + (uint) (share->pack.header_length - sizeof(header)), + MYF(MY_WME | MY_ZEROFILL)))) goto err1; tmp_buff=share->decode_tables+length; disk_cache=(byte*) (tmp_buff+OFFSET_TABLE_SIZE); @@ -198,7 +226,7 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) huff_tree_bits=max_bit(trees ? trees-1 : 0); init_bit_buffer(&bit_buff, (uchar*) disk_cache, (uint) (share->pack.header_length-sizeof(header))); - /* Read new info for each field */ + /* Read new info for each field */ for (i=0 ; i < share->base.fields ; i++) { share->rec[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5); @@ -207,17 +235,26 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) share->rec[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff, huff_tree_bits); share->rec[i].unpack=get_unpack_function(share->rec+i); + DBUG_PRINT("info", ("col: %2u type: %2u pack: %u slbits: %2u", + i, share->rec[i].base_type, share->rec[i].pack_type, + share->rec[i].space_length_bits)); } skip_to_next_byte(&bit_buff); + /* + Construct the decoding tables from the file header. Keep track of + the used memory. + */ decode_table=share->decode_tables; for (i=0 ; i < trees ; i++) if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table, &intervall_buff,tmp_buff)) goto err3; + /* Reallocate the decoding tables to the used size. */ decode_table=(uint16*) my_realloc((gptr) share->decode_tables, (uint) ((byte*) decode_table - (byte*) share->decode_tables), MYF(MY_HOLD_ON_ERROR)); + /* Fix the table addresses in the tree heads. */ { long diff=PTR_BYTE_DIFF(decode_table,share->decode_tables); share->decode_tables=decode_table; @@ -226,7 +263,7 @@ my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys) diff, uint16*); } - /* Fix record-ref-length for keys */ + /* Fix record-ref-length for keys */ if (fix_keys) { for (i=0 ; i < share->base.keys ; i++) @@ -263,7 +300,23 @@ err0: } - /* Read on huff-code-table from datafile */ +/* + Read a huff-code-table from datafile. + + SYNOPSIS + read_huff_table() + bit_buff Bit buffer pointing at start of the + decoding table in the file header cache. + decode_tree Pointer to the decode tree head. + decode_table IN/OUT Address of a pointer to the next free space. + intervall_buff IN/OUT Address of a pointer to the next unused values. + tmp_buff Buffer for temporary extraction of a full + decoding table as read from bit_buff. + + RETURN + 0 OK. + 1 Error. +*/ static uint read_huff_table(MI_BIT_BUFF *bit_buff, MI_DECODE_TREE *decode_tree, uint16 **decode_table, byte **intervall_buff, @@ -272,19 +325,32 @@ static uint read_huff_table(MI_BIT_BUFF *bit_buff, MI_DECODE_TREE *decode_tree, uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits, next_free_offset; uint16 *ptr,*end; + DBUG_ENTER("read_huff_table"); - LINT_INIT(ptr); if (!get_bits(bit_buff,1)) { + /* Byte value compression. */ min_chr=get_bits(bit_buff,8); elements=get_bits(bit_buff,9); char_bits=get_bits(bit_buff,5); offset_bits=get_bits(bit_buff,5); intervall_length=0; ptr=tmp_buff; + DBUG_PRINT("info", ("byte value compression")); + DBUG_PRINT("info", ("minimum byte value: %u", min_chr)); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("bits for values: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + if (elements > 256) + { + DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u", + elements)); + DBUG_RETURN(1); + } } else { + /* Distinct column value compression. */ min_chr=0; elements=get_bits(bit_buff,15); intervall_length=get_bits(bit_buff,16); @@ -292,13 +358,27 @@ static uint read_huff_table(MI_BIT_BUFF *bit_buff, MI_DECODE_TREE *decode_tree, offset_bits=get_bits(bit_buff,5); decode_tree->quick_table_bits=0; ptr= *decode_table; + DBUG_PRINT("info", ("distinct column value compression")); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("value buffer length: %u", intervall_length)); + DBUG_PRINT("info", ("bits for value index: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); } size=elements*2-2; + DBUG_PRINT("info", ("tree size in uint16: %u", size)); + DBUG_PRINT("info", ("tree size in bytes: %u", size * sizeof(uint16))); for (end=ptr+size ; ptr < end ; ptr++) { if (get_bit(bit_buff)) + { *ptr= (uint16) get_bits(bit_buff,offset_bits); + if ((ptr + *ptr >= end) || !*ptr) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + DBUG_RETURN(1); + } + } else *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr)); } @@ -308,11 +388,15 @@ static uint read_huff_table(MI_BIT_BUFF *bit_buff, MI_DECODE_TREE *decode_tree, decode_tree->intervalls= *intervall_buff; if (! intervall_length) { - table_bits=find_longest_bitstream(tmp_buff, tmp_buff+OFFSET_TABLE_SIZE); - if (table_bits == (uint) ~0) - return 1; + /* Byte value compression. ptr started from tmp_buff. */ + /* Find longest Huffman code from begin to end of tree in bits. */ + table_bits= find_longest_bitstream(tmp_buff, ptr); + if (table_bits >= OFFSET_TABLE_SIZE) + DBUG_RETURN(1); if (table_bits > myisam_quick_table_bits) table_bits=myisam_quick_table_bits; + DBUG_PRINT("info", ("table bits: %u", table_bits)); + next_free_offset= (1 << table_bits); make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits, table_bits); @@ -321,105 +405,279 @@ static uint read_huff_table(MI_BIT_BUFF *bit_buff, MI_DECODE_TREE *decode_tree, } else { + /* Distinct column value compression. ptr started from *decode_table */ (*decode_table)=end; + /* + get_bits() moves some bytes to a cache buffer in advance. May need + to step back. + */ bit_buff->pos-= bit_buff->bits/8; + /* Copy the distinct column values from the buffer. */ memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length); (*intervall_buff)+=intervall_length; bit_buff->pos+=intervall_length; bit_buff->bits=0; } - return 0; + DBUG_RETURN(0); } +/* + Make a quick_table for faster decoding. + + SYNOPSIS + make_quick_table() + to_table Target quick_table and remaining decode table. + decode_table Source Huffman (sub-)tree within tmp_buff. + next_free_offset IN/OUT Next free offset from to_table. + Starts behind quick_table on the top-level. + value Huffman bits found so far. + bits Remaining bits to be collected. + max_bits Total number of bits to collect (table_bits). + + DESCRIPTION + + The quick table is an array of 16-bit values. There exists one value + for each possible code representable by max_bits (table_bits) bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot for + this value is a valid Huffman code for a resulting byte value. + + The low-order 8 bits (1..8) are the resulting byte value. + + Bits 9..14 are the length of the Huffman code for this byte value. + This means so many bits from the input stream were needed to + represent this byte value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entires in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + + RETURN + void +*/ + static void make_quick_table(uint16 *to_table, uint16 *decode_table, uint *next_free_offset, uint value, uint bits, uint max_bits) { + DBUG_ENTER("make_quick_table"); + + /* + When down the table to the requested maximum, copy the rest of the + Huffman table. + */ if (!bits--) { + /* + Remaining left Huffman tree segment starts behind quick table. + Remaining right Huffman tree segment starts behind left segment. + */ to_table[value]= (uint16) *next_free_offset; - *next_free_offset=copy_decode_table(to_table, *next_free_offset, - decode_table); - return; + /* + Re-construct the remaining Huffman tree segment at + next_free_offset in to_table. + */ + *next_free_offset= copy_decode_table(to_table, *next_free_offset, + decode_table); + DBUG_VOID_RETURN; } + + /* Descent on the left side. Left side bits are clear (0). */ if (!(*decode_table & IS_CHAR)) { - make_quick_table(to_table,decode_table+ *decode_table, - next_free_offset,value,bits,max_bits); + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table, decode_table + *decode_table, + next_free_offset, value, bits, max_bits); } else - fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table + value, bits, max_bits, (uint) *decode_table); + } + + /* Descent on the right side. Right side bits are set (1). */ decode_table++; value|= (1 << bits); if (!(*decode_table & IS_CHAR)) { - make_quick_table(to_table,decode_table+ *decode_table, - next_free_offset,value,bits,max_bits); + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table, decode_table + *decode_table, + next_free_offset, value, bits, max_bits); } else - fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); - return; + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table + value, bits, max_bits, (uint) *decode_table); + } + + DBUG_VOID_RETURN; } +/* + Fill quick_table for all possible values starting with this Huffman code. + + SYNOPSIS + fill_quick_table() + table Target quick_table position. + bits Unused bits from max_bits. + max_bits Total number of bits to collect (table_bits). + value The byte encoded by the found Huffman code. + + DESCRIPTION + + Fill the segment (all slots) of the quick_table array with the + resulting value for the found Huffman code. There are as many slots + as there are combinations representable by the unused bits. + + In most cases we use 9 table bits. Assume a 3-bit Huffman code. Then + there are 6 unused bits. Hence we fill 2**6 = 64 slots with the + value. + + RETURN + void +*/ + static void fill_quick_table(uint16 *table, uint bits, uint max_bits, uint value) { uint16 *end; - value|=(max_bits-bits) << 8; - for (end=table+ (1 << bits) ; - table < end ; - *table++ = (uint16) value | IS_CHAR) ; + DBUG_ENTER("fill_quick_table"); + + /* + Bits 1..8 of value represent the decoded byte value. + Bits 9..14 become the length of the Huffman code for this byte value. + Bit 16 flags a valid code (IS_CHAR). + */ + value|= (max_bits - bits) << 8 | IS_CHAR; + + for (end= table + (1 << bits); table < end; table++) + { + *table= (uint16) value; + } + DBUG_VOID_RETURN; } +/* + Reconstruct a decode subtree at the target position. + + SYNOPSIS + copy_decode_table() + to_pos Target quick_table and remaining decode table. + offset Next free offset from to_pos. + decode_table Source Huffman subtree within tmp_buff. + + NOTE + Pointers in the decode tree are relative to the pointers position. + + RETURN + next free offset from to_pos. +*/ + static uint copy_decode_table(uint16 *to_pos, uint offset, uint16 *decode_table) { uint prev_offset; prev_offset= offset; + DBUG_ENTER("copy_decode_table"); + /* Descent on the left side. */ if (!(*decode_table & IS_CHAR)) { + /* Set a pointer to the next target node. */ to_pos[offset]=2; + /* Copy the left hand subtree there. */ offset=copy_decode_table(to_pos,offset+2,decode_table+ *decode_table); } else { + /* Copy the byte value. */ to_pos[offset]= *decode_table; + /* Step behind this node. */ offset+=2; } - decode_table++; + /* Descent on the right side. */ + decode_table++; if (!(*decode_table & IS_CHAR)) { + /* Set a pointer to the next free target node. */ to_pos[prev_offset+1]=(uint16) (offset-prev_offset-1); + /* Copy the right hand subtree to the entry of that node. */ offset=copy_decode_table(to_pos,offset,decode_table+ *decode_table); } else + { + /* Copy the byte value. */ to_pos[prev_offset+1]= *decode_table; - return offset; + } + DBUG_RETURN(offset); } +/* + Find the length of the longest Huffman code in this table in bits. + + SYNOPSIS + find_longest_bitstream() + table Code (sub-)table start. + end End of code table. + + IMPLEMENTATION + + Recursively follow the branch(es) of the code pair on every level of + the tree until two byte values (and no branch) are found. Add one to + each level when returning back from each recursion stage. + + 'end' is used for error checking only. A clean tree terminates + before reaching 'end'. Hence the exact value of 'end' is not too + important. However having it higher than necessary could lead to + misbehaviour should 'next' jump into the dirty area. + + RETURN + length Length of longest Huffman code in bits. + >= OFFSET_TABLE_SIZE Error, broken tree. It does not end before 'end'. +*/ + static uint find_longest_bitstream(uint16 *table, uint16 *end) { - uint length=1,length2; + uint length= 1; + uint length2; + if (!(*table & IS_CHAR)) { uint16 *next= table + *table; if (next > end || next == table) - return ~0; - length=find_longest_bitstream(next, end)+1; + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length= find_longest_bitstream(next, end) + 1; } table++; if (!(*table & IS_CHAR)) { uint16 *next= table + *table; if (next > end || next == table) - return ~0; - length2=find_longest_bitstream(table+ *table, end)+1; + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length2= find_longest_bitstream(next, end) + 1; length=max(length,length2); } return length; @@ -855,18 +1113,46 @@ static void decode_bytes(MI_COLUMNDEF *rec,MI_BIT_BUFF *bit_buff,uchar *to, bit_buff->pos+=4; bits+=32; } - /* First use info in quick_table */ + /* + First use info in quick_table. + + The quick table is an array of 16-bit values. There exists one + value for each possible code representable by table_bits bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot + for this value is a valid Huffman code for a resulting byte value. + + The low-order 8 bits (1..8) are the resulting byte value. + + Bits 9..14 are the length of the Huffman code for this byte value. + This means so many bits from the input stream were needed to + represent this byte value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entires in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + */ low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and; low_byte=decode_tree->table[low_byte]; if (low_byte & IS_CHAR) { + /* + All Huffman codes of less or equal table_bits length are in the + quick table. This is one of them. + */ *to++ = (low_byte & 255); /* Found char in quick table */ bits-= ((low_byte >> 8) & 31); /* Remove bits used */ } else { /* Map through rest of decode-table */ + /* This means that the Huffman code must be longer than table_bits. */ pos=decode_tree->table+low_byte; bits-=table_bits; + /* NOTE: decode_bytes_test_bit() is a macro wich contains a break !!! */ for (;;) { low_byte=(uint) (bit_buff->current_byte >> (bits-8)); @@ -1092,6 +1378,11 @@ uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff, { head_length+= read_pack_length((uint) myisam->s->pack.version, header + head_length, &info->blob_len); + /* + Ensure that the record buffer is big enough for the compressed + record plus all expanded blobs. [We do not have an extra buffer + for the resulting blobs. Sigh.] + */ if (!(mi_alloc_rec_buff(myisam,info->rec_len + info->blob_len, rec_buff_p))) return BLOCK_FATAL_ERROR; /* not enough memory */ |