1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
|
/*****************************************************************************
Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/
/******************************************************************//**
@file fil/fil0pagecompress.cc
Implementation for page compressed file spaces.
Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
***********************************************************************/
#include "fil0fil.h"
#include "fil0pagecompress.h"
#include <debug_sync.h>
#include <my_dbug.h>
#include "mem0mem.h"
#include "hash0hash.h"
#include "os0file.h"
#include "mach0data.h"
#include "buf0buf.h"
#include "buf0flu.h"
#include "log0recv.h"
#include "fsp0fsp.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "dict0dict.h"
#include "page0page.h"
#include "page0zip.h"
#include "trx0sys.h"
#include "row0mysql.h"
#ifndef UNIV_HOTBACKUP
# include "buf0lru.h"
# include "ibuf0ibuf.h"
# include "sync0sync.h"
# include "os0sync.h"
#else /* !UNIV_HOTBACKUP */
# include "srv0srv.h"
static ulint srv_data_read, srv_data_written;
#endif /* !UNIV_HOTBACKUP */
#include "zlib.h"
#ifdef __linux__
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <linux/falloc.h>
#endif
#include "row0mysql.h"
/****************************************************************//**
For page compressed pages compress the page before actual write
operation.

On success out_buf holds, in order:
 - the first FIL_PAGE_DATA bytes of buf, with the checksum field set to
   BUF_NO_CHECKSUM_MAGIC, the page type set to FIL_PAGE_PAGE_COMPRESSED
   and the flush-LSN field reused to store the compression algorithm,
 - a 2-byte payload length at offset FIL_PAGE_DATA,
 - the zlib stream produced by compress2(),
 - zero padding up to the next multiple of OS_FILE_LOG_BLOCK_SIZE.

If compress2() returns anything other than Z_OK the input page is left
as it was: buf is returned and *out_len is set to len.
@return compressed page to be written (out_buf), or buf if compression
failed */
byte*
fil_compress_page(
/*==============*/
	ulint		space_id,	/*!< in: tablespace id of the
					table. */
	byte*		buf,		/*!< in: buffer from which to write; in aio
					this must be appropriately aligned */
	byte*		out_buf,	/*!< out: compressed buffer */
	ulint		len,		/*!< in: length of input buffer.*/
	ulint*		out_len)	/*!< out: actual length of compressed page */
{
	int	err = Z_OK;
	int	level = 0;
	ulint	header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE;
	ulint	write_size = 0;
	ulint	payload_end = 0;

	ut_a(buf);
	ut_a(out_buf);
	ut_a(len);
	ut_a(out_len);

	level = fil_space_get_page_compression_level(space_id);
	ut_a(fil_space_is_page_compressed(space_id));

	fil_system_enter();
	fil_space_t*	space = fil_space_get_by_id(space_id);
	fil_system_exit();

	/* If no compression level was provided to this table, use system
	default level */
	if (level == 0) {
		level = srv_compress_zlib_level;
	}

#ifdef UNIV_DEBUG
	fprintf(stderr,
		"InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n",
		space_id, fil_space_name(space), len);
#endif

	/* The compressed payload must fit in one page after the header;
	compress2() updates write_size to the number of bytes produced. */
	write_size = UNIV_PAGE_SIZE - header_len;
	err = compress2(out_buf+header_len, &write_size, buf, len, level);

	if (err != Z_OK) {
		/* If error we leave the actual page as it was */
		fprintf(stderr,
			"InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n",
			space_id, fil_space_name(space), len, err, write_size);

		*out_len = len;
		return(buf);
	}

	/* Set up the page header */
	memcpy(out_buf, buf, FIL_PAGE_DATA);
	/* Set up the checksum */
	mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
	/* Set up the correct page type */
	mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
	/* Set up the flush lsn to be compression algorithm */
	mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB);
	/* Set up the actual payload length */
	mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size);

#ifdef UNIV_DEBUG
	/* Verify */
	ut_ad(fil_page_is_compressed(out_buf));
	ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC);
	ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size);
	ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB);
#endif

	payload_end = write_size + header_len;
	write_size = payload_end;

	/* Actual write needs to be aligned on block size */
	if (write_size % OS_FILE_LOG_BLOCK_SIZE) {
		write_size += OS_FILE_LOG_BLOCK_SIZE
			- (write_size % OS_FILE_LOG_BLOCK_SIZE);

		/* Clear the alignment padding so that stale contents of
		out_buf are never written to disk. */
		memset(out_buf + payload_end, 0, write_size - payload_end);
	}

#ifdef UNIV_DEBUG
	fprintf(stderr,
		"InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n",
		space_id, fil_space_name(space), len, write_size);
#endif

#define SECT_SIZE 512

	/* len and write_size are unsigned: only account for savings when
	the aligned compressed size really is smaller than the input,
	otherwise the subtraction would wrap around. */
	if (len > write_size) {
		ulint	saved = len - write_size;

		srv_stats.page_compression_saved.add(saved);
		srv_stats.page_compression_trim_sect512.add((saved / SECT_SIZE));
		srv_stats.page_compression_trim_sect4096.add((saved / (SECT_SIZE*8)));
	}

	srv_stats.pages_page_compressed.inc();

	*out_len = write_size;
	return(out_buf);
}
/****************************************************************//**
For page compressed pages decompress the page after actual read
operation.

The page in buf must carry BUF_NO_CHECKSUM_MAGIC in the checksum field
and FIL_PAGE_PAGE_COMPRESSED as page type; the compression algorithm is
read from the flush-LSN field and the payload length from the 2 bytes
at FIL_PAGE_DATA. The payload is uncompressed into a scratch buffer and
then copied back over buf. Any inconsistency (wrong header, unknown
algorithm, impossible payload length, zlib error) is reported on stderr
and ends in ut_error. */
void
fil_decompress_page(
/*================*/
	byte*		page_buf,	/*!< in: preallocated scratch buffer
					or NULL, in which case a temporary
					buffer of UNIV_PAGE_SIZE bytes is
					allocated and freed here */
	byte*		buf,		/*!< in/out: compressed page on entry,
					uncompressed page on return; in aio
					this must be appropriately aligned */
	ulint		len)		/*!< in: length of output buffer.*/
{
	int	err = 0;
	ulint	actual_size = 0;
	ulint	compression_alg = 0;
	byte*	in_buf = NULL;

	ut_a(buf);
	ut_a(len);

	/* Before actual decompress, make sure that page type is correct */
	if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC ||
	    mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) {
		fprintf(stderr,
			"InnoDB: Corruption: We try to uncompress corrupted page\n"
			"InnoDB: CRC %lu type %lu.\n"
			"InnoDB: len %lu\n",
			mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM),
			mach_read_from_2(buf+FIL_PAGE_TYPE), len);

		fflush(stderr);
		ut_error;
	}

	/* Get compression algorithm */
	compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN);

	if (compression_alg != FIL_PAGE_COMPRESSION_ZLIB) {
		fprintf(stderr,
			"InnoDB: Corruption: Page is marked as compressed\n"
			"InnoDB: but compression algorithm %s\n"
			"InnoDB: is not known.\n"
			,fil_get_compression_alg_name(compression_alg));

		fflush(stderr);
		ut_error;
	}

	/* Get the actual size of compressed page */
	actual_size = mach_read_from_2(buf+FIL_PAGE_DATA);

	/* The writer never stores a payload larger than what fits in a
	page after the header, and a zlib stream is never empty. Reject
	anything else before handing the length to zlib, so that a
	corrupted length field cannot make uncompress() read past the
	page. */
	if (actual_size == 0
	    || actual_size > UNIV_PAGE_SIZE
			     - (FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE)) {
		fprintf(stderr,
			"InnoDB: Corruption: Page is marked as compressed\n"
			"InnoDB: but payload size %lu is not valid.\n"
			"InnoDB: len %lu\n",
			actual_size, len);

		fflush(stderr);
		ut_error;
	}

	/* If no buffer was given, we need to allocate temporary buffer */
	if (page_buf == NULL) {
		in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE));
	} else {
		in_buf = page_buf;
	}

#ifdef UNIV_DEBUG
	fprintf(stderr,
		"InnoDB: Note: Preparing for decompress for len %lu\n",
		actual_size);
#endif

	/* uncompress() takes len as the capacity of in_buf and replaces
	it with the number of bytes actually produced. */
	err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size);

	/* If uncompress fails it means that page is corrupted */
	if (err != Z_OK) {
		fprintf(stderr,
			"InnoDB: Corruption: Page is marked as compressed\n"
			"InnoDB: but uncompress failed with error %d.\n"
			"InnoDB: size %lu len %lu\n",
			err, actual_size, len);

		fflush(stderr);
		ut_error;
	}

#ifdef UNIV_DEBUG
	fprintf(stderr,
		"InnoDB: Note: Decompression succeeded for len %lu \n",
		len);
#endif

	/* Copy the uncompressed page to the buffer pool, not
	really any other options. */
	memcpy(buf, in_buf, len);

	/* Need to free temporary buffer if no buffer was given */
	if (page_buf == NULL) {
		ut_free(in_buf);
	}

	srv_stats.pages_page_decompressed.inc();
}
/*******************************************************************//**
Find out whether the page is an index page, judging by the 2-byte page
type stored at offset FIL_PAGE_TYPE.
@return true if the page type is FIL_PAGE_INDEX, false if not */
ibool
fil_page_is_index_page(
/*===================*/
	byte *buf)	/*!< in: page */
{
	const ulint	page_type = mach_read_from_2(buf + FIL_PAGE_TYPE);

	return(page_type == FIL_PAGE_INDEX);
}
/*******************************************************************//**
Find out whether the page is page compressed, judging by the 2-byte
page type stored at offset FIL_PAGE_TYPE.
@return true if the page type is FIL_PAGE_PAGE_COMPRESSED, false if not */
ibool
fil_page_is_compressed(
/*===================*/
	byte *buf)	/*!< in: page */
{
	const ulint	page_type = mach_read_from_2(buf + FIL_PAGE_TYPE);

	return(page_type == FIL_PAGE_PAGE_COMPRESSED);
}
/*******************************************************************//**
Returns the page compression level of the space, or 0 if the space
flags are 0. The tablespace must be cached in the memory cache.
@return page compression level extracted from the space flags; the
value of fil_space_get_flags() itself (0 or ULINT_UNDEFINED) when the
flags are zero or undefined */
ulint
fil_space_get_page_compression_level(
/*=================================*/
	ulint	id)	/*!< in: space id */
{
	const ulint	space_flags = fil_space_get_flags(id);

	/* Nothing to decode: hand the sentinel straight back. */
	if (space_flags == 0 || space_flags == ULINT_UNDEFINED) {
		return(space_flags);
	}

	return(fsp_flags_get_page_compression_level(space_flags));
}
/*******************************************************************//**
Extract the page compression from space.
@return true if space is page compressed, false if space is not found
or space is not page compressed. */
ibool
fil_space_is_page_compressed(
/*=========================*/
	ulint	id)	/*!< in: space id */
{
	ulint	flags;

	flags = fil_space_get_flags(id);

	/* fil_space_get_flags() may hand back ULINT_UNDEFINED; that value
	is non-zero, so it must not be returned as the boolean result or
	callers would see a space that yielded no flags as page
	compressed. */
	if (flags == 0 || flags == ULINT_UNDEFINED) {
		return(0);
	}

	return(fsp_flags_is_page_compressed(flags));
}
/****************************************************************//**
Map a page compression algorithm number to a printable name.
@return compression algorithm name or "UNKNOWN" if not known*/
const char*
fil_get_compression_alg_name(
/*=========================*/
	ulint	comp_alg)	/*!<in: compression algorithm number */
{
	if (comp_alg == FIL_PAGE_COMPRESSION_ZLIB) {
		return("ZLIB");
	}

	/* Every other number is an algorithm this code does not know. */
	return("UNKNOWN");
}
/*******************************************************************//**
Returns the atomic writes flag of the space, or false if the space
is not using atomic writes. The tablespace must be cached in the memory cache.
@return true if space using atomic writes, false if not (including the
case where no flags could be obtained for the space) */
ibool
fil_space_get_atomic_writes(
/*========================*/
	ulint	id)	/*!< in: space id */
{
	ulint	flags;

	flags = fil_space_get_flags(id);

	/* ULINT_UNDEFINED is non-zero; returning it as the boolean result
	would report atomic writes for a space that yielded no flags. */
	if (flags == 0 || flags == ULINT_UNDEFINED) {
		return(0);
	}

	return(fsp_flags_get_atomic_writes(flags));
}
|