MDEV-11233 CREATE FULLTEXT INDEX with a token longer than 127 bytes

crashes server This bug is the result of merging the Oracle MySQL follow-up fix BUG#22963169 MYSQL CRASHES ON CREATE FULLTEXT INDEX without merging the base bug fix: Bug#79475 Insert a token of 84 4-bytes chars into fts index causes server crash. Unlike the above mentioned fixes in MySQL, our fix will not change the storage format of fulltext indexes in InnoDB or XtraDB when a character encoding with mbmaxlen=2 or mbmaxlen=3 and the length of a word is between 128 and 84*mbmaxlen bytes. The Oracle fix would allocate 2 length bytes for these cases. Compatibility with other MySQL and MariaDB releases is ensured by persisting the used maximum length in the SYS_COLUMNS table in the InnoDB data dictionary. This fix also removes some unnecessary strcmp() calls when checking for the legacy default collation my_charset_latin1 (my_charset_latin1.name=="latin1_swedish_ci"). fts_create_one_index_table(): Store the actual length in bytes. This metadata will be written to the SYS_COLUMNS table. fts_zip_initialize(): Initialize only the first byte of the buffer. Actually the code should not even care about this first byte, because the length is set as 0. FTX_MAX_WORD_LEN: Define as HA_FT_MAXCHARLEN * 4 aka 336 bytes, not as 254 bytes. row_merge_create_fts_sort_index(): Set the actual maximum length of the column in bytes, similar to fts_create_one_index_table(). row_merge_fts_doc_tokenize(): Remove the redundant parameter word_dtype. Use the actual maximum length of the column. Calculate the extra_size in the same way as row_merge_buf_encode() does.
author: Marko Mäkelä <marko.makela@mariadb.com> 2016-12-05 15:25:59 +0200
committer: Marko Mäkelä <marko.makela@mariadb.com> 2017-01-27 10:19:39 +0200
commit: 732672c3044e60fb0d1dfdb466bd3c3d13ea2f8d (patch)
tree: 58e19d71f428f99f6c2589929d5923c597b73107
parent: afb461587c0b7dea2e5e70a165e8d4d437c3f964 (diff)
download: mariadb-git-732672c3044e60fb0d1dfdb466bd3c3d13ea2f8d.tar.gz
13 files changed, 365 insertions, 108 deletions
diff --git a/mysql-test/suite/innodb_fts/r/create.result b/mysql-test/suite/innodb_fts/r/create.result
new file mode 100644
index 00000000000..c537aa81efd
--- /dev/null
+++ b/mysql-test/suite/innodb_fts/r/create.result
@@ -0,0 +1,168 @@
+SET NAMES utf8mb4;
+#
+# MDEV-11233 CREATE FULLTEXT INDEX with a token
+# longer than 127 bytes crashes server
+#
+CREATE TABLE t(t TEXT CHARACTER SET utf8mb3) ENGINE=InnoDB;
+INSERT INTO t SET t=REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5);
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84);
+INSERT INTO t SET t=REPEAT('befor',17);
+INSERT INTO t SET t='BeforeTheIndexCreation';
+CREATE FULLTEXT INDEX ft ON t(t);
+Warnings:
+Warning	124	InnoDB rebuilding table to add column FTS_DOC_ID
+INSERT INTO t SET t='this was inserted after creating the index';
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84);
+INSERT INTO t SET t=REPEAT('after',17);
+INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15);
+# The data below is not 3-byte UTF-8, but 4-byte chars.
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84);
+Warnings:
+Warning	1366	Incorrect string value: '\xF0\x9F\x96\x95\xF0\x9F...' for column 't' at row 1
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85);
+Warnings:
+Warning	1366	Incorrect string value: '\xF0\x9F\x96\x96\xF0\x9F...' for column 't' at row 1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST
+(REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5));
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation');
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after');
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84));
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84));
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85));
+COUNT(*)
+0
+SELECT * FROM t;
+t
+็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้
+００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００
+beforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbefor
+BeforeTheIndexCreation
+this was inserted after creating the index
+１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１
+afterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafter
+甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文
+????????????????????????????????????????????????????????????????????????????????????
+?????????????????????????????????????????????????????????????????????????????????????
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len;
+len	COUNT(*)
+252	6
+DROP TABLE t;
+CREATE TABLE t(t TEXT CHARACTER SET utf8mb4) ENGINE=InnoDB;
+INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15);
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84);
+INSERT INTO t SET t=REPEAT('befor',17);
+INSERT INTO t SET t='BeforeTheIndexCreation';
+CREATE FULLTEXT INDEX ft ON t(t);
+Warnings:
+Warning	124	InnoDB rebuilding table to add column FTS_DOC_ID
+INSERT INTO t SET t='this was inserted after creating the index';
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84);
+INSERT INTO t SET t=REPEAT('after',17);
+INSERT INTO t SET t=REPEAT(concat(repeat(_utf8mb3 0xE0B987, 4), repeat(_utf8mb3 0xE0B989, 5)), 5);
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84);
+# The token below exceeds the 84-character limit.
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85);
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15));
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation');
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after');
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84));
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84));
+COUNT(*)
+1
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84));
+COUNT(*)
+0
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85));
+COUNT(*)
+0
+SELECT * FROM t;
+t
+甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文甲骨文
+００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００００
+beforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbeforbefor
+BeforeTheIndexCreation
+this was inserted after creating the index
+１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１１
+afterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafterafter
+็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้็็็็้้้้้
+🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕🖕
+🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖🖖
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len;
+len	COUNT(*)
+336	6
+DROP TABLE t;
+CREATE TABLE t(t TEXT CHARACTER SET latin1, FULLTEXT INDEX(t))
+ENGINE=InnoDB;
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len;
+len	COUNT(*)
+84	6
+DROP TABLE t;
diff --git a/mysql-test/suite/innodb_fts/t/create.opt b/mysql-test/suite/innodb_fts/t/create.opt
new file mode 100644
index 00000000000..3ad568c816e
--- /dev/null
+++ b/mysql-test/suite/innodb_fts/t/create.opt
@@ -0,0 +1 @@
+--loose-innodb-sys-columns
diff --git a/mysql-test/suite/innodb_fts/t/create.test b/mysql-test/suite/innodb_fts/t/create.test
new file mode 100644
index 00000000000..f0329602ed1
--- /dev/null
+++ b/mysql-test/suite/innodb_fts/t/create.test
@@ -0,0 +1,92 @@
+--source include/have_innodb.inc
+SET NAMES utf8mb4;
+
+--echo #
+--echo # MDEV-11233 CREATE FULLTEXT INDEX with a token
+--echo # longer than 127 bytes crashes server
+--echo #
+
+# This bug is the result of merging the Oracle MySQL follow-up fix
+# BUG#22963169 MYSQL CRASHES ON CREATE FULLTEXT INDEX
+# without merging a fix of Bug#79475 Insert a token of 84 4-bytes
+# chars into fts index causes server crash.
+
+# Oracle did not publish tests for either of the above MySQL bugs.
+# The tests below were developed for MariaDB Server.
+# The maximum length of a fulltext-indexed word is 84 characters.
+
+CREATE TABLE t(t TEXT CHARACTER SET utf8mb3) ENGINE=InnoDB;
+INSERT INTO t SET t=REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5);
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84);
+INSERT INTO t SET t=REPEAT('befor',17); # too long, will not be indexed
+INSERT INTO t SET t='BeforeTheIndexCreation';
+CREATE FULLTEXT INDEX ft ON t(t);
+INSERT INTO t SET t='this was inserted after creating the index';
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84);
+INSERT INTO t SET t=REPEAT('after',17); # too long, will not be indexed
+INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15);
+--echo # The data below is not 3-byte UTF-8, but 4-byte chars.
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84);
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85);
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST
+(REPEAT(CONCAT(REPEAT(_utf8mb3 0xE0B987, 4), REPEAT(_utf8mb3 0xE0B989, 5)), 5));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation');
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after');
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85));
+SELECT * FROM t;
+
+# The column length should be 252 bytes (84 characters * 3 bytes/character).
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len;
+DROP TABLE t;
+
+CREATE TABLE t(t TEXT CHARACTER SET utf8mb4) ENGINE=InnoDB;
+INSERT INTO t SET t=REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15);
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc90,84);
+INSERT INTO t SET t=REPEAT('befor',17); # too long, will not be indexed
+INSERT INTO t SET t='BeforeTheIndexCreation';
+CREATE FULLTEXT INDEX ft ON t(t);
+INSERT INTO t SET t='this was inserted after creating the index';
+INSERT INTO t SET t=REPEAT(_utf8 0xefbc91,84);
+INSERT INTO t SET t=REPEAT('after',17); # too long, will not be indexed
+INSERT INTO t SET t=REPEAT(concat(repeat(_utf8mb3 0xE0B987, 4), repeat(_utf8mb3 0xE0B989, 5)), 5);
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9695, 84);
+--echo # The token below exceeds the 84-character limit.
+INSERT INTO t SET t=REPEAT(_utf8mb4 0xf09f9696, 85);
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb3 0xe794b2e9aaa8e69687, 15));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('BeforeTheIndexCreation');
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('befor',17));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST ('after');
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT('after',17));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 83));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc90, 85));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 83));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8 0xefbc91, 85));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 83));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9695, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 84));
+SELECT COUNT(*) FROM t WHERE MATCH t AGAINST (REPEAT(_utf8mb4 0xf09f9696, 85));
+SELECT * FROM t;
+
+# The column length should be 336 bytes (84 characters * 4 bytes/character).
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len;
+DROP TABLE t;
+
+CREATE TABLE t(t TEXT CHARACTER SET latin1, FULLTEXT INDEX(t))
+ENGINE=InnoDB;
+
+# The column length should be 84 bytes (84 characters * 1 byte/character).
+SELECT len,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS where name='word' GROUP BY len;
+DROP TABLE t;
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
index 6059c28eabc..18d09c8138f 100644
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, MariaDB Corporation. All Rights reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -1934,6 +1935,8 @@ func_exit:
 /*************************************************************//**
 Wrapper function of fts_create_index_tables_low(), create auxiliary
 tables for an FTS index
+
+@see row_merge_create_fts_sort_index()
 @return: DB_SUCCESS or error code */
 static
 dict_table_t*
@@ -1965,13 +1968,12 @@ fts_create_one_index_table(
 		(int)(field->col->prtype & DATA_MYSQL_TYPE_MASK),
 		(uint) dtype_get_charset_coll(field->col->prtype));
 
-	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
-		dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR,
-				       field->col->prtype, FTS_MAX_WORD_LEN);
-	} else {
-		dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL,
-				       field->col->prtype, FTS_MAX_WORD_LEN);
-	}
+	dict_mem_table_add_col(new_table, heap, "word",
+			       charset == &my_charset_latin1
+			       ? DATA_VARCHAR : DATA_VARMYSQL,
+			       field->col->prtype,
+			       FTS_MAX_WORD_LEN_IN_CHAR
+			       * DATA_MBMAXLEN(field->col->mbminmaxlen));
 
 	dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
 			       DATA_NOT_NULL | DATA_UNSIGNED,
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
index 455e2669c0d..cb30122adcb 100644
--- a/storage/innobase/fts/fts0opt.cc
+++ b/storage/innobase/fts/fts0opt.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, MariaDB Corporation. All Rights reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -281,7 +282,7 @@ fts_zip_initialize(
 	zip->last_big_block = 0;
 
 	zip->word.f_len = 0;
-	memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN);
+	*zip->word.f_str = 0;
 
 	ib_vector_reset(zip->blocks);
 
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index f94c564c8b3..2c84530e6ac 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -3539,7 +3539,6 @@ innobase_change_buffering_inited_ok:
 	and consequently we do not need to know the ordering internally in
 	InnoDB. */
 
-	ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci"));
 	srv_latin1_ordering = my_charset_latin1.sort_order;
 
 	innobase_commit_concurrency_init_default();
@@ -6149,18 +6148,16 @@ get_innobase_type_from_mysql_type(
 	case MYSQL_TYPE_VARCHAR:	/* new >= 5.0.3 true VARCHAR */
 		if (field->binary()) {
 			return(DATA_BINARY);
-		} else if (strcmp(field->charset()->name,
-				  "latin1_swedish_ci") == 0) {
+		} else if (field->charset() == &my_charset_latin1) {
 			return(DATA_VARCHAR);
 		} else {
 			return(DATA_VARMYSQL);
 		}
 	case MYSQL_TYPE_BIT:
-	case MYSQL_TYPE_STRING: if (field->binary()) {
-
+	case MYSQL_TYPE_STRING:
+		if (field->binary()) {
 			return(DATA_FIXBINARY);
-		} else if (strcmp(field->charset()->name,
-				  "latin1_swedish_ci") == 0) {
+		} else if (field->charset() == &my_charset_latin1) {
 			return(DATA_CHAR);
 		} else {
 			return(DATA_MYSQL);
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
index 3e2f359bbeb..7aa7055640c 100644
--- a/storage/innobase/include/fts0fts.h
+++ b/storage/innobase/include/fts0fts.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, MariaDB Corporation. All Rights reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -366,8 +367,8 @@ extern ulong		fts_min_token_size;
 need a sync to free some memory */
 extern bool		fts_need_sync;
 
-/** Maximum possible Fulltext word length */
-#define FTS_MAX_WORD_LEN		HA_FT_MAXBYTELEN
+/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */
+#define FTS_MAX_WORD_LEN		(HA_FT_MAXCHARLEN * 4)
 
 /** Maximum possible Fulltext word length (in characters) */
 #define FTS_MAX_WORD_LEN_IN_CHAR	HA_FT_MAXCHARLEN
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index e4a88047955..f662ef6b461 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -57,6 +57,8 @@ tokenized doc string. The index has three "fields":
 integer value)
 3) Word's position in original doc.
 
+@see fts_create_one_index_table()
+
 @return dict_index_t structure for the fts sort index */
 UNIV_INTERN
 dict_index_t*
@@ -96,16 +98,12 @@ row_merge_create_fts_sort_index(
 	field->prefix_len = 0;
 	field->col = static_cast<dict_col_t*>(
 		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
-	field->col->len = FTS_MAX_WORD_LEN;
-
-	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
-		field->col->mtype = DATA_VARCHAR;
-	} else {
-		field->col->mtype = DATA_VARMYSQL;
-	}
-
 	field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+	field->col->mtype = charset == &my_charset_latin1
+		? DATA_VARCHAR : DATA_VARMYSQL;
 	field->col->mbminmaxlen = idx_field->col->mbminmaxlen;
+	field->col->len = HA_FT_MAXCHARLEN * DATA_MBMAXLEN(field->col->mbminmaxlen);
+
 	field->fixed_len = 0;
 
 	/* Doc ID */
@@ -359,6 +357,7 @@ row_fts_free_pll_merge_buf(
 
 /*********************************************************************//**
 Tokenize incoming text data and add to the sort buffer.
+@see row_merge_buf_encode()
 @return	TRUE if the record passed, FALSE if out of space */
 static
 ibool
@@ -367,8 +366,6 @@ row_merge_fts_doc_tokenize(
 	row_merge_buf_t**	sort_buf,	/*!< in/out: sort buffer */
 	doc_id_t		doc_id,		/*!< in: Doc ID */
 	fts_doc_t*		doc,		/*!< in: Doc to be tokenized */
-	dtype_t*		word_dtype,	/*!< in: data structure for
-						word col */
 	merge_file_t**		merge_file,	/*!< in/out: merge file */
 	ibool			opt_doc_id_size,/*!< in: whether to use 4 bytes
 						instead of 8 bytes integer to
@@ -400,7 +397,7 @@ row_merge_fts_doc_tokenize(
 		ulint		idx = 0;
 		ib_uint32_t	position;
 		ulint           offset = 0;
-		ulint		cur_len = 0;
+		ulint		cur_len;
 		doc_id_t	write_doc_id;
 
 		inc = innobase_mysql_fts_get_token(
@@ -454,14 +451,34 @@ row_merge_fts_doc_tokenize(
 		dfield_set_data(field, t_str.f_str, t_str.f_len);
 		len = dfield_get_len(field);
 
-		field->type.mtype = word_dtype->mtype;
-		field->type.prtype = word_dtype->prtype | DATA_NOT_NULL;
+		dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type);
+		field->type.prtype |= DATA_NOT_NULL;
+		ut_ad(len <= field->type.len);
 
-		/* Variable length field, set to max size. */
-		field->type.len = FTS_MAX_WORD_LEN;
-		field->type.mbminmaxlen = word_dtype->mbminmaxlen;
+		/* For the temporary file, row_merge_buf_encode() uses
+		1 byte for representing the number of extra_size bytes.
+		This number will always be 1, because for this 3-field index
+		consisting of one variable-size column, extra_size will always
+		be 1 or 2, which can be encoded in one byte.
+
+		The extra_size is 1 byte if the length of the
+		variable-length column is less than 128 bytes or the
+		maximum length is less than 256 bytes. */
+
+		/* One variable length column, word with its lenght less than
+		fts_max_token_size, add one extra size and one extra byte.
+
+		Since the max length for FTS token now is larger than 255,
+		so we will need to signify length byte itself, so only 1 to 128
+		bytes can be used for 1 bytes, larger than that 2 bytes. */
+		if (len < 128 || field->type.len < 256) {
+			/* Extra size is one byte. */
+			cur_len = 2 + len;
+		} else {
+			/* Extra size is two bytes. */
+			cur_len = 3 + len;
+		}
 
-		cur_len += len;
 		dfield_dup(field, buf->heap);
 		field++;
 
@@ -511,20 +528,6 @@ row_merge_fts_doc_tokenize(
 		cur_len += len;
 		dfield_dup(field, buf->heap);
 
-		/* One variable length column, word with its lenght less than
-		fts_max_token_size, add one extra size and one extra byte.
-
-		Since the max length for FTS token now is larger than 255,
-		so we will need to signify length byte itself, so only 1 to 128
-		bytes can be used for 1 bytes, larger than that 2 bytes. */
-		if (t_str.f_len < 128) {
-			/* Extra size is one byte. */
-			cur_len += 2;
-		} else {
-			/* Extra size is two bytes. */
-			cur_len += 3;
-		}
-
 		/* Reserve one byte for the end marker of row_merge_block_t. */
 		if (buf->total_size + data_size[idx] + cur_len
 		    >= srv_sort_buf_size - 1) {
@@ -617,7 +620,6 @@ fts_parallel_tokenization(
 	mem_heap_t*		blob_heap = NULL;
 	fts_doc_t		doc;
 	dict_table_t*		table = psort_info->psort_common->new_table;
-	dtype_t			word_dtype;
 	dict_field_t*		idx_field;
 	fts_tokenize_ctx_t	t_ctx;
 	ulint			retried = 0;
@@ -642,10 +644,6 @@ fts_parallel_tokenization(
 
 	idx_field = dict_index_get_nth_field(
 		psort_info->psort_common->dup->index, 0);
-	word_dtype.prtype = idx_field->col->prtype;
-	word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen;
-	word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0)
-				? DATA_VARCHAR : DATA_VARMYSQL;
 
 	block = psort_info->merge_block;
 	zip_size = dict_table_zip_size(table);
@@ -696,7 +694,6 @@ loop:
 
 		processed = row_merge_fts_doc_tokenize(
 			buf, doc_item->doc_id, &doc,
-			&word_dtype,
 			merge_file, psort_info->psort_common->opt_doc_id_size,
 			&t_ctx);
 
diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc
index 4c54afae8cd..c1f0b0bd5fe 100644
--- a/storage/xtradb/fts/fts0fts.cc
+++ b/storage/xtradb/fts/fts0fts.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, MariaDB Corporation. All Rights reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -1934,6 +1935,8 @@ func_exit:
 /*************************************************************//**
 Wrapper function of fts_create_index_tables_low(), create auxiliary
 tables for an FTS index
+
+@see row_merge_create_fts_sort_index()
 @return: DB_SUCCESS or error code */
 static
 dict_table_t*
@@ -1965,13 +1968,12 @@ fts_create_one_index_table(
 		(int)(field->col->prtype & DATA_MYSQL_TYPE_MASK),
 		(uint) dtype_get_charset_coll(field->col->prtype));
 
-	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
-		dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR,
-				       field->col->prtype, FTS_MAX_WORD_LEN);
-	} else {
-		dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL,
-				       field->col->prtype, FTS_MAX_WORD_LEN);
-	}
+	dict_mem_table_add_col(new_table, heap, "word",
+			       charset == &my_charset_latin1
+			       ? DATA_VARCHAR : DATA_VARMYSQL,
+			       field->col->prtype,
+			       FTS_MAX_WORD_LEN_IN_CHAR
+			       * DATA_MBMAXLEN(field->col->mbminmaxlen));
 
 	dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
 			       DATA_NOT_NULL | DATA_UNSIGNED,
diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc
index ea937c20752..ed882d33548 100644
--- a/storage/xtradb/fts/fts0opt.cc
+++ b/storage/xtradb/fts/fts0opt.cc
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, MariaDB Corporation. All Rights reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -281,7 +282,7 @@ fts_zip_initialize(
 	zip->last_big_block = 0;
 
 	zip->word.f_len = 0;
-	memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN);
+	*zip->word.f_str = 0;
 
 	ib_vector_reset(zip->blocks);
 
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index ff4f218b557..1709d4e59a9 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -4003,7 +4003,6 @@ innobase_change_buffering_inited_ok:
 	and consequently we do not need to know the ordering internally in
 	InnoDB. */
 
-	ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci"));
 	srv_latin1_ordering = my_charset_latin1.sort_order;
 
 	innobase_commit_concurrency_init_default();
@@ -6839,18 +6838,16 @@ get_innobase_type_from_mysql_type(
 	case MYSQL_TYPE_VARCHAR:	/* new >= 5.0.3 true VARCHAR */
 		if (field->binary()) {
 			return(DATA_BINARY);
-		} else if (strcmp(field->charset()->name,
-				  "latin1_swedish_ci") == 0) {
+		} else if (field->charset() == &my_charset_latin1) {
 			return(DATA_VARCHAR);
 		} else {
 			return(DATA_VARMYSQL);
 		}
 	case MYSQL_TYPE_BIT:
-	case MYSQL_TYPE_STRING: if (field->binary()) {
-
+	case MYSQL_TYPE_STRING:
+		if (field->binary()) {
 			return(DATA_FIXBINARY);
-		} else if (strcmp(field->charset()->name,
-				  "latin1_swedish_ci") == 0) {
+		} else if (field->charset() == &my_charset_latin1) {
 			return(DATA_CHAR);
 		} else {
 			return(DATA_MYSQL);
diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h
index 3e2f359bbeb..7aa7055640c 100644
--- a/storage/xtradb/include/fts0fts.h
+++ b/storage/xtradb/include/fts0fts.h
@@ -1,6 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, MariaDB Corporation. All Rights reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -366,8 +367,8 @@ extern ulong		fts_min_token_size;
 need a sync to free some memory */
 extern bool		fts_need_sync;
 
-/** Maximum possible Fulltext word length */
-#define FTS_MAX_WORD_LEN		HA_FT_MAXBYTELEN
+/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */
+#define FTS_MAX_WORD_LEN		(HA_FT_MAXCHARLEN * 4)
 
 /** Maximum possible Fulltext word length (in characters) */
 #define FTS_MAX_WORD_LEN_IN_CHAR	HA_FT_MAXCHARLEN
diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc
index cb47d605623..ce138252a19 100644
--- a/storage/xtradb/row/row0ftsort.cc
+++ b/storage/xtradb/row/row0ftsort.cc
@@ -58,6 +58,8 @@ tokenized doc string. The index has three "fields":
 integer value)
 3) Word's position in original doc.
 
+@see fts_create_one_index_table()
+
 @return dict_index_t structure for the fts sort index */
 UNIV_INTERN
 dict_index_t*
@@ -99,16 +101,12 @@ row_merge_create_fts_sort_index(
 	field->prefix_len = 0;
 	field->col = static_cast<dict_col_t*>(
 		mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
-	field->col->len = FTS_MAX_WORD_LEN;
-
-	if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
-		field->col->mtype = DATA_VARCHAR;
-	} else {
-		field->col->mtype = DATA_VARMYSQL;
-	}
-
 	field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+	field->col->mtype = charset == &my_charset_latin1
+		? DATA_VARCHAR : DATA_VARMYSQL;
 	field->col->mbminmaxlen = idx_field->col->mbminmaxlen;
+	field->col->len = HA_FT_MAXCHARLEN * DATA_MBMAXLEN(field->col->mbminmaxlen);
+
 	field->fixed_len = 0;
 
 	/* Doc ID */
@@ -362,6 +360,7 @@ row_fts_free_pll_merge_buf(
 
 /*********************************************************************//**
 Tokenize incoming text data and add to the sort buffer.
+@see row_merge_buf_encode()
 @return	TRUE if the record passed, FALSE if out of space */
 static
 ibool
@@ -370,8 +369,6 @@ row_merge_fts_doc_tokenize(
 	row_merge_buf_t**	sort_buf,	/*!< in/out: sort buffer */
 	doc_id_t		doc_id,		/*!< in: Doc ID */
 	fts_doc_t*		doc,		/*!< in: Doc to be tokenized */
-	dtype_t*		word_dtype,	/*!< in: data structure for
-						word col */
 	merge_file_t**		merge_file,	/*!< in/out: merge file */
 	ibool			opt_doc_id_size,/*!< in: whether to use 4 bytes
 						instead of 8 bytes integer to
@@ -403,7 +400,7 @@ row_merge_fts_doc_tokenize(
 		ulint		idx = 0;
 		ib_uint32_t	position;
 		ulint           offset = 0;
-		ulint		cur_len = 0;
+		ulint		cur_len;
 		doc_id_t	write_doc_id;
 
 		inc = innobase_mysql_fts_get_token(
@@ -457,14 +454,34 @@ row_merge_fts_doc_tokenize(
 		dfield_set_data(field, t_str.f_str, t_str.f_len);
 		len = dfield_get_len(field);
 
-		field->type.mtype = word_dtype->mtype;
-		field->type.prtype = word_dtype->prtype | DATA_NOT_NULL;
+		dict_col_copy_type(dict_index_get_nth_col(buf->index, 0), &field->type);
+		field->type.prtype |= DATA_NOT_NULL;
+		ut_ad(len <= field->type.len);
 
-		/* Variable length field, set to max size. */
-		field->type.len = FTS_MAX_WORD_LEN;
-		field->type.mbminmaxlen = word_dtype->mbminmaxlen;
+		/* For the temporary file, row_merge_buf_encode() uses
+		1 byte for representing the number of extra_size bytes.
+		This number will always be 1, because for this 3-field index
+		consisting of one variable-size column, extra_size will always
+		be 1 or 2, which can be encoded in one byte.
+
+		The extra_size is 1 byte if the length of the
+		variable-length column is less than 128 bytes or the
+		maximum length is less than 256 bytes. */
+
+		/* One variable length column, word with its lenght less than
+		fts_max_token_size, add one extra size and one extra byte.
+
+		Since the max length for FTS token now is larger than 255,
+		so we will need to signify length byte itself, so only 1 to 128
+		bytes can be used for 1 bytes, larger than that 2 bytes. */
+		if (len < 128 || field->type.len < 256) {
+			/* Extra size is one byte. */
+			cur_len = 2 + len;
+		} else {
+			/* Extra size is two bytes. */
+			cur_len = 3 + len;
+		}
 
-		cur_len += len;
 		dfield_dup(field, buf->heap);
 		field++;
 
@@ -514,20 +531,6 @@ row_merge_fts_doc_tokenize(
 		cur_len += len;
 		dfield_dup(field, buf->heap);
 
-		/* One variable length column, word with its lenght less than
-		fts_max_token_size, add one extra size and one extra byte.
-
-		Since the max length for FTS token now is larger than 255,
-		so we will need to signify length byte itself, so only 1 to 128
-		bytes can be used for 1 bytes, larger than that 2 bytes. */
-		if (t_str.f_len < 128) {
-			/* Extra size is one byte. */
-			cur_len += 2;
-		} else {
-			/* Extra size is two bytes. */
-			cur_len += 3;
-		}
-
 		/* Reserve one byte for the end marker of row_merge_block_t. */
 		if (buf->total_size + data_size[idx] + cur_len
 		    >= srv_sort_buf_size - 1) {
@@ -620,7 +623,6 @@ fts_parallel_tokenization(
 	mem_heap_t*		blob_heap = NULL;
 	fts_doc_t		doc;
 	dict_table_t*		table = psort_info->psort_common->new_table;
-	dtype_t			word_dtype;
 	dict_field_t*		idx_field;
 	fts_tokenize_ctx_t	t_ctx;
 	ulint			retried = 0;
@@ -645,10 +647,6 @@ fts_parallel_tokenization(
 
 	idx_field = dict_index_get_nth_field(
 		psort_info->psort_common->dup->index, 0);
-	word_dtype.prtype = idx_field->col->prtype;
-	word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen;
-	word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0)
-				? DATA_VARCHAR : DATA_VARMYSQL;
 
 	block = psort_info->merge_block;
 	zip_size = dict_table_zip_size(table);
@@ -699,7 +697,6 @@ loop:
 
 		processed = row_merge_fts_doc_tokenize(
 			buf, doc_item->doc_id, &doc,
-			&word_dtype,
 			merge_file, psort_info->psort_common->opt_doc_id_size,
 			&t_ctx);
author	Marko Mäkelä <marko.makela@mariadb.com>	2016-12-05 15:25:59 +0200
committer	Marko Mäkelä <marko.makela@mariadb.com>	2017-01-27 10:19:39 +0200
commit	732672c3044e60fb0d1dfdb466bd3c3d13ea2f8d (patch)
tree	58e19d71f428f99f6c2589929d5923c597b73107
parent	afb461587c0b7dea2e5e70a165e8d4d437c3f964 (diff)
download	mariadb-git-732672c3044e60fb0d1dfdb466bd3c3d13ea2f8d.tar.gz