diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb')
-rw-r--r-- | storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb | 364 |
1 files changed, 282 insertions, 82 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb index 94ae8600d2b..1c8f8644695 100644 --- a/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb +++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb @@ -14,6 +14,7 @@ module Groonga "offset", "limit", "output_columns", + "use_range_index", ]) def run_body(input) @@ -55,7 +56,26 @@ module Groonga end end + private + def cache_key(input) + key = "logical_range_filter\0" + key << "#{input[:logical_table]}\0" + key << "#{input[:shard_key]}\0" + key << "#{input[:min]}\0" + key << "#{input[:min_border]}\0" + key << "#{input[:max]}\0" + key << "#{input[:max_border]}\0" + key << "#{input[:order]}\0" + key << "#{input[:filter]}\0" + key << "#{input[:offset]}\0" + key << "#{input[:limit]}\0" + key << "#{input[:output_columns]}\0" + key << "#{input[:use_range_index]}\0" + key + end + class ExecuteContext + attr_reader :use_range_index attr_reader :enumerator attr_reader :order attr_reader :filter @@ -68,6 +88,7 @@ module Groonga attr_reader :threshold def initialize(input) @input = input + @use_range_index = parse_use_range_index(@input[:use_range_index]) @enumerator = LogicalEnumerator.new("logical_range_filter", @input) @order = parse_order(@input, :order) @filter = @input[:filter] @@ -93,6 +114,17 @@ module Groonga end private + def parse_use_range_index(use_range_index) + case use_range_index + when "yes" + true + when "no" + false + else + nil + end + end + def parse_order(input, name) order = input[name] return :ascending if order.nil? @@ -123,23 +155,21 @@ module Groonga end def execute - first_table = nil + first_shard = nil enumerator = @context.enumerator + target_range = enumerator.target_range if @context.order == :descending each_method = :reverse_each else each_method = :each end - enumerator.send(each_method) do |table, shard_key, shard_range| - first_table ||= table - next if table.empty? - - shard_executor = ShardExecutor.new(@context, - table, shard_key, shard_range) + enumerator.send(each_method) do |shard, shard_range| + first_shard ||= shard + shard_executor = ShardExecutor.new(@context, shard, shard_range) shard_executor.execute break if @context.current_limit == 0 end - if first_table.nil? + if first_shard.nil? message = "[logical_range_filter] no shard exists: " + "logical_table: <#{enumerator.logical_table}>: " + @@ -148,17 +178,16 @@ module Groonga end if @context.result_sets.empty? result_set = HashTable.create(:flags => ObjectFlags::WITH_SUBREC, - :key_type => first_table) + :key_type => first_shard.table) @context.result_sets << result_set end end end class ShardExecutor - def initialize(context, table, shard_key, shard_range) + def initialize(context, shard, shard_range) @context = context - @table = table - @shard_key = shard_key + @shard = shard @shard_range = shard_range @filter = @context.filter @@ -168,137 +197,279 @@ module Groonga @target_range = @context.enumerator.target_range @cover_type = @target_range.cover_type(@shard_range) - - @expression_builder = RangeExpressionBuilder.new(@shard_key, - @target_range, - @filter) end def execute return if @cover_type == :none + return if @shard.table.empty? + + shard_key = @shard.key + if shard_key.nil? + message = "[logical_range_filter] shard_key doesn't exist: " + + "<#{@shard.key_name}>" + raise InvalidArgument, message + end + + expression_builder = RangeExpressionBuilder.new(shard_key, + @target_range) + expression_builder.filter = @filter - index_info = @shard_key.find_index(Operator::LESS) + index_info = shard_key.find_index(Operator::LESS) if index_info range_index = index_info.index - range_index = nil unless use_range_index?(range_index) + unless use_range_index?(range_index, expression_builder) + range_index = nil + end else range_index = nil end - case @cover_type - when :all - filter_shard_all(range_index) - when :partial_min - if range_index - filter_by_range(range_index, - @target_range.min, @target_range.min_border, - nil, nil) - else - filter_table do |expression| - @expression_builder.build_partial_min(expression) - end - end - when :partial_max - if range_index - filter_by_range(range_index, - nil, nil, - @target_range.max, @target_range.max_border) - else - filter_table do |expression| - @expression_builder.build_partial_max(expression) - end - end - when :partial_min_and_max - if range_index - filter_by_range(range_index, - @target_range.min, @target_range.min_border, - @target_range.max, @target_range.max_border) - else - filter_table do |expression| - @expression_builder.build_partial_min_and_max(expression) - end - end - end + execute_filter(range_index, expression_builder) end private - def use_range_index?(range_index) + def decide_use_range_index(use, reason, line, method) + message = "[logical_range_filter]" + if use + message << "[range-index] " + else + message << "[select] " + end + message << "<#{@shard.table_name}>: " + message << reason + Context.instance.logger.log(Logger::Level::DEBUG, + __FILE__, + line, + method.to_s, + message) + + use + end + + def use_range_index?(range_index, expression_builder) + use_range_index_parameter_message = + "force by use_range_index parameter" + case @context.use_range_index + when true + return decide_use_range_index(true, + use_range_index_parameter_message, + __LINE__, __method__) + when false + return decide_use_range_index(false, + use_range_index_parameter_message, + __LINE__, __method__) + end + + range_index_logical_parameter_message = + "force by range_index logical parameter" + case Parameters.range_index + when :always + return decide_use_range_index(true, + range_index_logical_parameter_message, + __LINE__, __method__) + when :never + return decide_use_range_index(false, + range_index_logical_parameter_message, + __LINE__, __method__) + end + current_limit = @context.current_limit if current_limit < 0 - return false + reason = "limit is negative: <#{current_limit}>" + return decide_use_range_index(false, reason, + __LINE__, __method__) end required_n_records = @context.current_offset + current_limit - max_n_records = @table.size + max_n_records = @shard.table.size if max_n_records <= required_n_records - return false + reason = "the number of required records (#{required_n_records}) " + reason << ">= " + reason << "the number of records in shard (#{max_n_records})" + return decide_use_range_index(false, reason, + __LINE__, __method__) end threshold = @context.threshold if threshold <= 0.0 - return true + reason = "threshold is negative: <#{threshold}>" + return decide_use_range_index(true, reason, + __LINE__, __method__) end if threshold >= 1.0 - return false + reason = "threshold (#{threshold}) >= 1.0" + return decide_use_range_index(false, reason, + __LINE__, __method__) end + table = @shard.table estimated_n_records = 0 case @cover_type when :all if @filter - create_expression(@table) do |expression| - @expression_builder.build_all(expression) - estimated_n_records = expression.estimate_size(@table) + create_expression(table) do |expression| + expression_builder.build_all(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) end else estimated_n_records = max_n_records end when :partial_min - create_expression(@table) do |expression| - @expression_builder.build_partial_min(expression) - estimated_n_records = expression.estimate_size(@table) + create_expression(table) do |expression| + expression_builder.build_partial_min(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) end when :partial_max - create_expression(@table) do |expression| - @expression_builder.build_partial_max(expression) - estimated_n_records = expression.estimate_size(@table) + create_expression(table) do |expression| + expression_builder.build_partial_max(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) end when :partial_min_and_max - create_expression(@table) do |expression| - @expression_builder.build_partial_min_and_max(expression) - estimated_n_records = expression.estimate_size(@table) + create_expression(table) do |expression| + expression_builder.build_partial_min_and_max(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) end end if estimated_n_records <= required_n_records - return false + reason = "the number of required records (#{required_n_records}) " + reason << ">= " + reason << "the number of estimated records (#{estimated_n_records})" + return decide_use_range_index(false, reason, + __LINE__, __method__) end hit_ratio = estimated_n_records / max_n_records.to_f - hit_ratio >= threshold + use_range_index_by_hit_ratio = (hit_ratio >= threshold) + if use_range_index_by_hit_ratio + relation = ">=" + else + relation = "<" + end + reason = "hit ratio " + reason << "(#{hit_ratio}=#{estimated_n_records}/#{max_n_records}) " + reason << "#{relation} threshold (#{threshold})" + decide_use_range_index(use_range_index_by_hit_ratio, reason, + __LINE__, __method__) + end + + def range_index_available_expression?(expression, line, method_name) + nested_reference_vector_column_accessor = + find_nested_reference_vector_column_accessor(expression) + if nested_reference_vector_column_accessor + reason = "nested reference vector column accessor can't be used: " + reason << "<#{nested_reference_vector_column_accessor.name}>" + return decide_use_range_index(false, reason, line, method_name) + end + + selector_only_procedure = find_selector_only_procedure(expression) + if selector_only_procedure + reason = "selector only procedure can't be used: " + reason << "<#{selector_only_procedure.name}>" + return decide_use_range_index(false, reason, line, method_name) + end + + true + end + + def find_nested_reference_vector_column_accessor(expression) + expression.codes.each do |code| + value = code.value + next unless value.is_a?(Accessor) + + sub_accessor = value + while sub_accessor.have_next? + object = sub_accessor.object + return value if object.is_a?(Column) and object.vector? + sub_accessor = sub_accessor.next + end + end + nil + end + + def find_selector_only_procedure(expression) + expression.codes.each do |code| + value = code.value + return value if value.is_a?(Procedure) and value.selector_only? + end + nil + end + + def execute_filter(range_index, expression_builder) + case @cover_type + when :all + filter_shard_all(range_index, expression_builder) + when :partial_min + if range_index + filter_by_range(range_index, expression_builder, + @target_range.min, @target_range.min_border, + nil, nil) + else + filter_table do |expression| + expression_builder.build_partial_min(expression) + end + end + when :partial_max + if range_index + filter_by_range(range_index, expression_builder, + nil, nil, + @target_range.max, @target_range.max_border) + else + filter_table do |expression| + expression_builder.build_partial_max(expression) + end + end + when :partial_min_and_max + if range_index + filter_by_range(range_index, expression_builder, + @target_range.min, @target_range.min_border, + @target_range.max, @target_range.max_border) + else + filter_table do |expression| + expression_builder.build_partial_min_and_max(expression) + end + end + end end - def filter_shard_all(range_index) + def filter_shard_all(range_index, expression_builder) + table = @shard.table if @filter.nil? - if @table.size <= @context.current_offset - @context.current_offset -= @table.size + if table.size <= @context.current_offset + @context.current_offset -= table.size return end if range_index - filter_by_range(range_index, + filter_by_range(range_index, expression_builder, nil, nil, nil, nil) else - sort_result_set(@table) + sort_result_set(table) end else if range_index - filter_by_range(range_index, + filter_by_range(range_index, expression_builder, nil, nil, nil, nil) else filter_table do |expression| - @expression_builder.build_all(expression) + expression_builder.build_all(expression) end end end @@ -313,7 +484,7 @@ module Groonga end end - def filter_by_range(range_index, + def filter_by_range(range_index, expression_builder, min, min_border, max, max_border) lexicon = range_index.domain data_table = range_index.range @@ -336,6 +507,10 @@ module Groonga else options[:limit] = current_limit end + max_n_unmatched_records = + compute_max_n_unmatched_records(data_table.size, + options[:limit]) + options[:max_n_unmatched_records] = max_n_unmatched_records if @filter create_expression(data_table) do |expression| expression.parse(@filter) @@ -349,6 +524,17 @@ module Groonga n_matched_records = index_cursor.select(result_set, options) end end + if n_matched_records == -1 + result_set.close + fallback_message = + "fallback because there are too much unmatched records: " + fallback_message << "<#{max_n_unmatched_records}>" + decide_use_range_index(false, + fallback_message, + __LINE__, __method__) + execute_filter(nil, expression_builder) + return + end end rescue result_set.close @@ -393,10 +579,24 @@ module Groonga flags end + def compute_max_n_unmatched_records(data_table_size, limit) + max_n_unmatched_records = limit * 100 + max_n_sample_records = data_table_size + if max_n_sample_records > 10000 + sample_ratio = 1 / (Math.log(data_table_size) ** 2) + max_n_sample_records = (max_n_sample_records * sample_ratio).ceil + end + if max_n_unmatched_records > max_n_sample_records + max_n_unmatched_records = max_n_sample_records + end + max_n_unmatched_records + end + def filter_table - create_expression(@table) do |expression| + table = @shard.table + create_expression(table) do |expression| yield(expression) - result_set = @table.select(expression) + result_set = table.select(expression) sort_result_set(result_set) end end |