diff options
Diffstat (limited to 'test/racc/assets/namae.y')
-rw-r--r-- | test/racc/assets/namae.y | 302 |
1 files changed, 302 insertions, 0 deletions
diff --git a/test/racc/assets/namae.y b/test/racc/assets/namae.y new file mode 100644 index 0000000000..0378345fef --- /dev/null +++ b/test/racc/assets/namae.y @@ -0,0 +1,302 @@ +# -*- ruby -*- +# vi: set ft=ruby : + +# Copyright (C) 2012 President and Fellows of Harvard College +# Copyright (C) 2013-2014 Sylvester Keil +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of the copyright holder. + +class Namae::Parser + +token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX + +expect 0 + +rule + + names : { result = [] } + | name { result = [val[0]] } + | names AND name { result = val[0] << val[2] } + + name : word { result = Name.new(:given => val[0]) } + | display_order + | honorific word { result = val[0].merge(:family => val[1]) } + | honorific display_order { result = val[1].merge(val[0]) } + | sort_order + + honorific : APPELLATION { result = Name.new(:appellation => val[0]) } + | TITLE { result = Name.new(:title => val[0]) } + + display_order : u_words word opt_suffices opt_titles + { + result = Name.new(:given => val[0], :family => val[1], + :suffix => val[2], :title => val[3]) + } + | u_words NICK last opt_suffices opt_titles + { + result = Name.new(:given => val[0], :nick => val[1], + :family => val[2], :suffix => val[3], :title => val[4]) + } + | u_words NICK von last opt_suffices opt_titles + { + result = Name.new(:given => val[0], :nick => val[1], + :particle => val[2], :family => val[3], + :suffix => val[4], :title => val[5]) + } + | u_words von last + { + result = Name.new(:given => val[0], :particle => val[1], + :family => val[2]) + } + | von last + { + result = Name.new(:particle => val[0], :family => val[1]) + } + + sort_order : last COMMA first + { + result = Name.new({ :family => val[0], :suffix => val[2][0], + :given => val[2][1] }, !!val[2][0]) + } + | von last COMMA first + { + result = Name.new({ :particle => val[0], :family => val[1], + :suffix => val[3][0], :given => val[3][1] }, !!val[3][0]) + } + | u_words von last COMMA first + { + result = Name.new({ :particle => val[0,2].join(' '), :family => val[2], + :suffix => val[4][0], :given => val[4][1] }, !!val[4][0]) + } + ; + + von : LWORD + | von LWORD { result = val.join(' ') } + | von u_words LWORD { result = val.join(' ') } + + last : LWORD | u_words + + first : opt_words { result = [nil,val[0]] } + | words opt_comma suffices { result = [val[2],val[0]] } + | suffices { result = [val[0],nil] } + | suffices COMMA words { result = [val[0],val[2]] } + + u_words : u_word + | u_words u_word { result = val.join(' ') } + + u_word : UWORD | PWORD + + words : word + | words word { result = val.join(' ') } + + opt_comma : /* empty */ | COMMA + opt_words : /* empty */ | words + + word : LWORD | UWORD | PWORD + + opt_suffices : /* empty */ | suffices + + suffices : SUFFIX + | suffices SUFFIX { result = val.join(' ') } + + opt_titles : /* empty */ | titles + + titles : TITLE + | titles TITLE { result = val.join(' ') } + +---- header +require 'singleton' +require 'strscan' + +---- inner + + include Singleton + + attr_reader :options, :input + + def initialize + @input, @options = StringScanner.new(''), { + :debug => false, + :prefer_comma_as_separator => false, + :comma => ',', + :stops => ',;', + :separator => /\s*(\band\b|\&|;)\s*/i, + :title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d)\.?)(\s+|$)/i, + :suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/, + :appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i + } + end + + def debug? + options[:debug] || ENV['DEBUG'] + end + + def separator + options[:separator] + end + + def comma + options[:comma] + end + + def stops + options[:stops] + end + + def title + options[:title] + end + + def suffix + options[:suffix] + end + + def appellation + options[:appellation] + end + + def prefer_comma_as_separator? + options[:prefer_comma_as_separator] + end + + def parse(input) + parse!(input) + rescue => e + warn e.message if debug? + [] + end + + def parse!(string) + input.string = normalize(string) + reset + do_parse + end + + def normalize(string) + string = string.strip + string + end + + def reset + @commas, @words, @initials, @suffices, @yydebug = 0, 0, 0, 0, debug? + self + end + + private + + def stack + @vstack || @racc_vstack || [] + end + + def last_token + stack[-1] + end + + def consume_separator + return next_token if seen_separator? + @commas, @words, @initials, @suffices = 0, 0, 0, 0 + [:AND, :AND] + end + + def consume_comma + @commas += 1 + [:COMMA, :COMMA] + end + + def consume_word(type, word) + @words += 1 + + case type + when :UWORD + @initials += 1 if word =~ /^[[:upper:]]+\b/ + when :SUFFIX + @suffices += 1 + end + + [type, word] + end + + def seen_separator? + !stack.empty? && last_token == :AND + end + + def suffix? + !@suffices.zero? || will_see_suffix? + end + + def will_see_suffix? + input.peek(8).to_s.strip.split(/\s+/)[0] =~ suffix + end + + def will_see_initial? + input.peek(6).to_s.strip.split(/\s+/)[0] =~ /^[[:upper:]]+\b/ + end + + def seen_full_name? + prefer_comma_as_separator? && @words > 1 && + (@initials > 0 || !will_see_initial?) && !will_see_suffix? + end + + def next_token + case + when input.nil?, input.eos? + nil + when input.scan(separator) + consume_separator + when input.scan(/\s*#{comma}\s*/) + if @commas.zero? && !seen_full_name? || @commas == 1 && suffix? + consume_comma + else + consume_separator + end + when input.scan(/\s+/) + next_token + when input.scan(title) + consume_word(:TITLE, input.matched.strip) + when input.scan(suffix) + consume_word(:SUFFIX, input.matched.strip) + when input.scan(appellation) + [:APPELLATION, input.matched.strip] + when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/) + consume_word(:UWORD, input.matched) + when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/) + consume_word(:LWORD, input.matched) + when input.scan(/(\\\w+)?\{[^\}]*\}[^\s#{stops}]*/) + consume_word(:PWORD, input.matched) + when input.scan(/('[^'\n]+')|("[^"\n]+")/) + consume_word(:NICK, input.matched[1...-1]) + else + raise ArgumentError, + "Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}" + end + end + + def on_error(tid, value, stack) + raise ArgumentError, + "Failed to parse name: unexpected '#{value}' at #{stack.inspect}" + end + +# -*- racc -*- |