diff options
author | Lorry Tar Creator <lorry-tar-importer@baserock.org> | 2013-03-17 20:07:05 +0000 |
---|---|---|
committer | <> | 2015-07-07 10:18:30 +0000 |
commit | 802da9dd5d4bc18f46a916eedc0c5c1980a15e59 (patch) | |
tree | f78a8637465b7a4c9624fef03d27eb7aeaa779d4 /webhelp/template/search/stemmers/en_stemmer.js | |
parent | 18f63104106b81bf37ca1af774d7be38051e5444 (diff) | |
download | docbook-xsl-802da9dd5d4bc18f46a916eedc0c5c1980a15e59.tar.gz |
Imported from /home/lorry/working-area/delta_docbook-xsl/docbook-xsl-1.78.1.tar.bz2.HEADdocbook-xsl-1.78.1master
Diffstat (limited to 'webhelp/template/search/stemmers/en_stemmer.js')
-rw-r--r-- | webhelp/template/search/stemmers/en_stemmer.js | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/webhelp/template/search/stemmers/en_stemmer.js b/webhelp/template/search/stemmers/en_stemmer.js new file mode 100644 index 0000000..2117c1b --- /dev/null +++ b/webhelp/template/search/stemmers/en_stemmer.js @@ -0,0 +1,234 @@ +// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original +// paper, in +// +// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +// no. 3, pp 130-137, +// +// see also http://www.tartarus.org/~martin/PorterStemmer + +// Release 1 +// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009 + +var stemmer = (function(){ + var step2list = { + "ational" : "ate", + "tional" : "tion", + "enci" : "ence", + "anci" : "ance", + "izer" : "ize", + "bli" : "ble", + "alli" : "al", + "entli" : "ent", + "eli" : "e", + "ousli" : "ous", + "ization" : "ize", + "ation" : "ate", + "ator" : "ate", + "alism" : "al", + "iveness" : "ive", + "fulness" : "ful", + "ousness" : "ous", + "aliti" : "al", + "iviti" : "ive", + "biliti" : "ble", + "logi" : "log" + }, + + step3list = { + "icate" : "ic", + "ative" : "", + "alize" : "al", + "iciti" : "ic", + "ical" : "ic", + "ful" : "", + "ness" : "" + }, + + c = "[^aeiou]", // consonant + v = "[aeiouy]", // vowel + C = c + "[^aeiouy]*", // consonant sequence + V = v + "[aeiou]*", // vowel sequence + + mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0 + meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 + mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 + s_v = "^(" + C + ")?" + v; // vowel in stem + + return function (w) { + var stem, + suffix, + firstch, + re, + re2, + re3, + re4, + origword = w; + + if (w.length < 3) { return w; } + + firstch = w.substr(0,1); + if (firstch == "y") { + w = firstch.toUpperCase() + w.substr(1); + } + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) { w = w.replace(re,"$1$2"); } + else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) { w = w + "e"; } + else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } + else if (re4.test(w)) { w = w + "e"; } + } + } + + // Step 1c + re = new RegExp("^(.+" + c + ")y$"); + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) { + w = stem + step2list[suffix]; + } + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) { + w = stem + step3list[suffix]; + } + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) { + w = stem; + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) { + w = stem; + } + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { + w = stem; + } + } + + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + + if (firstch == "y") { + w = firstch.toLowerCase() + w.substr(1); + } + + // See http://snowball.tartarus.org/algorithms/english/stemmer.html + // "Exceptional forms in general" + var specialWords = { + "skis" : "ski", + "skies" : "sky", + "dying" : "die", + "lying" : "lie", + "tying" : "tie", + "idly" : "idl", + "gently" : "gentl", + "ugly" : "ugli", + "early": "earli", + "only": "onli", + "singly": "singl" + }; + + if(specialWords[origword]){ + w = specialWords[origword]; + } + + if( "sky news howe atlas cosmos bias \ + andes inning outing canning herring \ + earring proceed exceed succeed".indexOf(origword) !== -1 ){ + w = origword; + } + + // Address words overstemmed as gener- + re = /.*generate?s?d?(ing)?$/; + if( re.test(origword) ){ + w = w + 'at'; + } + re = /.*general(ly)?$/; + if( re.test(origword) ){ + w = w + 'al'; + } + re = /.*generic(ally)?$/; + if( re.test(origword) ){ + w = w + 'ic'; + } + re = /.*generous(ly)?$/; + if( re.test(origword) ){ + w = w + 'ous'; + } + // Address words overstemmed as commun- + re = /.*communit(ies)?y?/; + if( re.test(origword) ){ + w = w + 'iti'; + } + + return w; + } +})(); |