Imported from /home/lorry/working-area/delta_docbook-xsl/docbook-xsl-1.78.1.tar.bz2.HEAD docbook-xsl-1.78.1 master

author: Lorry Tar Creator <lorry-tar-importer@baserock.org> 2013-03-17 20:07:05 +0000
committer: <> 2015-07-07 10:18:30 +0000
commit: 802da9dd5d4bc18f46a916eedc0c5c1980a15e59 (patch)
tree: f78a8637465b7a4c9624fef03d27eb7aeaa779d4 /webhelp/template/search/stemmers/en_stemmer.js
parent: 18f63104106b81bf37ca1af774d7be38051e5444 (diff)
download: docbook-xsl-802da9dd5d4bc18f46a916eedc0c5c1980a15e59.tar.gz
1 files changed, 234 insertions, 0 deletions
diff --git a/webhelp/template/search/stemmers/en_stemmer.js b/webhelp/template/search/stemmers/en_stemmer.js
new file mode 100644
index 0000000..2117c1b
--- /dev/null
+++ b/webhelp/template/search/stemmers/en_stemmer.js
@@ -0,0 +1,234 @@
+// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
+// paper, in
+//
+//  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+//  no. 3, pp 130-137,
+//
+// see also http://www.tartarus.org/~martin/PorterStemmer
+
+// Release 1
+// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009
+
+/**
+ * Porter stemmer for English with a few Snowball-style exceptions added.
+ *
+ * Runs steps 1a-5 of Porter's suffix-stripping algorithm on one word, then
+ * applies three fix-ups: irregular forms with a fixed stem, words that must
+ * be returned untouched, and repairs for the "gener-" and "commun-" families,
+ * which the plain algorithm collapses onto a single stem.
+ *
+ * @param {string} w  Word to stem. No case folding is done here; every rule
+ *                    is written in lower case.
+ * @returns {string}  The stem. Words shorter than three characters are
+ *                    returned unchanged.
+ *
+ * @example
+ * stemmer("relational");  // "relat"
+ * stemmer("skies");       // "sky"
+ */
+var stemmer = (function(){
+	var step2list = {
+			"ational" : "ate",
+			"tional" : "tion",
+			"enci" : "ence",
+			"anci" : "ance",
+			"izer" : "ize",
+			"bli" : "ble",
+			"alli" : "al",
+			"entli" : "ent",
+			"eli" : "e",
+			"ousli" : "ous",
+			"ization" : "ize",
+			"ation" : "ate",
+			"ator" : "ate",
+			"alism" : "al",
+			"iveness" : "ive",
+			"fulness" : "ful",
+			"ousness" : "ous",
+			"aliti" : "al",
+			"iviti" : "ive",
+			"biliti" : "ble",
+			"logi" : "log"
+		},
+
+		step3list = {
+			"icate" : "ic",
+			"ative" : "",
+			"alize" : "al",
+			"iciti" : "ic",
+			"ical" : "ic",
+			"ful" : "",
+			"ness" : ""
+		},
+
+		// See http://snowball.tartarus.org/algorithms/english/stemmer.html
+		// "Exceptional forms in general": irregular word -> fixed stem.
+		specialWords = {
+			"skis" : "ski",
+			"skies" : "sky",
+			"dying" : "die",
+			"lying" : "lie",
+			"tying" : "tie",
+			"idly" : "idl",
+			"gently" : "gentl",
+			"ugly" : "ugli",
+			"early" : "earli",
+			"only" : "onli",
+			"singly" : "singl"
+		},
+
+		// Words returned exactly as given. Kept as a key set so that only whole
+		// words match; a substring search over a joined string would also hit
+		// fragments such as "erring" (inside "herring").
+		invariantWords = {
+			"sky" : true,
+			"news" : true,
+			"howe" : true,
+			"atlas" : true,
+			"cosmos" : true,
+			"bias" : true,
+			"andes" : true,
+			"inning" : true,
+			"outing" : true,
+			"canning" : true,
+			"herring" : true,
+			"earring" : true,
+			"proceed" : true,
+			"exceed" : true,
+			"succeed" : true
+		},
+
+		// [pattern on the original word, suffix appended to the stem]. Meant for
+		// words whose stem was cut back to gener- or commun-.
+		restoreList = [
+			[ /generate?s?d?(ing)?$/, "at" ],
+			[ /general(ly)?$/, "al" ],
+			[ /generic(ally)?$/, "ic" ],
+			[ /generous(ly)?$/, "ous" ],
+			[ /communit(ies|y)$/, "iti" ]
+		],
+
+		c = "[^aeiou]",          // consonant
+		v = "[aeiouy]",          // vowel
+		C = c + "[^aeiouy]*",    // consonant sequence
+		V = v + "[aeiou]*",      // vowel sequence
+
+		// Measure (m) and shape tests, compiled once and shared by all calls.
+		mgr0 = new RegExp("^(" + C + ")?" + V + C),                    // [C]VC... is m>0
+		meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$"),  // [C]VC[V] is m=1
+		mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C),            // [C]VCVC... is m>1
+		s_v = new RegExp("^(" + C + ")?" + v),                         // vowel in stem
+		cvc = new RegExp("^" + C + v + "[^aeiouwxy]$"),                // C-v-c stem, c not w/x/y
+		dbl = new RegExp("([^aeiouylsz])\\1$"),                        // doubled consonant, not l/s/z
+		cy = new RegExp("^(.+" + c + ")y$"),                           // consonant followed by final y
+		hasOwn = Object.prototype.hasOwnProperty;
+
+	return function (w) {
+		var stem,
+			firstch,
+			fp,
+			i,
+			origword = w;
+
+		if (w.length < 3) { return w; }
+
+		// Upper-case an initial y so the vowel class (lower case only) skips it.
+		firstch = w.substr(0,1);
+		if (firstch === "y") {
+			w = firstch.toUpperCase() + w.substr(1);
+		}
+
+		// Step 1a: sses -> ss, ies -> i, and a final s after a non-s is dropped
+		if ((fp = /^(.+?)(ss|i)es$/.exec(w)) || (fp = /^(.+?)([^s])s$/.exec(w))) {
+			w = fp[1] + fp[2];
+		}
+
+		// Step 1b: (m>0) eed -> ee; other words lose ed/ing if the stem has a vowel
+		if ((fp = /^(.+?)eed$/.exec(w))) {
+			if (mgr0.test(fp[1])) {
+				w = w.slice(0, -1);
+			}
+		} else if ((fp = /^(.+?)(ed|ing)$/.exec(w))) {
+			stem = fp[1];
+			if (s_v.test(stem)) {
+				w = stem;
+				// Tidy the stem: restore a final e or undouble the last consonant.
+				if (/(at|bl|iz)$/.test(w)) { w = w + "e"; }
+				else if (dbl.test(w)) { w = w.slice(0, -1); }
+				else if (cvc.test(w)) { w = w + "e"; }
+			}
+		}
+
+		// Step 1c: final y -> i when the letter before it is a consonant
+		if ((fp = cy.exec(w))) {
+			w = fp[1] + "i";
+		}
+
+		// Step 2: (m>0) replace a double suffix using step2list
+		fp = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/.exec(w);
+		if (fp) {
+			stem = fp[1];
+			if (mgr0.test(stem)) {
+				w = stem + step2list[fp[2]];
+			}
+		}
+
+		// Step 3: (m>0) replace or drop a suffix using step3list
+		fp = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/.exec(w);
+		if (fp) {
+			stem = fp[1];
+			if (mgr0.test(stem)) {
+				w = stem + step3list[fp[2]];
+			}
+		}
+
+		// Step 4: (m>1) drop a remaining suffix; "ion" goes only after s or t
+		if ((fp = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/.exec(w))) {
+			if (mgr1.test(fp[1])) {
+				w = fp[1];
+			}
+		} else if ((fp = /^(.+?)(s|t)(ion)$/.exec(w))) {
+			stem = fp[1] + fp[2];
+			if (mgr1.test(stem)) {
+				w = stem;
+			}
+		}
+
+		// Step 5: drop a final e when m>1, or when m=1 and the stem does not
+		// match cvc; afterwards reduce a final ll to l when m>1
+		if ((fp = /^(.+?)e$/.exec(w))) {
+			stem = fp[1];
+			if (mgr1.test(stem) || (meq1.test(stem) && !cvc.test(stem))) {
+				w = stem;
+			}
+		}
+
+		if (/ll$/.test(w) && mgr1.test(w)) {
+			w = w.slice(0, -1);
+		}
+
+		// and turn initial Y back to y
+		if (firstch === "y") {
+			w = firstch.toLowerCase() + w.substr(1);
+		}
+
+		// Own-property checks keep inherited keys such as "constructor" from
+		// being mistaken for exception entries.
+		if (hasOwn.call(specialWords, origword)) {
+			w = specialWords[origword];
+		}
+
+		if (hasOwn.call(invariantWords, origword)) {
+			w = origword;
+		}
+
+		// Address words overstemmed as gener- or commun-
+		for (i = 0; i < restoreList.length; i++) {
+			if (restoreList[i][0].test(origword)) {
+				w = w + restoreList[i][1];
+			}
+		}
+
+		return w;
+	};
+})();
author	Lorry Tar Creator <lorry-tar-importer@baserock.org>	2013-03-17 20:07:05 +0000
committer	<>	2015-07-07 10:18:30 +0000
commit	802da9dd5d4bc18f46a916eedc0c5c1980a15e59 (patch)
tree	f78a8637465b7a4c9624fef03d27eb7aeaa779d4 /webhelp/template/search/stemmers/en_stemmer.js
parent	18f63104106b81bf37ca1af774d7be38051e5444 (diff)
download	docbook-xsl-802da9dd5d4bc18f46a916eedc0c5c1980a15e59.tar.gz