summaryrefslogtreecommitdiff
path: root/tools/eslint/node_modules/jschardet/src/hebrewprober.js
diff options
context:
space:
mode:
Diffstat (limited to 'tools/eslint/node_modules/jschardet/src/hebrewprober.js')
-rwxr-xr-xtools/eslint/node_modules/jschardet/src/hebrewprober.js322
1 files changed, 0 insertions, 322 deletions
diff --git a/tools/eslint/node_modules/jschardet/src/hebrewprober.js b/tools/eslint/node_modules/jschardet/src/hebrewprober.js
deleted file mode 100755
index b2e48cecc5..0000000000
--- a/tools/eslint/node_modules/jschardet/src/hebrewprober.js
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * The Original Code is Mozilla Universal charset detector code.
- *
- * The Initial Developer of the Original Code is
- * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 2001
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s):
- * António Afonso (antonio.afonso gmail.com) - port to JavaScript
- * Mark Pilgrim - port to Python
- * Shy Shalom - original C code
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- */
-
-!function(jschardet) {
-
-// This prober doesn't actually recognize a language or a charset.
-// It is a helper prober for the use of the Hebrew model probers
-
-////// General ideas of the Hebrew charset recognition //////
-//
-// Four main charsets exist in Hebrew:
-// "ISO-8859-8" - Visual Hebrew
-// "windows-1255" - Logical Hebrew
-// "ISO-8859-8-I" - Logical Hebrew
-// "x-mac-hebrew" - ?? Logical Hebrew ??
-//
-// Both "ISO" charsets use a completely identical set of code points, whereas
-// "windows-1255" and "x-mac-hebrew" are two different proper supersets of
-// these code points. windows-1255 defines additional characters in the range
-// 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
-// diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
-// x-mac-hebrew defines similar additional code points but with a different
-// mapping.
-//
-// As far as an average Hebrew text with no diacritics is concerned, all four
-// charsets are identical with respect to code points. Meaning that for the
-// main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
-// (including final letters).
-//
-// The dominant difference between these charsets is their directionality.
-// "Visual" directionality means that the text is ordered as if the renderer is
-// not aware of a BIDI rendering algorithm. The renderer sees the text and
-// draws it from left to right. The text itself when ordered naturally is read
-// backwards. A buffer of Visual Hebrew generally looks like so:
-// "[last word of first line spelled backwards] [whole line ordered backwards
-// and spelled backwards] [first word of first line spelled backwards]
-// [end of line] [last word of second line] ... etc' "
-// Adding punctuation marks, numbers and English text to visual text is
-// naturally also "visual" and from left to right.
-//
-// "Logical" directionality means the text is ordered "naturally" according to
-// the order it is read. It is the responsibility of the renderer to display
-// the text from right to left. A BIDI algorithm is used to place general
-// punctuation marks, numbers and English text in the text.
-//
-// Texts in x-mac-hebrew are almost impossible to find on the Internet. From
-// what little evidence I could find, it seems that its general directionality
-// is Logical.
-//
-// To sum up all of the above, the Hebrew probing mechanism knows about two
-// charsets:
-// Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
-// backwards while line order is natural. For charset recognition purposes
-// the line order is unimportant (In fact, for this implementation, even
-// word order is unimportant).
-// Logical Hebrew - "windows-1255" - normal, naturally ordered text.
-//
-// "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
-// specifically identified.
-// "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
-// that contains special punctuation marks or diacritics is displayed with
-// some unconverted characters showing as question marks. This problem might
-// be corrected using another model prober for x-mac-hebrew. Due to the fact
-// that x-mac-hebrew texts are so rare, writing another model prober isn't
-// worth the effort and performance hit.
-//
-//////// The Prober ////////
-//
-// The prober is divided between two SBCharSetProbers and a HebrewProber,
-// all of which are managed, created, fed data, inquired and deleted by the
-// SBCSGroupProber. The two SBCharSetProbers identify that the text is in
-// fact some kind of Hebrew, Logical or Visual. The final decision between the
-// two is made by the HebrewProber by combining final-letter scores
-// with the scores of the two SBCharSetProbers to produce a final answer.
-//
-// The SBCSGroupProber is responsible for stripping the original text of HTML
-// tags, English characters, numbers, low-ASCII punctuation characters, spaces
-// and new lines. It reduces any sequence of such characters to a single space.
-// The buffer fed to each prober in the SBCS group prober is pure text in
-// high-ASCII.
-// The two SBCharSetProbers (model probers) share the same language model:
-// Win1255Model.
-// The first SBCharSetProber uses the model normally as any other
-// SBCharSetProber does, to recognize windows-1255, upon which this model was
-// built. The second SBCharSetProber is told to make the pair-of-letter
-// lookup in the language model backwards. This in practice exactly simulates
-// a visual Hebrew model using the windows-1255 logical Hebrew model.
-//
-// The HebrewProber is not using any language model. All it does is look for
-// final-letter evidence suggesting the text is either logical Hebrew or visual
-// Hebrew. Disjointed from the model probers, the results of the HebrewProber
-// alone are meaningless. HebrewProber always returns 0.00 as confidence
-// since it never identifies a charset by itself. Instead, the pointer to the
-// HebrewProber is passed to the model probers as a helper "Name Prober".
-// When the Group prober receives a positive identification from any prober,
-// it asks for the name of the charset identified. If the prober queried is a
-// Hebrew model prober, the model prober forwards the call to the
-// HebrewProber to make the final decision. In the HebrewProber, the
-// decision is made according to the final-letter scores it maintains and both
-// model probers' scores. The answer is returned in the form of the name of the
-// charset identified, either "windows-1255" or "ISO-8859-8".
-
-jschardet.HebrewProber = function() {
- jschardet.CharSetProber.apply(this);
-
- // windows-1255 / ISO-8859-8 code points of interest
- var FINAL_KAF = '\xea'
- var NORMAL_KAF = '\xeb'
- var FINAL_MEM = '\xed'
- var NORMAL_MEM = '\xee'
- var FINAL_NUN = '\xef'
- var NORMAL_NUN = '\xf0'
- var FINAL_PE = '\xf3'
- var NORMAL_PE = '\xf4'
- var FINAL_TSADI = '\xf5'
- var NORMAL_TSADI = '\xf6'
-
- // Minimum Visual vs Logical final letter score difference.
- // If the difference is below this, don't rely solely on the final letter score distance.
- var MIN_FINAL_CHAR_DISTANCE = 5
-
- // Minimum Visual vs Logical model score difference.
- // If the difference is below this, don't rely at all on the model score distance.
- var MIN_MODEL_DISTANCE = 0.01
-
- var VISUAL_HEBREW_NAME = "ISO-8859-8"
- var LOGICAL_HEBREW_NAME = "windows-1255"
- var self = this;
-
- function init() {
- self._mLogicalProber = null;
- self._mVisualProber = null;
- self.reset();
- }
-
- this.reset = function() {
- this._mFinalCharLogicalScore = 0;
- this._mFinalCharVisualScore = 0;
- // The last two characters seen so far (carried over between feed() calls).
- // _mPrev and _mBeforePrev are initialized to space in order to simulate a word
- // delimiter at the beginning of the data
- this._mPrev = " ";
- this._mBeforePrev = " ";
- // The model probers are owned by the group prober, so reset() leaves them untouched.
- }
-
- this.setModelProbers = function(logicalProber, visualProber) { // consulted by getCharsetName() and getState()
- this._mLogicalProber = logicalProber;
- this._mVisualProber = visualProber;
- }
-
- this.isFinal = function(c) { // true when c is one of the five final-form letters
- return [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI].indexOf(c) != -1;
- }
-
- this.isNonFinal = function(c) {
- // The normal Tsadi is not a good Non-Final letter due to words like
- // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
- // apostrophe is converted to a space by the filtering done in feed(), causing
- // the Non-Final tsadi to appear at an end of a word even though this is not
- // the case in the original text.
- // The letters Pe and Kaf rarely display a related behavior of not being a
- // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
- // example legally end with a Non-Final Pe or Kaf. However, the benefit of
- // these letters as Non-Final letters outweighs the damage since these words
- // are quite rare.
- return [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE].indexOf(c) != -1;
- }
-
- this.feed = function(aBuf) {
- // Final letter analysis for logical-visual decision.
- // Look for evidence that the received buffer is either logical Hebrew or
- // visual Hebrew.
- // The following cases are checked:
- // 1) A word longer than 1 letter, ending with a final letter. This is an
- // indication that the text is laid out "naturally" since the final letter
- // really appears at the end. +1 for logical score.
- // 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
- // Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
- // the Non-Final form of that letter. Exceptions to this rule are mentioned
- // above in isNonFinal(). This is an indication that the text is laid out
- // backwards. +1 for visual score
- // 3) A word longer than 1 letter, starting with a final letter. Final letters
- // should not appear at the beginning of a word. This is an indication that
- // the text is laid out backwards. +1 for visual score.
- //
- // The visual score and logical score are accumulated throughout the text and
- // are finally checked against each other in getCharsetName().
- // No checking for final letters in the middle of words is done since that case
- // is not an indication for either Logical or Visual text.
- //
- // We automatically filter out all 7-bit characters (replace them with spaces)
- // so the word boundary detection works properly. [MAP]
-
- if( this.getState() == jschardet.Constants.notMe ) {
- // Both model probers say it's not them. No reason to continue.
- return jschardet.Constants.notMe;
- }
-
- aBuf = this.filterHighBitOnly(aBuf);
-
- for( var i = 0, cur; i < aBuf.length; i++ ) {
- cur = aBuf[i];
- if( cur == " " ) {
- // We stand on a space - a word just ended
- if( this._mBeforePrev != " " ) {
- // next-to-last char was not a space so this._mPrev is not a 1 letter word
- if( this.isFinal(this._mPrev) ) {
- // case (1) [-2:not space][-1:final letter][cur:space]
- this._mFinalCharLogicalScore++;
- } else if( this.isNonFinal(this._mPrev) ) {
- // case (2) [-2:not space][-1:Non-Final letter][cur:space]
- this._mFinalCharVisualScore++;
- }
- }
- } else {
- // Not standing on a space
- if( this._mBeforePrev == " " && this.isFinal(this._mPrev) && cur != " " ) {
- // case (3) [-2:space][-1:final letter][cur:not space]
- this._mFinalCharVisualScore++;
- }
- }
- this._mBeforePrev = this._mPrev;
- this._mPrev = cur;
- }
- // Forever detecting, till the end or until both model probers return notMe (handled above)
- return jschardet.Constants.detecting;
- }
-
- this.getCharsetName = function() {
- // Make the decision: is it Logical or Visual?
- // If the final letter score distance is dominant enough, rely on it.
- var finalsub = this._mFinalCharLogicalScore - this._mFinalCharVisualScore;
- if( finalsub >= MIN_FINAL_CHAR_DISTANCE ) {
- return LOGICAL_HEBREW_NAME;
- }
- if( finalsub <= -MIN_FINAL_CHAR_DISTANCE ) {
- return VISUAL_HEBREW_NAME;
- }
-
- // It's not dominant enough, try to rely on the model scores instead.
- var modelsub = this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence();
- if( modelsub > MIN_MODEL_DISTANCE ) {
- return LOGICAL_HEBREW_NAME;
- }
- if( modelsub < -MIN_MODEL_DISTANCE ) {
- return VISUAL_HEBREW_NAME;
- }
-
- // Still no good, back to final letter distance, maybe it'll save the day.
- if( finalsub < 0 ) {
- return VISUAL_HEBREW_NAME;
- }
-
- // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
- return LOGICAL_HEBREW_NAME;
- }
-
- this.getState = function() { // needs setModelProbers() first: both probers start out null
- // Remain active as long as any of the model probers are active.
- if( this._mLogicalProber.getState() == jschardet.Constants.notMe &&
- this._mVisualProber.getState() == jschardet.Constants.notMe ) {
- return jschardet.Constants.notMe;
- }
- return jschardet.Constants.detecting;
- }
-
- init();
-}
-jschardet.HebrewProber.prototype = new jschardet.CharSetProber();
-
-// Fallback Array.prototype.indexOf, see https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf
-if (!Array.prototype.indexOf) // only installed when no indexOf is already defined
-{
- Array.prototype.indexOf = function(elt /*, from*/)
- {
- var len = this.length >>> 0; // coerce length to an unsigned 32-bit integer
-
- var from = Number(arguments[1]) || 0; // optional start index; missing or NaN becomes 0
- from = (from < 0) // truncate the start index toward zero
- ? Math.ceil(from)
- : Math.floor(from);
- if (from < 0)
- from += len; // a negative start index counts back from the end
-
- for (; from < len; from++)
- {
- if (from in this && // skip indexes that are not present (holes)
- this[from] === elt)
- return from;
- }
- return -1; // not found
- };
-}
-
-}(require('./init'));