diff options
Diffstat (limited to 'tools/eslint/node_modules/jschardet/src/hebrewprober.js')
-rwxr-xr-x | tools/eslint/node_modules/jschardet/src/hebrewprober.js | 322 |
1 files changed, 0 insertions, 322 deletions
diff --git a/tools/eslint/node_modules/jschardet/src/hebrewprober.js b/tools/eslint/node_modules/jschardet/src/hebrewprober.js deleted file mode 100755 index b2e48cecc5..0000000000 --- a/tools/eslint/node_modules/jschardet/src/hebrewprober.js +++ /dev/null @@ -1,322 +0,0 @@ -/* - * The Original Code is Mozilla Universal charset detector code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 2001 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * António Afonso (antonio.afonso gmail.com) - port to JavaScript - * Mark Pilgrim - port to Python - * Shy Shalom - original C code - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA - * 02110-1301 USA - */ - -!function(jschardet) { - -// This prober doesn't actually recognize a language or a charset. -// It is a helper prober for the use of the Hebrew model probers - -////// General ideas of the Hebrew charset recognition ////// -// -// Four main charsets exist in Hebrew: -// "ISO-8859-8" - Visual Hebrew -// "windows-1255" - Logical Hebrew -// "ISO-8859-8-I" - Logical Hebrew -// "x-mac-hebrew" - ?? Logical Hebrew ?? -// -// Both "ISO" charsets use a completely identical set of code points, whereas -// "windows-1255" and "x-mac-hebrew" are two different proper supersets of -// these code points. windows-1255 defines additional characters in the range -// 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific -// diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. -// x-mac-hebrew defines similar additional code points but with a different -// mapping. -// -// As far as an average Hebrew text with no diacritics is concerned, all four -// charsets are identical with respect to code points. Meaning that for the -// main Hebrew alphabet, all four map the same values to all 27 Hebrew letters -// (including final letters). -// -// The dominant difference between these charsets is their directionality. -// "Visual" directionality means that the text is ordered as if the renderer is -// not aware of a BIDI rendering algorithm. The renderer sees the text and -// draws it from left to right. The text itself when ordered naturally is read -// backwards. A buffer of Visual Hebrew generally looks like so: -// "[last word of first line spelled backwards] [whole line ordered backwards -// and spelled backwards] [first word of first line spelled backwards] -// [end of line] [last word of second line] ... etc' " -// adding punctuation marks, numbers and English text to visual text is -// naturally also "visual" and from left to right. -// -// "Logical" directionality means the text is ordered "naturally" according to -// the order it is read. It is the responsibility of the renderer to display -// the text from right to left. A BIDI algorithm is used to place general -// punctuation marks, numbers and English text in the text. -// -// Texts in x-mac-hebrew are almost impossible to find on the Internet. From -// what little evidence I could find, it seems that its general directionality -// is Logical. -// -// To sum up all of the above, the Hebrew probing mechanism knows about two -// charsets: -// Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are -// backwards while line order is natural. For charset recognition purposes -// the line order is unimportant (In fact, for this implementation, even -// word order is unimportant). -// Logical Hebrew - "windows-1255" - normal, naturally ordered text. -// -// "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be -// specifically identified. -// "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew -// that contain special punctuation marks or diacritics is displayed with -// some unconverted characters showing as question marks. This problem might -// be corrected using another model prober for x-mac-hebrew. Due to the fact -// that x-mac-hebrew texts are so rare, writing another model prober isn't -// worth the effort and performance hit. -// -//////// The Prober //////// -// -// The prober is divided between two SBCharSetProbers and a HebrewProber, -// all of which are managed, created, fed data, inquired and deleted by the -// SBCSGroupProber. The two SBCharSetProbers identify that the text is in -// fact some kind of Hebrew, Logical or Visual. The final decision about which -// one is it is made by the HebrewProber by combining final-letter scores -// with the scores of the two SBCharSetProbers to produce a final answer. -// -// The SBCSGroupProber is responsible for stripping the original text of HTML -// tags, English characters, numbers, low-ASCII punctuation characters, spaces -// and new lines. It reduces any sequence of such characters to a single space. -// The buffer fed to each prober in the SBCS group prober is pure text in -// high-ASCII. -// The two SBCharSetProbers (model probers) share the same language model: -// Win1255Model. -// The first SBCharSetProber uses the model normally as any other -// SBCharSetProber does, to recognize windows-1255, upon which this model was -// built. The second SBCharSetProber is told to make the pair-of-letter -// lookup in the language model backwards. This in practice exactly simulates -// a visual Hebrew model using the windows-1255 logical Hebrew model. -// -// The HebrewProber is not using any language model. All it does is look for -// final-letter evidence suggesting the text is either logical Hebrew or visual -// Hebrew. Disjointed from the model probers, the results of the HebrewProber -// alone are meaningless. HebrewProber always returns 0.00 as confidence -// since it never identifies a charset by itself. Instead, the pointer to the -// HebrewProber is passed to the model probers as a helper "Name Prober". -// When the Group prober receives a positive identification from any prober, -// it asks for the name of the charset identified. If the prober queried is a -// Hebrew model prober, the model prober forwards the call to the -// HebrewProber to make the final decision. In the HebrewProber, the -// decision is made according to the final-letters scores maintained and Both -// model probers scores. The answer is returned in the form of the name of the -// charset identified, either "windows-1255" or "ISO-8859-8". - -jschardet.HebrewProber = function() { - jschardet.CharSetProber.apply(this); - - // windows-1255 / ISO-8859-8 code points of interest - var FINAL_KAF = '\xea' - var NORMAL_KAF = '\xeb' - var FINAL_MEM = '\xed' - var NORMAL_MEM = '\xee' - var FINAL_NUN = '\xef' - var NORMAL_NUN = '\xf0' - var FINAL_PE = '\xf3' - var NORMAL_PE = '\xf4' - var FINAL_TSADI = '\xf5' - var NORMAL_TSADI = '\xf6' - - // Minimum Visual vs Logical final letter score difference. - // If the difference is below this, don't rely solely on the final letter score distance. - var MIN_FINAL_CHAR_DISTANCE = 5 - - // Minimum Visual vs Logical model score difference. - // If the difference is below this, don't rely at all on the model score distance. - var MIN_MODEL_DISTANCE = 0.01 - - var VISUAL_HEBREW_NAME = "ISO-8859-8" - var LOGICAL_HEBREW_NAME = "windows-1255" - var self = this; - - function init() { - self._mLogicalProber = null; - self._mVisualProber = null; - self.reset(); - } - - this.reset = function() { - this._mFinalCharLogicalScore = 0; - this._mFinalCharVisualScore = 0; - // The two last characters seen in the previous buffer, - // mPrev and mBeforePrev are initialized to space in order to simulate a word - // delimiter at the beginning of the data - this._mPrev = " "; - this._mBeforePrev = " "; - // These probers are owned by the group prober. - } - - this.setModelProbers = function(logicalProber, visualProber) { - this._mLogicalProber = logicalProber; - this._mVisualProber = visualProber; - } - - this.isFinal = function(c) { - return [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI].indexOf(c) != -1; - } - - this.isNonFinal = function(c) { - // The normal Tsadi is not a good Non-Final letter due to words like - // 'lechotet' (to chat) containing an apostrophe after the tsadi. This - // apostrophe is converted to a space in FilterWithoutEnglishLetters causing - // the Non-Final tsadi to appear at an end of a word even though this is not - // the case in the original text. - // The letters Pe and Kaf rarely display a related behavior of not being a - // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for - // example legally end with a Non-Final Pe or Kaf. However, the benefit of - // these letters as Non-Final letters outweighs the damage since these words - // are quite rare. - return [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE].indexOf(c) != -1; - } - - this.feed = function(aBuf) { - // Final letter analysis for logical-visual decision. - // Look for evidence that the received buffer is either logical Hebrew or - // visual Hebrew. - // The following cases are checked: - // 1) A word longer than 1 letter, ending with a final letter. This is an - // indication that the text is laid out "naturally" since the final letter - // really appears at the end. +1 for logical score. - // 2) A word longer than 1 letter, ending with a Non-Final letter. In normal - // Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with - // the Non-Final form of that letter. Exceptions to this rule are mentioned - // above in isNonFinal(). This is an indication that the text is laid out - // backwards. +1 for visual score - // 3) A word longer than 1 letter, starting with a final letter. Final letters - // should not appear at the beginning of a word. This is an indication that - // the text is laid out backwards. +1 for visual score. - // - // The visual score and logical score are accumulated throughout the text and - // are finally checked against each other in GetCharSetName(). - // No checking for final letters in the middle of words is done since that case - // is not an indication for either Logical or Visual text. - // - // We automatically filter out all 7-bit characters (replace them with spaces) - // so the word boundary detection works properly. [MAP] - - if( this.getState() == jschardet.Constants.notMe ) { - // Both model probers say it's not them. No reason to continue. - return jschardet.Constants.notMe; - } - - aBuf = this.filterHighBitOnly(aBuf); - - for( var i = 0, cur; i < aBuf.length; i++ ) { - cur = aBuf[i]; - if( cur == " " ) { - // We stand on a space - a word just ended - if( this._mBeforePrev != " " ) { - // next-to-last char was not a space so self._mPrev is not a 1 letter word - if( this.isFinal(this._mPrev) ) { - // case (1) [-2:not space][-1:final letter][cur:space] - this._mFinalCharLogicalScore++; - } else if( this.isNonFinal(this._mPrev) ) { - // case (2) [-2:not space][-1:Non-Final letter][cur:space] - this._mFinalCharVisualScore++; - } - } - } else { - // Not standing on a space - if( this._mBeforePrev == " " && this.isFinal(this._mPrev) && cur != " " ) { - // case (3) [-2:space][-1:final letter][cur:not space] - this._mFinalCharVisualScore++; - } - } - this._mBeforePrev = this._mPrev; - this._mPrev = cur; - } - // Forever detecting, till the end or until both model probers return eNotMe (handled above) - return jschardet.Constants.detecting; - } - - this.getCharsetName = function() { - // Make the decision: is it Logical or Visual? - // If the final letter score distance is dominant enough, rely on it. - var finalsub = this._mFinalCharLogicalScore - this._mFinalCharVisualScore; - if( finalsub >= MIN_FINAL_CHAR_DISTANCE ) { - return LOGICAL_HEBREW_NAME; - } - if( finalsub <= -MIN_FINAL_CHAR_DISTANCE ) { - return VISUAL_HEBREW_NAME; - } - - // It's not dominant enough, try to rely on the model scores instead. - var modelsub = this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence(); - if( modelsub > MIN_MODEL_DISTANCE ) { - return LOGICAL_HEBREW_NAME; - } - if( modelsub < -MIN_MODEL_DISTANCE ) { - return VISUAL_HEBREW_NAME; - } - - // Still no good, back to final letter distance, maybe it'll save the day. - if( finalsub < 0 ) { - return VISUAL_HEBREW_NAME; - } - - // (finalsub > 0 - Logical) or (don't know what to do) default to Logical. - return LOGICAL_HEBREW_NAME; - } - - this.getState = function() { - // Remain active as long as any of the model probers are active. - if( this._mLogicalProber.getState() == jschardet.Constants.notMe && - this._mVisualProber.getState() == jschardet.Constants.notMe ) { - return jschardet.Constants.notMe; - } - return jschardet.Constants.detecting; - } - - init(); -} -jschardet.HebrewProber.prototype = new jschardet.CharSetProber(); - -// https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf -if (!Array.prototype.indexOf) -{ - Array.prototype.indexOf = function(elt /*, from*/) - { - var len = this.length >>> 0; - - var from = Number(arguments[1]) || 0; - from = (from < 0) - ? Math.ceil(from) - : Math.floor(from); - if (from < 0) - from += len; - - for (; from < len; from++) - { - if (from in this && - this[from] === elt) - return from; - } - return -1; - }; -} - -}(require('./init')); |