#

author: Dave Love <fx@gnu.org> 2002-05-17 17:00:15 +0000
committer: Dave Love <fx@gnu.org> 2002-05-17 17:00:15 +0000
commit: f55d2710b3a2b63af2d90c6f14b532a3d4ce3999 (patch)
tree: 51be77470eb1de158716fac23595e304d0d8849c /lisp/international/utf-8.el
parent: f848daf3d6fc05fe87bd3b0336c2188fbcb59058 (diff)
download: emacs-f55d2710b3a2b63af2d90c6f14b532a3d4ce3999.tar.gz
1 files changed, 0 insertions, 526 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
deleted file mode 100644
index a6c7d1fe9ef..00000000000
--- a/lisp/international/utf-8.el
+++ /dev/null
@@ -1,526 +0,0 @@
-;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
-
-;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
-;; Licensed to the Free Software Foundation.
-;; Copyright (C) 2001 Free Software Foundation, Inc.
-
-;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
-;; Keywords: multilingual, Unicode, UTF-8, i18n
-
-;; This file is part of GNU Emacs.
-
-;; GNU Emacs is free software; you can redistribute it and/or modify
-;; it under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 2, or (at your option)
-;; any later version.
-
-;; GNU Emacs is distributed in the hope that it will be useful,
-;; but WITHOUT ANY WARRANTY; without even the implied warranty of
-;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;; GNU General Public License for more details.
-
-;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING.  If not, write to the
-;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
-
-;;; Commentary:
-
-;; The coding-system `mule-utf-8' basically supports encoding/decoding
-;; of the following character sets to and from UTF-8:
-;;
-;;   ascii
-;;   eight-bit-control
-;;   latin-iso8859-1
-;;   mule-unicode-0100-24ff
-;;   mule-unicode-2500-33ff
-;;   mule-unicode-e000-ffff
-;;
-;; On decoding, Unicode characters that do not fit into the above
-;; character sets are handled as `eight-bit-control' or
-;; `eight-bit-graphic' characters to retain the information about the
-;; original byte sequence.
-;;
-;; Characters from other character sets can be encoded with
-;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
-;; registering the translation with `register-char-codings'.
-
-;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
-
-;;        scalar       |               utf-8
-;;        value        | 1st byte  | 2nd byte  | 3rd byte
-;; --------------------+-----------+-----------+----------
-;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
-;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
-;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
-
-;;; Code:
-
-(defvar ucs-mule-to-mule-unicode (make-translation-table)
-  "Translation table for encoding to `mule-utf-8'.")
-;; Could have been done by ucs-tables loaded before.
-(unless (get 'ucs-mule-to-mule-unicode 'translation-table)
-  (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
-(define-ccl-program ccl-decode-mule-utf-8
-  ;;
-  ;;        charset         | bytes in utf-8 | bytes in emacs
-  ;; -----------------------+----------------+---------------
-  ;;         ascii          |       1        |       1
-  ;; -----------------------+----------------+---------------
-  ;;    eight-bit-control   |       2        |       2
-  ;;    eight-bit-graphic   |       2        |       1
-  ;;     latin-iso8859-1    |       2        |       2
-  ;; -----------------------+----------------+---------------
-  ;; mule-unicode-0100-24ff |       2        |       4
-  ;;        (< 0800)        |                |
-  ;; -----------------------+----------------+---------------
-  ;; mule-unicode-0100-24ff |       3        |       4
-  ;;        (>= 8000)       |                |
-  ;; mule-unicode-2500-33ff |       3        |       4
-  ;; mule-unicode-e000-ffff |       3        |       4
-  ;;
-  ;; Thus magnification factor is two.
-  ;;
-  `(2
-    ((r5 = ,(charset-id 'eight-bit-control))
-     (r6 = ,(charset-id 'eight-bit-graphic))
-     (loop
-      (read r0)
-
-      ;; 1byte encoding, i.e., ascii
-      (if (r0 < #x80)
-	  (write r0)
-
-	;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
-	(if (r0 < #xe0)
-	    ((read r1)
-
-	     (if ((r1 & #b11000000) != #b10000000)
-		 ;; Invalid 2-byte sequence
-		 ((if (r0 < #xa0)
-		      (write-multibyte-character r5 r0)
-		    (write-multibyte-character r6 r0))
-		  (if (r1 < #x80)
-		      (write r1)
-		    (if (r1 < #xa0)
-			(write-multibyte-character r5 r1)
-		      (write-multibyte-character r6 r1))))
-
-	       ((r0 &= #x1f)
-		(r0 <<= 6)
-		(r1 &= #x3f)
-		(r1 += r0)
-		;; Now r1 holds scalar value
-
-		;; eight-bit-control
-		(if (r1 < 160)
-		    ((write-multibyte-character r5 r1))
-
-		  ;; latin-iso8859-1
-		  (if (r1 < 256)
-		      ((r0 = ,(charset-id 'latin-iso8859-1))
-		       (r1 -= 128)
-		       (write-multibyte-character r0 r1))
-
-		    ;; mule-unicode-0100-24ff (< 0800)
-		    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-		     (r1 -= #x0100)
-		     (r2 = (((r1 / 96) + 32) << 7))
-		     (r1 %= 96)
-		     (r1 += (r2 + 32))
-		     (write-multibyte-character r0 r1)))))))
-
-	  ;; 3byte encoding
-	  ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
-	  (if (r0 < #xf0)
-	      ((read r1 r2)
-
-	       ;; This is set to 1 if the encoding is invalid.
-	       (r4 = 0)
-
-	       (r3 = (r1 & #b11000000))
-	       (r3 |= ((r2 >> 2) & #b00110000))
-	       (if (r3 != #b10100000)
-		   (r4 = 1)
-		 ((r3 = ((r0 & #x0f) << 12))
-		  (r3 += ((r1 & #x3f) << 6))
-		  (r3 += (r2 & #x3f))
-		  (if (r3 < #x0800)
-		      (r4 = 1))))
-
-	       (if (r4 != 0)
-		   ;; Invalid 3-byte sequence
-		   ((if (r0 < #xa0)
-			(write-multibyte-character r5 r0)
-		      (write-multibyte-character r6 r0))
-		    (if (r1 < #x80)
-			(write r1)
-		      (if (r1 < #xa0)
-			  (write-multibyte-character r5 r1)
-			(write-multibyte-character r6 r1)))
-		    (if (r2 < #x80)
-			(write r2)
-		      (if (r2 < #xa0)
-			  (write-multibyte-character r5 r2)
-			(write-multibyte-character r6 r2))))
-		 
-		 ;; mule-unicode-0100-24ff (>= 0800)
-		 ((if (r3 < #x2500)
-		      ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-		       (r3 -= #x0100)
-		       (r3 //= 96)
-		       (r1 = (r7 + 32))
-		       (r1 += ((r3 + 32) << 7))
-		       (write-multibyte-character r0 r1))
-		    
-		    ;; mule-unicode-2500-33ff
-		    (if (r3 < #x3400)
-			((r0 = ,(charset-id 'mule-unicode-2500-33ff))
-			 (r3 -= #x2500)
-			 (r3 //= 96)
-			 (r1 = (r7 + 32))
-			 (r1 += ((r3 + 32) << 7))
-			 (write-multibyte-character r0 r1))
-		      
-		      ;; U+3400 .. U+DFFF
-		    ;; keep those bytes as eight-bit-{control|graphic}
-		      (if (r3 < #xe000)
-			  ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
-			   (r3 = r6)
-			   (write-multibyte-character r3 r0)
-			   (if (r1 < #xa0)
-			       (r3 = r5))
-			   (write-multibyte-character r3 r1)
-			   (if (r2 < #xa0)
-			       (r3 = r5)
-			     (r3 = r6))
-			   (write-multibyte-character r3 r2))
-			
-			;; mule-unicode-e000-ffff
-			((r0 = ,(charset-id 'mule-unicode-e000-ffff))
-			 (r3 -= #xe000)
-			 (r3 //= 96)
-			 (r1 = (r7 + 32))
-			 (r1 += ((r3 + 32) << 7))
-			 (write-multibyte-character r0 r1))))))))
-
-	    ;; 4byte encoding
-	    ;; keep those bytes as eight-bit-{control|graphic}
-	    ((read r1 r2 r3)
-	     ;; r0 > #xf0, thus eight-bit-graphic
-	     (write-multibyte-character r6 r0)
-	     (if (r1 < #xa0)
-		 (write-multibyte-character r5 r1)
-	       (write-multibyte-character r6 r1))
-	     (if (r2 < #xa0)
-		 (write-multibyte-character r5 r2)
-	       (write-multibyte-character r6 r2))
-	     (if (r3 < #xa0)
-		 (write-multibyte-character r5 r3)
-	       (write-multibyte-character r6 r3))))))
-
-      (repeat))))
-
-  "CCL program to decode UTF-8.
-Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*.  Encodings of un-representable Unicode characters are
-decoded asis into eight-bit-control and eight-bit-graphic
-characters.")
-
-(define-ccl-program ccl-encode-mule-utf-8
-  `(1
-    ((r5 = -1)
-     (loop
-      (if (r5 < 0)
-	  ((r1 = -1)
-	   (read-multibyte-character r0 r1)
-	   (translate-character ucs-mule-to-mule-unicode r0 r1))
-	(;; We have already done read-multibyte-character.
-	 (r0 = r5)
-	 (r1 = r6)
-	 (r5 = -1)))
-
-      (if (r0 == ,(charset-id 'ascii))
-	  (write r1)
-
-	(if (r0 == ,(charset-id 'latin-iso8859-1))
-	    ;; r1          scalar                  utf-8
-	    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-	    ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
-	    ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
-	    ((r0 = (((r1 & #x40) >> 6) | #xc2))
-	     (r1 &= #x3f)
-	     (r1 |= #x80)
-	     (write r0 r1))
-
-	  (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
-	      ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-	       ;; #x3f80 == (0011 1111 1000 0000)b
-	       (r1 &= #x7f)
-	       (r1 += (r0 + 224))	; 240 == -32 + #x0100
-	       ;; now r1 holds scalar value
-	       (if (r1 < #x0800)
-		   ;; 2byte encoding
-		   ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
-		    ;; #x07c0 == (0000 0111 1100 0000)b
-		    (r1 &= #x3f)
-		    (r1 |= #x80)
-		    (write r0 r1))
-		 ;; 3byte encoding
-		 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
-		  (r2 = ((r1 & #x3f) | #x80))
-		  (r1 &= #x0fc0)
-		  (r1 >>= 6)
-		  (r1 |= #x80)
-		  (write r0 r1 r2))))
-
-	    (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
-		((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-		 (r1 &= #x7f)
-		 (r1 += (r0 + 9440))	; 9440 == -32 + #x2500
-		 (r0 = (((r1 & #xf000) >> 12) | #xe0))
-		 (r2 = ((r1 & #x3f) | #x80))
-		 (r1 &= #x0fc0)
-		 (r1 >>= 6)
-		 (r1 |= #x80)
-		 (write r0 r1 r2))
-
-	      (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
-		  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-		   (r1 &= #x7f)
-		   (r1 += (r0 + 57312))	; 57312 == -160 + #xe000
-		   (r0 = (((r1 & #xf000) >> 12) | #xe0))
-		   (r2 = ((r1 & #x3f) | #x80))
-		   (r1 &= #x0fc0)
-		   (r1 >>= 6)
-		   (r1 |= #x80)
-		   (write r0 r1 r2))
-
-		(if (r0 == ,(charset-id 'eight-bit-control))
-		    ;; r1          scalar                  utf-8
-		    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-		    ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
-		    ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
-		    ((write #xc2)
-		     (write r1))
-
-		  (if (r0 == ,(charset-id 'eight-bit-graphic))
-		      ;; r1          scalar                  utf-8
-		      ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-		      ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
-		      ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
-		      ((write r1)
-		       (r1 = -1)
-		       (read-multibyte-character r0 r1)
-		       (if (r0 != ,(charset-id 'eight-bit-graphic))
-			   (if (r0 != ,(charset-id 'eight-bit-control))
-			       ((r5 = r0)
-				(r6 = r1))))
-		       (if (r5 < 0)
-			   ((read-multibyte-character r0 r2)
-			    (if (r0 != ,(charset-id 'eight-bit-graphic))
-				(if (r0 != ,(charset-id 'eight-bit-control))
-				    ((r5 = r0)
-				     (r6 = r2))))
-			    (if (r5 < 0)
-				(write r1 r2)
-			      (if (r1 < #xa0)
-				  (write r1)
-				((write #xc2)
-				 (write r1)))))))
-
-		    ;; Unsupported character.
-		    ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
-		    ((write #xef)
-		     (write #xbf)
-		     (write #xbd)))))))))
-      (repeat)))
-    (if (r1 >= #xa0)
-	(write r1)
-      (if (r1 >= #x80)
-	  ((write #xc2)
-	   (write r1)))))
-
-  "CCL program to encode into UTF-8.
-Only characters from the charsets ascii, eight-bit-control,
-eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
-Others are encoded as U+FFFD.")
-
-;; Dummy definition so that the CCL can be checked correctly; the
-;; actual data are loaded on demand.
-(unless (boundp 'ucs-mule-8859-to-mule-unicode)	; don't zap it
-  (define-translation-table 'ucs-mule-8859-to-mule-unicode))
-
-(defsubst utf-8-untranslated-to-ucs ()
-  (let ((b1 (char-after))
-	(b2 (char-after (1+ (point))))
-	(b3 (char-after (+ 2 (point))))
-	(b4 (char-after (+ 4 (point)))))
-    (if (and b1 b2 b3)
-	(cond ((< b1 ?\xf0)
-	       (setq b2 (lsh (logand b2 ?\x3f) 6))
-	       (setq b3 (logand b3 ?\x3f))
-	       (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
-	      (b4
-	       (setq b2 (lsh (logand b2 ?\x3f) 12))
-	       (setq b3 (lsh (logand b3 ?\x3f) 6))
-	       (setq b4 (logand b4 ?\x3f))
-	       (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
-						     18)))))))))
-
-(defun utf-8-help-echo (window object position)
-  (format "Untranslated Unicode U+%04X"
-	  (get-char-property position 'untranslated-utf-8 object)))
-
-(defvar utf-8-subst-table nil
-  "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
-
-;; We compose the untranslatable sequences into a single character.
-;; This is infelicitous for editing, because there's currently no
-;; mechanism for treating compositions as atomic, but is OK for
-;; display.  We try to compose an appropriate character from a hash
-;; table of CJK characters to display correctly.  Otherwise we use
-;; U+FFFD.  What we really should have is hash table lookup from CCL
-;; so that we could do this properly.  This function GCs too much.
-(defsubst utf-8-compose ()
-  "Put a suitable composition on an untranslatable sequence.
-Return the sequence's length."
-  (let* ((u (utf-8-untranslated-to-ucs))
-	 (l (and u (if (>= u ?\x10000)
-		       4
-		     3)))
-	 (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
-    (when u
-      (put-text-property (point) (min (point-max) (+ l (point)))
-			 'untranslated-utf-8 u)
-      (unless subst
-	  (put-text-property (point) (min (point-max) (+ l (point)))
-			     'help-echo 'utf-8-help-echo)
-	  (setq subst ?$,3u=(B))
-      (compose-region (point) (+ l (point)) subst)
-      l)))
-
-(defcustom utf-8-compose-scripts nil
-  "*Non-nil means compose various scipts on decoding utf-8 text."
-  :group 'mule
-  :type 'boolean)	; omitted in Emacs 21.1
-
-(defun utf-8-post-read-conversion (length)
-  "Compose untranslated utf-8 sequences into single characters.
-Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
-  (save-excursion
-    ;; Can't do eval-when-compile to insert a multibyte constant
-    ;; version of the string in the loop, since it's always loaded as
-    ;; unibyte from a byte-compiled file.
-    (let ((range (string-as-multibyte "^\341-\377"))) 
-      (while (and (skip-chars-forward
-		   range)
-		  (not (eobp)))
-	(forward-char (utf-8-compose)))))
-  ;; Fixme: Takahashi-san implies it may not work this easily -- needs
-  ;; checking with him.
-  (when (and utf-8-compose-scripts (> length 1))
-    ;; These currently have definitions which cover the relevant
-    ;; Unicodes.  We could avoid loading thai-util &c by checking
-    ;; whether the region contains any characters with the appropriate
-    ;; categories.  There aren't yet Unicode-based rules for Tibetan.
-    (save-excursion (setq length (diacritic-post-read-conversion length)))
-    (save-excursion (setq length (thai-post-read-conversion length)))
-    (save-excursion (setq length (lao-post-read-conversion length)))
-    (save-excursion (setq length (devanagari-post-read-conversion length))))
-  length)
-
-(defun utf-8-pre-write-conversion (beg end)
-  "Semi-dummy pre-write function effectively to autoload ucs-tables."
-  ;; Ensure translation table is loaded.
-  (require 'ucs-tables)
-  ;; Don't do this again.
-  (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
-  nil)
-
-(make-coding-system
- 'mule-utf-8 4 ?u
- "UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are the following, plus others
-which may be included in the translation table
-`ucs-mule-to-mule-unicode':
- ascii
- eight-bit-control
- eight-bit-graphic
- latin-iso8859-1
- latin-iso8859-2
- latin-iso8859-3
- latin-iso8859-4
- cyrillic-iso8859-5
- greek-iso8859-7
- hebrew-iso8859-8
- latin-iso8859-9
- latin-iso8859-14
- latin-iso8859-15
- mule-unicode-0100-24ff
- mule-unicode-2500-33ff
- mule-unicode-e000-ffff
-
-Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences and composed to display as
-a single character.  Emacs characters that can't be encoded to these
-ranges are encoded as U+FFFD."
-
- '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
- '((safe-charsets
-    ascii
-    eight-bit-control
-    eight-bit-graphic
-    latin-iso8859-1
-    latin-iso8859-15
-    latin-iso8859-14
-    latin-iso8859-9
-    hebrew-iso8859-8
-    greek-iso8859-7
-    cyrillic-iso8859-5
-    latin-iso8859-4
-    latin-iso8859-3
-    latin-iso8859-2
-    vietnamese-viscii-lower
-    vietnamese-viscii-upper
-    thai-tis620
-    ipa
-    ethiopic
-    indian-is13194
-    katakana-jisx0201
-    chinese-sisheng
-    lao
-    mule-unicode-0100-24ff
-    mule-unicode-2500-33ff
-    mule-unicode-e000-ffff)
-   (mime-charset . utf-8)
-   (coding-category . coding-category-utf-8)
-   (valid-codes (0 . 255))
-   (pre-write-conversion . utf-8-pre-write-conversion)
-   (post-read-conversion . utf-8-post-read-conversion)))
-
-(define-coding-system-alias 'utf-8 'mule-utf-8)
-
-;; I think this needs special private charsets defined for the
-;; untranslated sequences, if it's going to work well.
-
-;;; (defun utf-8-compose-function (pos to pattern &optional string)
-;;;   (let* ((prop (get-char-property pos 'composition string))
-;;; 	 (l (and prop (- (cadr prop) (car prop)))))
-;;;     (cond ((and l (> l (- to pos)))
-;;; 	   (delete-region pos to))
-;;; 	  ((and (> (char-after pos) 224)
-;;; 		(< (char-after pos) 256)
-;;; 		(save-restriction
-;;; 		  (narrow-to-region pos to)
-;;; 		  (utf-8-compose)))
-;;; 	   t))))
-
-;;; (dotimes (i 96)
-;;;   (aset composition-function-table
-;;; 	(+ 128 i)
-;;; 	`((,(string-as-multibyte "[\200-\237\240-\377]")
-;;; 	   . utf-8-compose-function))))
-
-;;; utf-8.el ends here
author	Dave Love <fx@gnu.org>	2002-05-17 17:00:15 +0000
committer	Dave Love <fx@gnu.org>	2002-05-17 17:00:15 +0000
commit	f55d2710b3a2b63af2d90c6f14b532a3d4ce3999 (patch)
tree	51be77470eb1de158716fac23595e304d0d8849c /lisp/international/utf-8.el
parent	f848daf3d6fc05fe87bd3b0336c2188fbcb59058 (diff)
download	emacs-f55d2710b3a2b63af2d90c6f14b532a3d4ce3999.tar.gz