diff options
author | Juri Linkov <juri@linkov.net> | 2016-05-17 23:55:38 +0300 |
---|---|---|
committer | Juri Linkov <juri@linkov.net> | 2016-05-17 23:55:38 +0300 |
commit | 1fe1e0a8e7e3863d36c1a87ca38d2fa39bc25a56 (patch) | |
tree | 4e53bb771b1c81d7a4c79c2323c33b7c37a40274 /lisp/char-fold.el | |
parent | a35826dbd44122b97e93d0c67bf658ced0a07ec6 (diff) | |
download | emacs-1fe1e0a8e7e3863d36c1a87ca38d2fa39bc25a56.tar.gz |
* lisp/char-fold.el: Rename from character-fold.el.
* lisp/replace.el (replace-char-fold): Rename from replace-character-fold.
* test/automated/char-fold-tests.el: Rename from character-fold-tests.el.
http://lists.gnu.org/archive/html/emacs-devel/2015-12/msg00529.html
Diffstat (limited to 'lisp/char-fold.el')
-rw-r--r-- | lisp/char-fold.el | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/lisp/char-fold.el b/lisp/char-fold.el new file mode 100644 index 00000000000..68bea29ea45 --- /dev/null +++ b/lisp/char-fold.el @@ -0,0 +1,242 @@ +;;; char-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*- + +;; Copyright (C) 2015-2016 Free Software Foundation, Inc. + +;; Maintainer: emacs-devel@gnu.org +;; Keywords: matching + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. + +;;; Code: + +(eval-and-compile (put 'char-fold-table 'char-table-extra-slots 1)) + +(defconst char-fold-table + (eval-when-compile + (let ((equiv (make-char-table 'char-fold-table)) + (equiv-multi (make-char-table 'char-fold-table)) + (table (unicode-property-table-internal 'decomposition))) + (set-char-table-extra-slot equiv 0 equiv-multi) + + ;; Ensure the table is populated. + (let ((func (char-table-extra-slot table 1))) + (map-char-table (lambda (char v) + (when (consp char) + (funcall func (car char) v table))) + table)) + + ;; Compile a list of all complex characters that each simple + ;; character should match. + ;; In summary this loop does 3 things: + ;; - A complex character might be allowed to match its decomp. + ;; - The decomp is allowed to match the complex character. + ;; - A single char of the decomp might be allowed to match the + ;; character. + ;; Some examples in the comments below. + (map-char-table + (lambda (char decomp) + (when (consp decomp) + ;; Skip trivial cases like ?a decomposing to (?a). + (unless (and (not (cdr decomp)) + (eq char (car decomp))) + (if (symbolp (car decomp)) + ;; Discard a possible formatting tag. + (setq decomp (cdr decomp)) + ;; If there's no formatting tag, ensure that char matches + ;; its decomp exactly. This is because we want 'ä' to + ;; match 'ä', but we don't want '¹' to match '1'. + (aset equiv char + (cons (apply #'string decomp) + (aref equiv char)))) + + ;; Allow the entire decomp to match char. If decomp has + ;; multiple characters, this is done by adding an entry + ;; to the alist of the first character in decomp. This + ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to + ;; match '¹'. + (let ((make-decomp-match-char + (lambda (decomp char) + (if (cdr decomp) + (aset equiv-multi (car decomp) + (cons (cons (apply #'string (cdr decomp)) + (regexp-quote (string char))) + (aref equiv-multi (car decomp)))) + (aset equiv (car decomp) + (cons (char-to-string char) + (aref equiv (car decomp)))))))) + (funcall make-decomp-match-char decomp char) + ;; Do it again, without the non-spacing characters. + ;; This allows 'a' to match 'ä'. + (let ((simpler-decomp nil) + (found-one nil)) + (dolist (c decomp) + (if (> (get-char-code-property c 'canonical-combining-class) 0) + (setq found-one t) + (push c simpler-decomp))) + (when (and simpler-decomp found-one) + (funcall make-decomp-match-char simpler-decomp char) + ;; Finally, if the decomp only had one spacing + ;; character, we allow this character to match the + ;; decomp. This is to let 'a' match 'ä'. + (unless (cdr simpler-decomp) + (aset equiv (car simpler-decomp) + (cons (apply #'string decomp) + (aref equiv (car simpler-decomp))))))))))) + table) + + ;; Add some manual entries. + (dolist (it '((?\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»") + (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "" "❮" "❯" "‹" "›") + (?` "❛" "‘" "‛" "" "❮" "‹"))) + (let ((idx (car it)) + (chars (cdr it))) + (aset equiv idx (append chars (aref equiv idx))))) + + ;; Convert the lists of characters we compiled into regexps. + (map-char-table + (lambda (char dec-list) + (let ((re (regexp-opt (cons (char-to-string char) dec-list)))) + (if (consp char) + (set-char-table-range equiv char re) + (aset equiv char re)))) + equiv) + equiv)) + "Used for folding characters of the same group during search. +This is a char-table with the `char-fold-table' subtype. + +Let us refer to the character in question by char-x. +Each entry is either nil (meaning char-x only matches literally) +or a regexp. This regexp should match anything that char-x can +match by itself \(including char-x). For instance, the default +regexp for the ?+ character is \"[+⁺₊﬩﹢+]\". + +This table also has one extra slot which is also a char-table. +Each entry in the extra slot is an alist used for multi-character +matching (which may be nil). The elements of the alist should +have the form (SUFFIX . OTHER-REGEXP). If the characters after +char-x are equal to SUFFIX, then this combination of char-x + +SUFFIX is allowed to match OTHER-REGEXP. This is in addition to +char-x being allowed to match REGEXP. +For instance, the default alist for ?f includes: + \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\") + (\"i\" . \"fi\") (\"f\" . \"ff\")) + +Exceptionally for the space character (32), ALIST is ignored.") + +(defun char-fold--make-space-string (n) + "Return a string that matches N spaces." + (format "\\(?:%s\\|%s\\)" + (make-string n ?\s) + (apply #'concat + (make-list n (or (aref char-fold-table ?\s) " "))))) + +;;;###autoload +(defun char-fold-to-regexp (string &optional _lax from) + "Return a regexp matching anything that char-folds into STRING. +Any character in STRING that has an entry in +`char-fold-table' is replaced with that entry (which is a +regexp) and other characters are `regexp-quote'd. + +If the resulting regexp would be too long for Emacs to handle, +just return the result of calling `regexp-quote' on STRING. + +FROM is for internal use. It specifies an index in the STRING +from which to start." + (let* ((spaces 0) + (multi-char-table (char-table-extra-slot char-fold-table 0)) + (i (or from 0)) + (end (length string)) + (out nil)) + ;; When the user types a space, we want to match the table entry + ;; for ?\s, which is generally a regexp like "[ ...]". However, + ;; the `search-spaces-regexp' variable doesn't "see" spaces inside + ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)" + ;; instead (to manually expose a space). Furthermore, the lax + ;; search engine acts on a bunch of spaces, not on individual + ;; spaces, so if the string contains sequential spaces like " ", we + ;; need to keep them grouped together like this: "\\( \\|[ ...][ ...]\\)". + (while (< i end) + (pcase (aref string i) + (`?\s (setq spaces (1+ spaces))) + (c (when (> spaces 0) + (push (char-fold--make-space-string spaces) out) + (setq spaces 0)) + (let ((regexp (or (aref char-fold-table c) + (regexp-quote (string c)))) + ;; Long string. The regexp would probably be too long. + (alist (unless (> end 50) + (aref multi-char-table c)))) + (push (let ((matched-entries nil) + (max-length 0)) + (dolist (entry alist) + (let* ((suffix (car entry)) + (len-suf (length suffix))) + (when (eq (compare-strings suffix 0 nil + string (1+ i) (+ i 1 len-suf) + nil) + t) + (push (cons len-suf (cdr entry)) matched-entries) + (setq max-length (max max-length len-suf))))) + ;; If no suffixes matched, just go on. + (if (not matched-entries) + regexp +;;; If N suffixes match, we "branch" out into N+1 executions for the +;;; length of the longest match. This means "fix" will match "fix" but +;;; not "fⅸ", but it's necessary to keep the regexp size from scaling +;;; exponentially. See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html + (let ((subs (substring string (1+ i) (+ i 1 max-length)))) + ;; `i' is still going to inc by 1 below. + (setq i (+ i max-length)) + (concat + "\\(?:" + (mapconcat (lambda (entry) + (let ((length (car entry)) + (suffix-regexp (cdr entry))) + (concat suffix-regexp + (char-fold-to-regexp subs nil length)))) + `((0 . ,regexp) . ,matched-entries) "\\|") + "\\)")))) + out)))) + (setq i (1+ i))) + (when (> spaces 0) + (push (char-fold--make-space-string spaces) out)) + (let ((regexp (apply #'concat (nreverse out)))) + ;; Limited by `MAX_BUF_SIZE' in `regex.c'. + (if (> (length regexp) 5000) + (regexp-quote string) + regexp)))) + + +;;; Commands provided for completeness. +(defun char-fold-search-forward (string &optional bound noerror count) + "Search forward for a char-folded version of STRING. +STRING is converted to a regexp with `char-fold-to-regexp', +which is searched for with `re-search-forward'. +BOUND NOERROR COUNT are passed to `re-search-forward'." + (interactive "sSearch: ") + (re-search-forward (char-fold-to-regexp string) bound noerror count)) + +(defun char-fold-search-backward (string &optional bound noerror count) + "Search backward for a char-folded version of STRING. +STRING is converted to a regexp with `char-fold-to-regexp', +which is searched for with `re-search-backward'. +BOUND NOERROR COUNT are passed to `re-search-backward'." + (interactive "sSearch: ") + (re-search-backward (char-fold-to-regexp string) bound noerror count)) + +(provide 'char-fold) + +;;; char-fold.el ends here |