summaryrefslogtreecommitdiff
path: root/lisp/char-fold.el
diff options
context:
space:
mode:
authorJuri Linkov <juri@linkov.net>2016-05-17 23:55:38 +0300
committerJuri Linkov <juri@linkov.net>2016-05-17 23:55:38 +0300
commit1fe1e0a8e7e3863d36c1a87ca38d2fa39bc25a56 (patch)
tree4e53bb771b1c81d7a4c79c2323c33b7c37a40274 /lisp/char-fold.el
parenta35826dbd44122b97e93d0c67bf658ced0a07ec6 (diff)
downloademacs-1fe1e0a8e7e3863d36c1a87ca38d2fa39bc25a56.tar.gz
* lisp/char-fold.el: Rename from character-fold.el.
* lisp/replace.el (replace-char-fold): Rename from replace-character-fold. * test/automated/char-fold-tests.el: Rename from character-fold-tests.el. http://lists.gnu.org/archive/html/emacs-devel/2015-12/msg00529.html
Diffstat (limited to 'lisp/char-fold.el')
-rw-r--r--lisp/char-fold.el242
1 files changed, 242 insertions, 0 deletions
diff --git a/lisp/char-fold.el b/lisp/char-fold.el
new file mode 100644
index 00000000000..68bea29ea45
--- /dev/null
+++ b/lisp/char-fold.el
@@ -0,0 +1,242 @@
+;;; char-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+;; Maintainer: emacs-devel@gnu.org
+;; Keywords: matching
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
+
+;;; Code:
+
+(eval-and-compile (put 'char-fold-table 'char-table-extra-slots 1))
+
+(defconst char-fold-table
+ (eval-when-compile
+ (let ((equiv (make-char-table 'char-fold-table))
+ (equiv-multi (make-char-table 'char-fold-table))
+ (table (unicode-property-table-internal 'decomposition)))
+ (set-char-table-extra-slot equiv 0 equiv-multi)
+
+ ;; Ensure the table is populated.
+ (let ((func (char-table-extra-slot table 1)))
+ (map-char-table (lambda (char v)
+ (when (consp char)
+ (funcall func (car char) v table)))
+ table))
+
+ ;; Compile a list of all complex characters that each simple
+ ;; character should match.
+ ;; In summary this loop does 3 things:
+ ;; - A complex character might be allowed to match its decomp.
+ ;; - The decomp is allowed to match the complex character.
+ ;; - A single char of the decomp might be allowed to match the
+ ;; character.
+ ;; Some examples in the comments below.
+ (map-char-table
+ (lambda (char decomp)
+ (when (consp decomp)
+ ;; Skip trivial cases like ?a decomposing to (?a).
+ (unless (and (not (cdr decomp))
+ (eq char (car decomp)))
+ (if (symbolp (car decomp))
+ ;; Discard a possible formatting tag.
+ (setq decomp (cdr decomp))
+ ;; If there's no formatting tag, ensure that char matches
+ ;; its decomp exactly. This is because we want 'ä' to
+ ;; match 'ä', but we don't want '¹' to match '1'.
+ (aset equiv char
+ (cons (apply #'string decomp)
+ (aref equiv char))))
+
+ ;; Allow the entire decomp to match char. If decomp has
+ ;; multiple characters, this is done by adding an entry
+ ;; to the alist of the first character in decomp. This
+ ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
+ ;; match '¹'.
+ (let ((make-decomp-match-char
+ (lambda (decomp char)
+ (if (cdr decomp)
+ (aset equiv-multi (car decomp)
+ (cons (cons (apply #'string (cdr decomp))
+ (regexp-quote (string char)))
+ (aref equiv-multi (car decomp))))
+ (aset equiv (car decomp)
+ (cons (char-to-string char)
+ (aref equiv (car decomp))))))))
+ (funcall make-decomp-match-char decomp char)
+ ;; Do it again, without the non-spacing characters.
+ ;; This allows 'a' to match 'ä'.
+ (let ((simpler-decomp nil)
+ (found-one nil))
+ (dolist (c decomp)
+ (if (> (get-char-code-property c 'canonical-combining-class) 0)
+ (setq found-one t)
+ (push c simpler-decomp)))
+ (when (and simpler-decomp found-one)
+ (funcall make-decomp-match-char simpler-decomp char)
+ ;; Finally, if the decomp only had one spacing
+ ;; character, we allow this character to match the
+ ;; decomp. This is to let 'a' match 'ä'.
+ (unless (cdr simpler-decomp)
+ (aset equiv (car simpler-decomp)
+ (cons (apply #'string decomp)
+ (aref equiv (car simpler-decomp)))))))))))
+ table)
+
+ ;; Add some manual entries.
+ (dolist (it '((?\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
+ (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "󠀢" "❮" "❯" "‹" "›")
+ (?` "❛" "‘" "‛" "󠀢" "❮" "‹")))
+ (let ((idx (car it))
+ (chars (cdr it)))
+ (aset equiv idx (append chars (aref equiv idx)))))
+
+ ;; Convert the lists of characters we compiled into regexps.
+ (map-char-table
+ (lambda (char dec-list)
+ (let ((re (regexp-opt (cons (char-to-string char) dec-list))))
+ (if (consp char)
+ (set-char-table-range equiv char re)
+ (aset equiv char re))))
+ equiv)
+ equiv))
+ "Used for folding characters of the same group during search.
+This is a char-table with the `char-fold-table' subtype.
+
+Let us refer to the character in question by char-x.
+Each entry is either nil (meaning char-x only matches literally)
+or a regexp. This regexp should match anything that char-x can
+match by itself \(including char-x). For instance, the default
+regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".
+
+This table also has one extra slot which is also a char-table.
+Each entry in the extra slot is an alist used for multi-character
+matching (which may be nil). The elements of the alist should
+have the form (SUFFIX . OTHER-REGEXP). If the characters after
+char-x are equal to SUFFIX, then this combination of char-x +
+SUFFIX is allowed to match OTHER-REGEXP. This is in addition to
+char-x being allowed to match REGEXP.
+For instance, the default alist for ?f includes:
+ \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\")
+ (\"i\" . \"fi\") (\"f\" . \"ff\"))
+
+Exceptionally for the space character (32), ALIST is ignored.")
+
+(defun char-fold--make-space-string (n)
+ "Return a string that matches N spaces."
+ (format "\\(?:%s\\|%s\\)"
+ (make-string n ?\s)
+ (apply #'concat
+ (make-list n (or (aref char-fold-table ?\s) " ")))))
+
+;;;###autoload
+(defun char-fold-to-regexp (string &optional _lax from)
+ "Return a regexp matching anything that char-folds into STRING.
+Any character in STRING that has an entry in
+`char-fold-table' is replaced with that entry (which is a
+regexp) and other characters are `regexp-quote'd.
+
+If the resulting regexp would be too long for Emacs to handle,
+just return the result of calling `regexp-quote' on STRING.
+
+FROM is for internal use. It specifies an index in the STRING
+from which to start."
+ (let* ((spaces 0)
+ (multi-char-table (char-table-extra-slot char-fold-table 0))
+ (i (or from 0))
+ (end (length string))
+ (out nil))
+ ;; When the user types a space, we want to match the table entry
+ ;; for ?\s, which is generally a regexp like "[ ...]". However,
+ ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
+ ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
+ ;; instead (to manually expose a space). Furthermore, the lax
+ ;; search engine acts on a bunch of spaces, not on individual
+ ;; spaces, so if the string contains sequential spaces like " ", we
+ ;; need to keep them grouped together like this: "\\( \\|[ ...][ ...]\\)".
+ (while (< i end)
+ (pcase (aref string i)
+ (`?\s (setq spaces (1+ spaces)))
+ (c (when (> spaces 0)
+ (push (char-fold--make-space-string spaces) out)
+ (setq spaces 0))
+ (let ((regexp (or (aref char-fold-table c)
+ (regexp-quote (string c))))
+ ;; Long string. The regexp would probably be too long.
+ (alist (unless (> end 50)
+ (aref multi-char-table c))))
+ (push (let ((matched-entries nil)
+ (max-length 0))
+ (dolist (entry alist)
+ (let* ((suffix (car entry))
+ (len-suf (length suffix)))
+ (when (eq (compare-strings suffix 0 nil
+ string (1+ i) (+ i 1 len-suf)
+ nil)
+ t)
+ (push (cons len-suf (cdr entry)) matched-entries)
+ (setq max-length (max max-length len-suf)))))
+ ;; If no suffixes matched, just go on.
+ (if (not matched-entries)
+ regexp
+;;; If N suffixes match, we "branch" out into N+1 executions for the
+;;; length of the longest match. This means "fix" will match "fix" but
+;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
+;;; exponentially. See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
+ (let ((subs (substring string (1+ i) (+ i 1 max-length))))
+ ;; `i' is still going to inc by 1 below.
+ (setq i (+ i max-length))
+ (concat
+ "\\(?:"
+ (mapconcat (lambda (entry)
+ (let ((length (car entry))
+ (suffix-regexp (cdr entry)))
+ (concat suffix-regexp
+ (char-fold-to-regexp subs nil length))))
+ `((0 . ,regexp) . ,matched-entries) "\\|")
+ "\\)"))))
+ out))))
+ (setq i (1+ i)))
+ (when (> spaces 0)
+ (push (char-fold--make-space-string spaces) out))
+ (let ((regexp (apply #'concat (nreverse out))))
+ ;; Limited by `MAX_BUF_SIZE' in `regex.c'.
+ (if (> (length regexp) 5000)
+ (regexp-quote string)
+ regexp))))
+
+
+;;; Commands provided for completeness.
+(defun char-fold-search-forward (string &optional bound noerror count)
+ "Search forward for a char-folded version of STRING.
+STRING is converted to a regexp with `char-fold-to-regexp',
+which is searched for with `re-search-forward'.
+BOUND NOERROR COUNT are passed to `re-search-forward'."
+ (interactive "sSearch: ")
+ (re-search-forward (char-fold-to-regexp string) bound noerror count))
+
+(defun char-fold-search-backward (string &optional bound noerror count)
+ "Search backward for a char-folded version of STRING.
+STRING is converted to a regexp with `char-fold-to-regexp',
+which is searched for with `re-search-backward'.
+BOUND NOERROR COUNT are passed to `re-search-backward'."
+ (interactive "sSearch: ")
+ (re-search-backward (char-fold-to-regexp string) bound noerror count))
+
+(provide 'char-fold)
+
+;;; char-fold.el ends here