diff options
Diffstat (limited to 'lisp/character-fold.el')
-rw-r--r-- | lisp/character-fold.el | 240 |
1 files changed, 160 insertions, 80 deletions
diff --git a/lisp/character-fold.el b/lisp/character-fold.el index 0e156c50dde..1e49fe2f0e5 100644 --- a/lisp/character-fold.el +++ b/lisp/character-fold.el @@ -22,65 +22,79 @@ ;;; Code: +(eval-and-compile (put 'character-fold-table 'char-table-extra-slots 1)) (defconst character-fold-table (eval-when-compile - (let* ((equiv (make-char-table 'character-fold-table)) - (table (unicode-property-table-internal 'decomposition)) - (func (char-table-extra-slot table 1))) + (let ((equiv (make-char-table 'character-fold-table)) + (equiv-multi (make-char-table 'character-fold-table)) + (table (unicode-property-table-internal 'decomposition))) + (set-char-table-extra-slot equiv 0 equiv-multi) + ;; Ensure the table is populated. - (map-char-table - (lambda (i v) (when (consp i) (funcall func (car i) v table))) - table) + (let ((func (char-table-extra-slot table 1))) + (map-char-table (lambda (char v) + (when (consp char) + (funcall func (car char) v table))) + table)) ;; Compile a list of all complex characters that each simple ;; character should match. + ;; In summary this loop does 3 things: + ;; - A complex character might be allowed to match its decomp. + ;; - The decomp is allowed to match the complex character. + ;; - A single char of the decomp might be allowed to match the + ;; character. + ;; Some examples in the comments below. (map-char-table - (lambda (i dec) - (when (consp dec) - ;; Discard a possible formatting tag. - (when (symbolp (car dec)) - (setq dec (cdr dec))) + (lambda (char decomp) + (when (consp decomp) ;; Skip trivial cases like ?a decomposing to (?a). - (unless (or (and (eq i (car dec)) - (not (cdr dec)))) - (let ((d dec) - (fold-decomp t) - k found) - (while (and d (not found)) - (setq k (pop d)) - ;; Is k a number or letter, per unicode standard? - (setq found (memq (get-char-code-property k 'general-category) - '(Lu Ll Lt Lm Lo Nd Nl No)))) - (if found - ;; Check if the decomposition has more than one letter, - ;; because then we don't want the first letter to match - ;; the decomposition. - (dolist (k d) - (when (and fold-decomp - (memq (get-char-code-property k 'general-category) - '(Lu Ll Lt Lm Lo Nd Nl No))) - (setq fold-decomp nil))) - ;; If there's no number or letter on the - ;; decomposition, take the first character in it. - (setq found (car-safe dec))) - ;; Finally, we only fold multi-char decomposition if at - ;; least one of the chars is non-spacing (combining). - (when fold-decomp - (setq fold-decomp nil) - (dolist (k dec) - (when (and (not fold-decomp) - (> (get-char-code-property k 'canonical-combining-class) 0)) - (setq fold-decomp t)))) - ;; Add i to the list of characters that k can - ;; represent. Also possibly add its decomposition, so we can - ;; match multi-char representations like (format "a%c" 769) - (when (and found (not (eq i k))) - (let ((chars (cons (char-to-string i) (aref equiv k)))) - (aset equiv k - (if fold-decomp - (cons (apply #'string dec) chars) - chars)))))))) + (unless (and (not (cdr decomp)) + (eq char (car decomp))) + (if (symbolp (car decomp)) + ;; Discard a possible formatting tag. + (setq decomp (cdr decomp)) + ;; If there's no formatting tag, ensure that char matches + ;; its decomp exactly. This is because we want 'ä' to + ;; match 'ä', but we don't want '¹' to match '1'. + (aset equiv char + (cons (apply #'string decomp) + (aref equiv char)))) + + ;; Allow the entire decomp to match char. If decomp has + ;; multiple characters, this is done by adding an entry + ;; to the alist of the first character in decomp. This + ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to + ;; match '¹'. + (let ((make-decomp-match-char + (lambda (decomp char) + (if (cdr decomp) + (aset equiv-multi (car decomp) + (cons (cons (apply #'string (cdr decomp)) + (regexp-quote (string char))) + (aref equiv-multi (car decomp)))) + (aset equiv (car decomp) + (cons (char-to-string char) + (aref equiv (car decomp)))))))) + (funcall make-decomp-match-char decomp char) + ;; Do it again, without the non-spacing characters. + ;; This allows 'a' to match 'ä'. + (let ((simpler-decomp nil) + (found-one nil)) + (dolist (c decomp) + (if (> (get-char-code-property c 'canonical-combining-class) 0) + (setq found-one t) + (push c simpler-decomp))) + (when (and simpler-decomp found-one) + (funcall make-decomp-match-char simpler-decomp char) + ;; Finally, if the decomp only had one spacing + ;; character, we allow this character to match the + ;; decomp. This is to let 'a' match 'ä'. + (unless (cdr simpler-decomp) + (aset equiv (car simpler-decomp) + (cons (apply #'string decomp) + (aref equiv (car simpler-decomp))))))))))) table) ;; Add some manual entries. @@ -93,13 +107,34 @@ ;; Convert the lists of characters we compiled into regexps. (map-char-table - (lambda (i v) (let ((re (regexp-opt (cons (char-to-string i) v)))) - (if (consp i) - (set-char-table-range equiv i re) - (aset equiv i re)))) + (lambda (char dec-list) + (let ((re (regexp-opt (cons (char-to-string char) dec-list)))) + (if (consp char) + (set-char-table-range equiv char re) + (aset equiv char re)))) equiv) equiv)) - "Used for folding characters of the same group during search.") + "Used for folding characters of the same group during search. +This is a char-table with the `character-fold-table' subtype. + +Let us refer to the character in question by char-x. +Each entry is either nil (meaning char-x only matches literally) +or a regexp. This regexp should match anything that char-x can +match by itself \(including char-x). For instance, the default +regexp for the ?+ character is \"[+⁺₊﬩﹢+]\". + +This table also has one extra slot which is also a char-table. +Each entry in the extra slot is an alist used for multi-character +matching (which may be nil). The elements of the alist should +have the form (SUFFIX . OTHER-REGEXP). If the characters after +char-x are equal to SUFFIX, then this combination of char-x + +SUFFIX is allowed to match OTHER-REGEXP. This is in addition to +char-x being allowed to match REGEXP. +For instance, the default alist for ?f includes: + \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\") + (\"i\" . \"fi\") (\"f\" . \"ff\")) + +Exceptionally for the space character (32), ALIST is ignored.") (defun character-fold--make-space-string (n) "Return a string that matches N spaces." @@ -109,35 +144,80 @@ (make-list n (or (aref character-fold-table ?\s) " "))))) ;;;###autoload -(defun character-fold-to-regexp (string &optional _lax) +(defun character-fold-to-regexp (string &optional _lax from) "Return a regexp matching anything that character-folds into STRING. Any character in STRING that has an entry in `character-fold-table' is replaced with that entry (which is a -regexp) and other characters are `regexp-quote'd." +regexp) and other characters are `regexp-quote'd. + +If the resulting regexp would be too long for Emacs to handle, +just return the result of calling `regexp-quote' on STRING. + +FROM is for internal use. It specifies an index in the STRING +from which to start." (let* ((spaces 0) - (chars (mapcar #'identity string)) - (out chars)) - ;; When the user types a space, we want to match the table entry, - ;; but we also want the ?\s to be visible to `search-spaces-regexp'. - ;; See commit message for a longer description. - (while chars - (let ((c (car chars))) - (setcar chars - (cond - ((eq c ?\s) - (setq spaces (1+ spaces)) - nil) - ((> spaces 0) - (prog1 (concat (character-fold--make-space-string spaces) - (or (aref character-fold-table c) - (regexp-quote (string c)))) - (setq spaces 0))) - (t (or (aref character-fold-table c) - (regexp-quote (string c)))))) - (setq chars (cdr chars)))) - (concat (apply #'concat out) - (when (> spaces 0) - (character-fold--make-space-string spaces))))) + (multi-char-table (char-table-extra-slot character-fold-table 0)) + (i (or from 0)) + (end (length string)) + (out nil)) + ;; When the user types a space, we want to match the table entry + ;; for ?\s, which is generally a regexp like "[ ...]". However, + ;; the `search-spaces-regexp' variable doesn't "see" spaces inside + ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)" + ;; instead (to manually expose a space). Furthermore, the lax + ;; search engine acts on a bunch of spaces, not on individual + ;; spaces, so if the string contains sequential spaces like " ", we + ;; need to keep them grouped together like this: "\\( \\|[ ...][ ...]\\)". + (while (< i end) + (pcase (aref string i) + (`?\s (setq spaces (1+ spaces))) + (c (when (> spaces 0) + (push (character-fold--make-space-string spaces) out) + (setq spaces 0)) + (let ((regexp (or (aref character-fold-table c) + (regexp-quote (string c)))) + ;; Long string. The regexp would probably be too long. + (alist (unless (> end 50) + (aref multi-char-table c)))) + (push (let ((matched-entries nil) + (max-length 0)) + (dolist (entry alist) + (let* ((suffix (car entry)) + (len-suf (length suffix))) + (when (eq (compare-strings suffix 0 nil + string (1+ i) (+ i 1 len-suf) + nil) + t) + (push (cons len-suf (cdr entry)) matched-entries) + (setq max-length (max max-length len-suf))))) + ;; If no suffixes matched, just go on. + (if (not matched-entries) + regexp +;;; If N suffixes match, we "branch" out into N+1 executions for the +;;; length of the longest match. This means "fix" will match "fix" but +;;; not "fⅸ", but it's necessary to keep the regexp size from scaling +;;; exponentially. See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html + (let ((subs (substring string (1+ i) (+ i 1 max-length)))) + ;; `i' is still going to inc by 1 below. + (setq i (+ i max-length)) + (concat + "\\(?:" + (mapconcat (lambda (entry) + (let ((length (car entry)) + (suffix-regexp (cdr entry))) + (concat suffix-regexp + (character-fold-to-regexp subs nil length)))) + `((0 . ,regexp) . ,matched-entries) "\\|") + "\\)")))) + out)))) + (setq i (1+ i))) + (when (> spaces 0) + (push (character-fold--make-space-string spaces) out)) + (let ((regexp (apply #'concat (nreverse out)))) + ;; Limited by `MAX_BUF_SIZE' in `regex.c'. + (if (> (length regexp) 5000) + (regexp-quote string) + regexp)))) ;;; Commands provided for completeness. |