1 files changed, 160 insertions, 80 deletions
diff --git a/lisp/character-fold.el b/lisp/character-fold.el
index 0e156c50dde..1e49fe2f0e5 100644
--- a/lisp/character-fold.el
+++ b/lisp/character-fold.el
@@ -22,65 +22,79 @@
 
 ;;; Code:
 
+(eval-and-compile (put 'character-fold-table 'char-table-extra-slots 1))
 
 (defconst character-fold-table
   (eval-when-compile
-    (let* ((equiv (make-char-table 'character-fold-table))
-           (table (unicode-property-table-internal 'decomposition))
-           (func (char-table-extra-slot table 1)))
+    (let ((equiv (make-char-table 'character-fold-table))
+          (equiv-multi (make-char-table 'character-fold-table))
+          (table (unicode-property-table-internal 'decomposition)))
+      (set-char-table-extra-slot equiv 0 equiv-multi)
+
       ;; Ensure the table is populated.
-      (map-char-table
-       (lambda (i v) (when (consp i) (funcall func (car i) v table)))
-       table)
+      (let ((func (char-table-extra-slot table 1)))
+        (map-char-table (lambda (char v)
+                          (when (consp char)
+                            (funcall func (car char) v table)))
+                        table))
 
       ;; Compile a list of all complex characters that each simple
       ;; character should match.
+      ;; In summary this loop does 3 things:
+      ;; - A complex character might be allowed to match its decomp.
+      ;; - The decomp is allowed to match the complex character.
+      ;; - A single char of the decomp might be allowed to match the
+      ;;   character.
+      ;; Some examples in the comments below.
       (map-char-table
-       (lambda (i dec)
-         (when (consp dec)
-           ;; Discard a possible formatting tag.
-           (when (symbolp (car dec))
-             (setq dec (cdr dec)))
+       (lambda (char decomp)
+         (when (consp decomp)
            ;; Skip trivial cases like ?a decomposing to (?a).
-           (unless (or (and (eq i (car dec))
-                            (not  (cdr dec))))
-             (let ((d dec)
-                   (fold-decomp t)
-                   k found)
-               (while (and d (not found))
-                 (setq k (pop d))
-                 ;; Is k a number or letter, per unicode standard?
-                 (setq found (memq (get-char-code-property k 'general-category)
-                                   '(Lu Ll Lt Lm Lo Nd Nl No))))
-               (if found
-                   ;; Check if the decomposition has more than one letter,
-                   ;; because then we don't want the first letter to match
-                   ;; the decomposition.
-                   (dolist (k d)
-                     (when (and fold-decomp
-                                (memq (get-char-code-property k 'general-category)
-                                      '(Lu Ll Lt Lm Lo Nd Nl No)))
-                       (setq fold-decomp nil)))
-                 ;; If there's no number or letter on the
-                 ;; decomposition, take the first character in it.
-                 (setq found (car-safe dec)))
-               ;; Finally, we only fold multi-char decomposition if at
-               ;; least one of the chars is non-spacing (combining).
-               (when fold-decomp
-                 (setq fold-decomp nil)
-                 (dolist (k dec)
-                   (when (and (not fold-decomp)
-                              (> (get-char-code-property k 'canonical-combining-class) 0))
-                     (setq fold-decomp t))))
-               ;; Add i to the list of characters that k can
-               ;; represent. Also possibly add its decomposition, so we can
-               ;; match multi-char representations like (format "a%c" 769)
-               (when (and found (not (eq i k)))
-                 (let ((chars (cons (char-to-string i) (aref equiv k))))
-                   (aset equiv k
-                         (if fold-decomp
-                             (cons (apply #'string dec) chars)
-                           chars))))))))
+           (unless (and (not (cdr decomp))
+                        (eq char (car decomp)))
+             (if (symbolp (car decomp))
+                 ;; Discard a possible formatting tag.
+                 (setq decomp (cdr decomp))
+               ;; If there's no formatting tag, ensure that char matches
+               ;; its decomp exactly.  This is because we want 'ä' to
+               ;; match 'ä', but we don't want '¹' to match '1'.
+               (aset equiv char
+                     (cons (apply #'string decomp)
+                           (aref equiv char))))
+
+             ;; Allow the entire decomp to match char.  If decomp has
+             ;; multiple characters, this is done by adding an entry
+             ;; to the alist of the first character in decomp.  This
+             ;; allows 'ff' to match 'ﬀ', 'ä' to match 'ä', and '1' to
+             ;; match '¹'.
+             (let ((make-decomp-match-char
+                    (lambda (decomp char)
+                      (if (cdr decomp)
+                          (aset equiv-multi (car decomp)
+                                (cons (cons (apply #'string (cdr decomp))
+                                            (regexp-quote (string char)))
+                                      (aref equiv-multi (car decomp))))
+                        (aset equiv (car decomp)
+                              (cons (char-to-string char)
+                                    (aref equiv (car decomp))))))))
+               (funcall make-decomp-match-char decomp char)
+               ;; Do it again, without the non-spacing characters.
+               ;; This allows 'a' to match 'ä'.
+               (let ((simpler-decomp nil)
+                     (found-one nil))
+                 (dolist (c decomp)
+                   (if (> (get-char-code-property c 'canonical-combining-class) 0)
+                       (setq found-one t)
+                     (push c simpler-decomp)))
+                 (when (and simpler-decomp found-one)
+                   (funcall make-decomp-match-char simpler-decomp char)
+                   ;; Finally, if the decomp only had one spacing
+                   ;; character, we allow this character to match the
+                   ;; decomp.  This is to let 'a' match 'ä'.
+                   (unless (cdr simpler-decomp)
+                     (aset equiv (car simpler-decomp)
+                           (cons (apply #'string decomp)
+                                 (aref equiv (car simpler-decomp)))))))))))
        table)
 
       ;; Add some manual entries.
@@ -93,13 +107,34 @@
 
       ;; Convert the lists of characters we compiled into regexps.
       (map-char-table
-       (lambda (i v) (let ((re (regexp-opt (cons (char-to-string i) v))))
-                  (if (consp i)
-                      (set-char-table-range equiv i re)
-                    (aset equiv i re))))
+       (lambda (char dec-list)
+         (let ((re (regexp-opt (cons (char-to-string char) dec-list))))
+           (if (consp char)
+               (set-char-table-range equiv char re)
+             (aset equiv char re))))
        equiv)
       equiv))
-  "Used for folding characters of the same group during search.")
+  "Used for folding characters of the same group during search.
+This is a char-table with the `character-fold-table' subtype.
+
+Let us refer to the character in question by char-x.
+Each entry is either nil (meaning char-x only matches literally)
+or a regexp.  This regexp should match anything that char-x can
+match by itself \(including char-x).  For instance, the default
+regexp for the ?+ character is \"[+⁺₊﬩﹢＋]\".
+
+This table also has one extra slot which is also a char-table.
+Each entry in the extra slot is an alist used for multi-character
+matching (which may be nil).  The elements of the alist should
+have the form (SUFFIX . OTHER-REGEXP).  If the characters after
+char-x are equal to SUFFIX, then this combination of char-x +
+SUFFIX is allowed to match OTHER-REGEXP.  This is in addition to
+char-x being allowed to match REGEXP.
+For instance, the default alist for ?f includes:
+    \((\"fl\" . \"ﬄ\") (\"fi\" . \"ﬃ\")
+     (\"i\" . \"ﬁ\") (\"f\" . \"ﬀ\"))
+
+Exceptionally for the space character (32), ALIST is ignored.")
 
 (defun character-fold--make-space-string (n)
   "Return a string that matches N spaces."
@@ -109,35 +144,80 @@
                  (make-list n (or (aref character-fold-table ?\s) " ")))))
 
 ;;;###autoload
-(defun character-fold-to-regexp (string &optional _lax)
+(defun character-fold-to-regexp (string &optional _lax from)
   "Return a regexp matching anything that character-folds into STRING.
 Any character in STRING that has an entry in
 `character-fold-table' is replaced with that entry (which is a
-regexp) and other characters are `regexp-quote'd."
+regexp) and other characters are `regexp-quote'd.
+
+If the resulting regexp would be too long for Emacs to handle,
+just return the result of calling `regexp-quote' on STRING.
+
+FROM is for internal use.  It specifies an index in the STRING
+from which to start."
   (let* ((spaces 0)
-         (chars (mapcar #'identity string))
-         (out chars))
-    ;; When the user types a space, we want to match the table entry,
-    ;; but we also want the ?\s to be visible to `search-spaces-regexp'.
-    ;; See commit message for a longer description.
-    (while chars
-      (let ((c (car chars)))
-        (setcar chars
-                (cond
-                 ((eq c ?\s)
-                  (setq spaces (1+ spaces))
-                  nil)
-                 ((> spaces 0)
-                  (prog1 (concat (character-fold--make-space-string spaces)
-                                 (or (aref character-fold-table c)
-                                     (regexp-quote (string c))))
-                    (setq spaces 0)))
-                 (t (or (aref character-fold-table c)
-                        (regexp-quote (string c))))))
-        (setq chars (cdr chars))))
-    (concat (apply #'concat out)
-            (when (> spaces 0)
-              (character-fold--make-space-string spaces)))))
+         (multi-char-table (char-table-extra-slot character-fold-table 0))
+         (i (or from 0))
+         (end (length string))
+         (out nil))
+    ;; When the user types a space, we want to match the table entry
+    ;; for ?\s, which is generally a regexp like "[ ...]".  However,
+    ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
+    ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
+    ;; instead (to manually expose a space).  Furthermore, the lax
+    ;; search engine acts on a bunch of spaces, not on individual
+    ;; spaces, so if the string contains sequential spaces like "  ", we
+    ;; need to keep them grouped together like this: "\\(  \\|[ ...][ ...]\\)".
+    (while (< i end)
+      (pcase (aref string i)
+        (`?\s (setq spaces (1+ spaces)))
+        (c (when (> spaces 0)
+             (push (character-fold--make-space-string spaces) out)
+             (setq spaces 0))
+           (let ((regexp (or (aref character-fold-table c)
+                             (regexp-quote (string c))))
+                 ;; Long string.  The regexp would probably be too long.
+                 (alist (unless (> end 50)
+                          (aref multi-char-table c))))
+             (push (let ((matched-entries nil)
+                         (max-length 0))
+                     (dolist (entry alist)
+                       (let* ((suffix (car entry))
+                              (len-suf (length suffix)))
+                         (when (eq (compare-strings suffix 0 nil
+                                                    string (1+ i) (+ i 1 len-suf)
+                                                    nil)
+                                   t)
+                           (push (cons len-suf (cdr entry)) matched-entries)
+                           (setq max-length (max max-length len-suf)))))
+                     ;; If no suffixes matched, just go on.
+                     (if (not matched-entries)
+                         regexp
+;;; If N suffixes match, we "branch" out into N+1 executions for the
+;;; length of the longest match.  This means "fix" will match "ﬁx" but
+;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
+;;; exponentially.  See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
+                       (let ((subs (substring string (1+ i) (+ i 1 max-length))))
+                         ;; `i' is still going to inc by 1 below.
+                         (setq i (+ i max-length))
+                         (concat
+                          "\\(?:"
+                          (mapconcat (lambda (entry)
+                                       (let ((length (car entry))
+                                             (suffix-regexp (cdr entry)))
+                                         (concat suffix-regexp
+                                                 (character-fold-to-regexp subs nil length))))
+                                     `((0 . ,regexp) . ,matched-entries) "\\|")
+                          "\\)"))))
+                   out))))
+      (setq i (1+ i)))
+    (when (> spaces 0)
+      (push (character-fold--make-space-string spaces) out))
+    (let ((regexp (apply #'concat (nreverse out))))
+      ;; Limited by `MAX_BUF_SIZE' in `regex.c'.
+      (if (> (length regexp) 5000)
+          (regexp-quote string)
+        regexp))))
 
 
 ;;; Commands provided for completeness.