Sort perlapi, perlintern better

This changes the dictionary sort to be closer to a standard dictionary sort. Underscores and digits collate to after all other characters, and sequences of digits collate numerically even if one sequence is shorter than the other. Unless a tie breaker is needed, non-trailing underscores and digits are ignored. The result is I8 comes before I16; IV before I8; __ASSERT_ before ASSUME, etc. The results look more intuitive to me, but could be tweaked if advisable.
author: Karl Williamson <khw@cpan.org> 2022-06-24 20:32:22 -0600
committer: Karl Williamson <khw@cpan.org> 2022-07-01 21:34:26 -0600
commit: 431ea43db26622d369b66e8ebcbf5abb1b899c0d (patch)
tree: 66b5b2c23918f482f186ca8b15f5993b28bd19be /autodoc.pl
parent: 409173239aadc465992374048dfa7b54441ce82e (diff)
download: perl-431ea43db26622d369b66e8ebcbf5abb1b899c0d.tar.gz
1 files changed, 60 insertions, 5 deletions
diff --git a/autodoc.pl b/autodoc.pl
index 8421d90794..5bbf5d3176 100644
--- a/autodoc.pl
+++ b/autodoc.pl
@@ -1632,11 +1632,66 @@ sub construct_missings_section {
 }
 
 sub dictionary_order {
-    # Do a case-insensitive dictionary sort, with only alphabetics
-    # significant, falling back to using everything for determinancy
-    return (uc($a =~ s/[[:^alpha:]]//r) cmp uc($b =~ s/[[:^alpha:]]//r))
-           || uc($a) cmp uc($b)
-           || $a cmp $b;
+    # Do a case-insensitive dictionary sort, falling back in stages to using
+    # everything for determinancy.  The initial comparison ignores
+    # all non-word characters and non-trailing underscores and digits, with
+    # trailing ones collating to after any other characters.  This collation
+    # order continues in case tie breakers are needed; sequences of digits
+    # that do get looked at always compare numerically.  The first tie
+    # breaker takes all digits and underscores into account.  The next tie
+    # breaker uses a caseless character-by-character comparison of everything
+    # (including non-word characters).  Finally is a cased comparison.
+    #
+    # This gives intuitive results, but obviously could be tweaked.
+
+    no warnings 'non_unicode';
+
+    local $a = $a;
+    local $b = $b;
+
+    # Convert all digit sequences to same length with leading zeros, so for
+    # example, 8 will compare less than 16 (using a fill length value that
+    # should be longer than any sequence in the input).
+    $a =~ s/(\d+)/sprintf "%06d", $1/ge;
+    $b =~ s/(\d+)/sprintf "%06d", $1/ge;
+
+    # Translate any underscores and digits so they compare after all Unicode
+    # characters
+    $a =~ tr[_0-9]/\x{110000}-\x{11000A}/;
+    $b =~ tr[_0-9]/\x{110000}-\x{11000A}/;
+
+    use feature 'state';
+    # Modify \w, \W to reflect the changes.
+    state $ud = '\x{110000}-\x{11000A}';    # xlated underscore, digits
+    state $w = "\\w$ud";                    # new \w string
+    state $mod_w = qr/[$w]/;
+    state $mod_W = qr/[^$w]/;
+
+    # Only \w for initial comparison
+    my $a_only_word = uc($a =~ s/$mod_W//gr);
+    my $b_only_word = uc($b =~ s/$mod_W//gr);
+
+    # And not initial nor interior underscores nor digits (by squeezing them
+    # out)
+    my $a_stripped = $a_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx;
+    my $b_stripped = $b_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx;
+
+    # If the stripped versions differ, use that as the comparison.
+    my $cmp = $a_stripped cmp $b_stripped;
+    return $cmp if $cmp;
+
+    # For the first tie breaker, repeat, but consider initial and interior
+    # underscores and digits, again having those compare after all Unicode
+    # characters
+    $cmp = $a_only_word cmp $b_only_word;
+    return $cmp if $cmp;
+
+    # Next tie breaker is just a caseless comparison
+    $cmp = uc($a) cmp uc($b);
+    return $cmp if $cmp;
+
+    # Finally a straight comparison
+    return $a cmp $b;
 }
 
 sub output {
author	Karl Williamson <khw@cpan.org>	2022-06-24 20:32:22 -0600
committer	Karl Williamson <khw@cpan.org>	2022-07-01 21:34:26 -0600
commit	431ea43db26622d369b66e8ebcbf5abb1b899c0d (patch)
tree	66b5b2c23918f482f186ca8b15f5993b28bd19be /autodoc.pl
parent	409173239aadc465992374048dfa7b54441ce82e (diff)
download	perl-431ea43db26622d369b66e8ebcbf5abb1b899c0d.tar.gz