summaryrefslogtreecommitdiff
path: root/autodoc.pl
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-06-24 20:32:22 -0600
committerKarl Williamson <khw@cpan.org>2022-07-01 21:34:26 -0600
commit431ea43db26622d369b66e8ebcbf5abb1b899c0d (patch)
tree66b5b2c23918f482f186ca8b15f5993b28bd19be /autodoc.pl
parent409173239aadc465992374048dfa7b54441ce82e (diff)
downloadperl-431ea43db26622d369b66e8ebcbf5abb1b899c0d.tar.gz
Sort perlapi, perlintern better
This changes the dictionary sort to be closer to a standard dictionary sort. Underscores and digits collate to after all other characters, and sequences of digits collate numerically even if one sequence is shorter than the other. Unless a tie breaker is needed, non-trailing underscores and digits are ignored. The result is I8 comes before I16; IV before I8; __ASSERT_ before ASSUME, etc. The results look more intuitive to me, but could be tweaked if advisable.
Diffstat (limited to 'autodoc.pl')
-rw-r--r--autodoc.pl65
1 files changed, 60 insertions, 5 deletions
diff --git a/autodoc.pl b/autodoc.pl
index 8421d90794..5bbf5d3176 100644
--- a/autodoc.pl
+++ b/autodoc.pl
@@ -1632,11 +1632,66 @@ sub construct_missings_section {
}
sub dictionary_order {
- # Do a case-insensitive dictionary sort, with only alphabetics
- # significant, falling back to using everything for determinancy
- return (uc($a =~ s/[[:^alpha:]]//r) cmp uc($b =~ s/[[:^alpha:]]//r))
- || uc($a) cmp uc($b)
- || $a cmp $b;
+ # Do a case-insensitive dictionary sort, falling back in stages to using
+ # everything for determinancy. The initial comparison ignores
+ # all non-word characters and non-trailing underscores and digits, with
+ # trailing ones collating to after any other characters. This collation
+ # order continues in case tie breakers are needed; sequences of digits
+ # that do get looked at always compare numerically. The first tie
+ # breaker takes all digits and underscores into account. The next tie
+ # breaker uses a caseless character-by-character comparison of everything
+ # (including non-word characters). Finally is a cased comparison.
+ #
+ # This gives intuitive results, but obviously could be tweaked.
+
+ no warnings 'non_unicode';
+
+ local $a = $a;
+ local $b = $b;
+
+ # Convert all digit sequences to same length with leading zeros, so for
+ # example, 8 will compare less than 16 (using a fill length value that
+ # should be longer than any sequence in the input).
+ $a =~ s/(\d+)/sprintf "%06d", $1/ge;
+ $b =~ s/(\d+)/sprintf "%06d", $1/ge;
+
+ # Translate any underscores and digits so they compare after all Unicode
+ # characters
+ $a =~ tr[_0-9]/\x{110000}-\x{11000A}/;
+ $b =~ tr[_0-9]/\x{110000}-\x{11000A}/;
+
+ use feature 'state';
+ # Modify \w, \W to reflect the changes.
+ state $ud = '\x{110000}-\x{11000A}'; # xlated underscore, digits
+ state $w = "\\w$ud"; # new \w string
+ state $mod_w = qr/[$w]/;
+ state $mod_W = qr/[^$w]/;
+
+ # Only \w for initial comparison
+ my $a_only_word = uc($a =~ s/$mod_W//gr);
+ my $b_only_word = uc($b =~ s/$mod_W//gr);
+
+ # And not initial nor interior underscores nor digits (by squeezing them
+ # out)
+ my $a_stripped = $a_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx;
+ my $b_stripped = $b_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx;
+
+ # If the stripped versions differ, use that as the comparison.
+ my $cmp = $a_stripped cmp $b_stripped;
+ return $cmp if $cmp;
+
+ # For the first tie breaker, repeat, but consider initial and interior
+ # underscores and digits, again having those compare after all Unicode
+ # characters
+ $cmp = $a_only_word cmp $b_only_word;
+ return $cmp if $cmp;
+
+ # Next tie breaker is just a caseless comparison
+ $cmp = uc($a) cmp uc($b);
+ return $cmp if $cmp;
+
+ # Finally a straight comparison
+ return $a cmp $b;
}
sub output {