# Grind out a lot of combinatoric tests for folding.

use charnames ":full";

binmode STDOUT, ":utf8";

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    require './test.pl';
}

use strict;
use warnings;
use Encode;

# Tests both unicode and not, so make sure not implicitly testing unicode
no feature 'unicode_strings';

# Case-insensitive matching is a large and complicated issue.  Perl does not
# implement it fully, properly.  For example, it doesn't include normalization
# as part of the equation.  To test every conceivable combination is clearly
# impossible; these tests are mostly drawn from visual inspection of the code
# and experience, trying to exercise all areas.

# There are three basic ranges of characters that Perl may treat differently:
# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
#    referred to here as ASCII.  On EBCDIC machines, the non-ASCII invariants
#    are all controls that fold to themselves.
my $ASCII = 1;

# 2) Other characters that fit into a byte but are different in utf8 than not;
#    here referred to, taking some liberties, as Latin1.
my $Latin1 = 2;

# 3) Characters that won't fit in a byte; here referred to as Unicode
my $Unicode = 3;

# Within these basic groups are equivalence classes that testing any character
# in is likely to lead to the same results as any other character.  This is
# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
# set.
my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};

sub range_type {
    my $ord = shift;

    return $ASCII if $ord < 128;
    return $Latin1 if $ord < 256;
    return $Unicode;
}

sub numerically {
    return $a <=> $b
}

my %tests;
my %simple_folds;
my %multi_folds;

# First, analyze the current Unicode's folding rules
my %folded_from;
my $file="../lib/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (<$fh>) {
    chomp;

    # Lines look like (though without the initial '#')
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

    my ($line, $comment) = split / \s+ \# \s+ /x, $_;
    next if $line eq "" || substr($line, 0, 1) eq '#';
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    if ($fold_type eq 'F') {
        next;   # XXX TODO multi-char folds
        my $from_range_type = range_type($from);
        @folded = map { hex $_ } @folded;

        # Include three code points that are handled internally by the regex
        # engine specially, plus all non-Unicode multi folds (which actually
        # the only one is already included in the three, but this makes sure)
        # And if any member of the fold is not the same range type as the
        # source, add it directly to the tests.  It needs to be an array of an
        # array, so that it is distinguished from multiple single folds
        if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
            || $from_range_type != $Unicode
            || grep { range_type($_) != $from_range_type } @folded)
        {
            $tests{$from} = [ [ @folded ] ];
        }
        else {

            # Must be Unicode here, so chr is automatically utf8.  Get the
            # number of bytes in each.  This is because the optimizer cares
            # about length differences.
            my $from_length = length encode('utf-8', chr($from));
            my $to_length = length encode('utf-8', pack 'U*', @folded);
            push @{$multi_folds{$from_length}{$to_length}}, { $from => [ @folded ] };
        }
    }

    # Perl only deals with C and F folds
    next if $fold_type ne 'C';

    # C folds are single-char $from to single-char $folded
    push @{$folded_from{hex $folded[0]}}, $from;
}

# Now try to sort the single char folds into equivalence classes of that are
# likely to have identical successes and failures.  Any fold that crosses
# range types is suspect, and is automatically tested.  Otherwise, store by
# the number of characters that participate in a fold.  Likely all folds in a
# range type that fold to each other like B->b->B will have identical success
# and failure; similarly all folds that have three characters participating
# are likely to have the same successes and failures, etc.
foreach my $folded (sort numerically keys %folded_from) {
    my $target_range_type  = range_type($folded);
    my $count = @{$folded_from{$folded}};

    # Automatically test any fold that crosses range types
    if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
    {
        $tests{$folded} = $folded_from{$folded};
    }
    else {
        push @{$simple_folds{$target_range_type}{$count}},
               { $folded => $folded_from{$folded} };
    } 
}

foreach my $from_length (keys %multi_folds) {
    foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
        #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
        foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
            #print __LINE__, ": $from_length, $fold_length, $test:\n";
            my ($target, $pattern) = each %$test;
            #print __LINE__, ": $target: $pattern\n";
            $tests{$target} = $pattern;
            last if $skip_apparently_redundant;
        }
    }
}

# Add in tests for single character folds.  Add tests for each range type,
# and within those tests for each number of characters participating in a
# fold.  Thus B->b has two characters participating.  But K->k and Kelvin
# Sign->k has three characters participating.  So we would make sure that
# there is a test for 3 chars, 4 chars, ... .  (Note that the 'k' example is a
# bad one because it crosses range types, so is automatically tested.  In the
# Unicode range there are various of these 3 and 4 char classes, but aren't as
# easily described as the 'k' one.)
foreach my $type (keys %simple_folds) {
    foreach my $count (keys %{$simple_folds{$type}}) {
        foreach my $test (@{$simple_folds{$type}{$count}}) {
            my ($target, $pattern) = each %$test;
            $tests{$target} = $pattern;
            last if $skip_apparently_redundant;
        }
    }
}

# For each range type, test additionally a character that folds to itself
$tests{0x3A} = [ 0x3A ];
$tests{0xF7} = [ 0xF7 ];
$tests{0x2C7} = [ 0x2C7 ];

my $clump_execs = 10000;    # Speed up by building an 'exec' of many tests
my @eval_tests;

# For use by pairs() in generating combinations
sub prefix {
    my $p = shift;
    map [ $p, $_ ], @_ 
}

# Returns all ordered combinations of pairs of elements from the input array.
# It doesn't return pairs like (a, a), (b, b).  Change the slice to an array
# to do that.  This was just to have fewer tests.
sub pairs (@) { 
    #print __LINE__, ": ", join(" XXX ", @_), "\n";
    map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ 
}


# Finally ready to do the tests
my $count=1;
foreach my $test (sort { numerically } keys %tests) {

  my $previous_target;
  my $previous_pattern;
  my @pairs = pairs(sort numerically $test, @{$tests{$test}});

  # Each fold can be viewed as a closure of all the characters that
  # participate in it.  Look at each possible pairing from a closure, with the
  # first member of the pair the target string to match against, and the
  # second member forming the pattern.  Thus each fold member gets tested as
  # the string, and the pattern with every other member in the opposite role.
  while (my $pair = shift @pairs) {
    my ($target, $pattern) = @$pair;

    # When testing a char that doesn't fold, we can get the same
    # permutation twice; so skip all but the first.
    next if $previous_target
            && $previous_target == $target
            && $previous_pattern == $pattern;
    ($previous_target, $previous_pattern) = ($target, $pattern);

    # Each side may be either a single char or a string.  Extract each into an
    # array (perhaps of length 1)
    my @target, my @pattern;
    @target = (ref $target) ? @$target : $target;
    @pattern = (ref $pattern) ? @$pattern : $pattern;

    # Have to convert non-utf8 chars to native char set
    @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
    @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;

    # Get in hex form.
    my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
    my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;

    my $target_above_latin1 = grep { $_ > 255 } @target;
    my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
    my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];

    # We don't test multi-char folding into other multi-chars.  We are testing
    # a code point that folds to or from other characters.  Find the single
    # code point for diagnostic purposes.  (If both are single, choose the
    # target string)
    my $ord = @target == 1 ? $target[0] : $pattern[0];
    my $progress = sprintf "\"%s\" and /%s/",
                            join("", @x_target),
                            join("", @x_pattern);
    #print $progress, "\n";
    #diag $progress;

    # Now grind out tests, using various combinations.
    # XXX foreach my $charset ('d', 'u', 'l') {
    foreach my $charset ('d', 'u') {
      foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
                                          # code points < 256
        my $upgrade_target = "";

        # These must already be in utf8 because the string to match has
        # something above latin1.  So impossible to test if to not to be in
        # utf8; and otherwise, no upgrade is needed.
        next if $target_above_latin1 && ! $utf8_target;
        $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;

        foreach my $utf8_pattern (0, 1) {
          next if $pattern_above_latin1 && ! $utf8_pattern;
          my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern);
          my $upgrade_pattern = "";
          $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;

          my $lhs = join "", @x_target;
          my @rhs = @x_pattern;
          my $rhs = join "", @rhs;
          my $should_fail = ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self;

          # Do simple tests of referencing capture buffers, named and
          # numbered.
          my $op = '=~';
          $op = '!~' if $should_fail;
          my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
          push @eval_tests, qq[ok(eval '$eval', '$eval')];
          $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
          push @eval_tests, qq[ok(eval '$eval', '$eval')];
          $count += 2;
          if ($lhs ne $rhs) {
            $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
            push @eval_tests, qq[ok(eval '$eval', '$eval')];
            $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
            push @eval_tests, qq[ok(eval '$eval', '$eval')];
            $count += 2;
          }
          #diag $eval_tests[-1];
          #next;

          foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
            foreach my $inverted (0,1) {
                next if $inverted && ! $bracketed;

              # In some cases, add an extra character that doesn't fold, and
              # looks ok in the output.
              my $extra_char = "_";
              foreach my $prepend ("", $extra_char) {
                foreach my $append ("", $extra_char) {
                  # Append a char for after quantifier, as results vary if no
                  # char appended.

                  # Assemble the rhs.  Put each character in a separate
                  # bracketed if using charclasses.  This creates a stress on
                  # the code to span a match across multiple elements
                  my $rhs = "";
                  foreach my $rhs_char (@rhs) {
                      $rhs .= '[' if $bracketed;
                      $rhs .= '^' if $inverted;
                      $rhs .=  $rhs_char;

                      # Add a character to the class, so class doesn't get
                      # optimized out
                      $rhs .= '_]' if $bracketed;
                  }

                  # Add one of: no capturing parens
                  #             a single set
                  #             a nested set
                  # Use quantifiers and extra variable width matches inside
                  # them to keep some optimizations from happening
                  foreach my $parend (0, 1, 2) {
                    my $interior = (! $parend)
                                    ? $rhs
                                    : ($parend == 1)
                                        ? "(${rhs},?)"
                                        : "((${rhs})+,?)";
                    foreach my $quantifier ("", '?', '*', '+', '{1,3}') {

                      # A ? or * quantifier normally causes the thing to be
                      # able to match a null string
                      my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';

                      # But since we only quantify the last character in a
                      # multiple fold, the other characters will have width,
                      # except if we are quantifying the whole rhs
                      my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);

                      foreach my $l_anchor ("", '^') { # '\A' didn't change result)
                        foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)

                          # The folded part can match the null string if it
                          # isn't required to have width, and there's not
                          # something on one or both sides that force it to.
                          my $must_match = ! $can_match_null || ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
                          #next unless $must_match;
                          my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
                          my $op;
                          if ($must_match && $should_fail)  {
                              $op = 0;
                          } else {
                              $op = 1;
                          }
                          $op = ! $op if $must_match && $inverted;
                          $op = ($op) ? '=~' : '!~';

                          my $stuff .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
                          $stuff .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
                          my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p;";

                          # XXX Doesn't currently test multi-char folds
                          next if @pattern != 1;
                          #next if ! $must_match;
                          push @eval_tests, qq[ok(eval '$eval', '$eval')];
                          $count++;

                          # Group tests
                          if (@eval_tests >= $clump_execs) {
                              eval join ";\n", @eval_tests;
                              undef @eval_tests;
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

# Finish up any tests not already done
eval join ";\n", @eval_tests;

plan($count-1);

1