9 files changed, 180 insertions, 24 deletions
diff --git a/embed.fnc b/embed.fnc
index bf3b8c5c72..85166eb558 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -909,7 +909,7 @@ Ap	|I32 *	|markstack_grow
 #if defined(USE_LOCALE_COLLATE)
 p	|int	|magic_setcollxfrm|NN SV* sv|NN MAGIC* mg
 : Defined in locale.c, used only in sv.c
-p	|char*	|mem_collxfrm	|NN const char* s|STRLEN len|NN STRLEN* xlen
+p	|char*	|mem_collxfrm	|NN const char* input_string|STRLEN len|NN STRLEN* xlen
 #endif
 Afpd	|SV*	|mess		|NN const char* pat|...
 Apd	|SV*	|mess_sv	|NN SV* basemsg|bool consume
diff --git a/embedvar.h b/embedvar.h
index bd151934fd..794ed9a31e 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -309,6 +309,7 @@
 #define PL_stderrgv		(vTHX->Istderrgv)
 #define PL_stdingv		(vTHX->Istdingv)
 #define PL_strtab		(vTHX->Istrtab)
+#define PL_strxfrm_min_char	(vTHX->Istrxfrm_min_char)
 #define PL_sub_generation	(vTHX->Isub_generation)
 #define PL_subline		(vTHX->Isubline)
 #define PL_subname		(vTHX->Isubname)
diff --git a/intrpvar.h b/intrpvar.h
index 9366383221..42872e5ba2 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -564,6 +564,7 @@ PERLVAR(I, collation_name, char *)	/* Name of current collation */
 PERLVAR(I, collxfrm_base, Size_t)	/* Basic overhead in *xfrm() */
 PERLVARI(I, collxfrm_mult,Size_t, 2)	/* Expansion factor in *xfrm() */
 PERLVARI(I, collation_ix, U32,	0)	/* Collation generation index */
+PERLVARA(I, strxfrm_min_char, 3, char)
 PERLVARI(I, collation_standard, bool, TRUE)
 					/* Assume simple collation */
 #endif /* USE_LOCALE_COLLATE */
diff --git a/lib/locale.t b/lib/locale.t
index ddb5d795d9..ce0c9879d3 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -1740,7 +1740,7 @@ foreach my $Locale (@Locale) {
 
         ++$locales_test_number;
         $test_names{$locales_test_number}
-            = 'TODO Skip in locales where \001 has primary sorting weight; '
+            = 'Skip in locales where \001 has primary sorting weight; '
             . 'otherwise verify that \0 doesn\'t have primary sorting weight';
         if ("a\001c" lt "ab") {
             report_result($Locale, $locales_test_number, 1);
@@ -1749,6 +1749,19 @@ foreach my $Locale (@Locale) {
             my $ok = "ab" lt "a\0c";
             report_result($Locale, $locales_test_number, $ok);
         }
+
+        ++$locales_test_number;
+        $test_names{$locales_test_number}
+                            = 'TODO Verify that strings with embedded NUL collate';
+        my $ok = "a\0a\0a" lt "a\001a\001a";
+        report_result($Locale, $locales_test_number, $ok);
+
+        ++$locales_test_number;
+        $test_names{$locales_test_number}
+                            = 'TODO Verify that strings with embedded NUL and '
+                            . 'extra trailing NUL collate';
+        $ok = "a\0a\0" lt "a\001a\001";
+        report_result($Locale, $locales_test_number, $ok);
     }
 
     my $ok1;
diff --git a/locale.c b/locale.c
index 383c83e124..1806132f00 100644
--- a/locale.c
+++ b/locale.c
@@ -486,6 +486,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
 	PL_collxfrm_base = 0;
 	PL_collxfrm_mult = 2;
         PL_in_utf8_COLLATE_locale = FALSE;
+        *PL_strxfrm_min_char = '\0';
 	return;
     }
 
@@ -500,6 +501,7 @@ Perl_new_collate(pTHX_ const char *newcoll)
         }
 
         PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
+        *PL_strxfrm_min_char = '\0';
 
         /* A locale collation definition includes primary, secondary, tertiary,
          * etc. weights for each character.  To sort, the primary weights are
@@ -1295,13 +1297,136 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
  */
 
 char *
-Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen)
+Perl_mem_collxfrm(pTHX_ const char *input_string,
+                         STRLEN len,
+                         STRLEN *xlen
+                   )
 {
+    char * s = (char *) input_string;
+    STRLEN s_strlen = strlen(input_string);
     char *xbuf;
-    STRLEN xAlloc, xin, xout; /* xalloc is a reserved word in VC */
+    STRLEN xAlloc, xout; /* xalloc is a reserved word in VC */
 
     PERL_ARGS_ASSERT_MEM_COLLXFRM;
 
+    /* Replace any embedded NULs with the control that sorts before any others.
+     * This will give as good as possible results on strings that don't
+     * otherwise contain that character, but otherwise there may be
+     * less-than-perfect results with that character and NUL.  This is
+     * unavoidable unless we replace strxfrm with our own implementation.
+     *
+     * XXX This code may be overkill.  khw wrote it before realizing that if
+     * you change a NUL into some other character, that that may change the
+     * strxfrm results if that character is part of a sequence with other
+     * characters for weight calculations.  To minimize the chances of this,
+     * now the replacement is restricted to another control (likely to be
+     * \001).  But the full generality has been retained.
+     *
+     * This is one of the few places in the perl core, where we can use
+     * standard functions like strlen() and strcat().  It's because we're
+     * looking for NULs. */
+    if (s_strlen < len) {
+        char * e = s + len;
+        char * sans_nuls;
+        STRLEN cur_min_char_len;
+
+        /* If we don't know what control character sorts lowest for this
+         * locale, find it */
+        if (*PL_strxfrm_min_char == '\0') {
+            int j;
+            char * cur_min_x = NULL;    /* Cur cp's xfrm, (except it also
+                                           includes the collation index
+                                           prefixed. */
+
+            /* Look through all legal code points (NUL isn't) */
+            for (j = 1; j < 256; j++) {
+                char * x;       /* j's xfrm plus collation index */
+                STRLEN x_len;   /* length of 'x' */
+                STRLEN trial_len = 1;
+
+                /* Create a 1 byte string of the current code point, but with
+                 * room to be 2 bytes */
+                char cur_source[] = { (char) j, '\0' , '\0' };
+
+                if (PL_in_utf8_COLLATE_locale) {
+                    if (! isCNTRL_L1(j)) {
+                        continue;
+                    }
+
+                    /* If needs to be 2 bytes, find them */
+                    if (! UVCHR_IS_INVARIANT(j)) {
+                        continue;  /* Can't handle variants yet */
+                    }
+                }
+                else if (! isCNTRL_LC(j)) {
+                    continue;
+                }
+
+                /* Then transform it */
+                x = mem_collxfrm(cur_source, trial_len, &x_len);
+
+                /* If something went wrong (which it shouldn't), just
+                 * ignore this code point */
+                if (   x_len == 0
+                    || strlen(x + sizeof(PL_collation_ix)) < x_len)
+                {
+                    continue;
+                }
+
+                /* If this character's transformation is lower than
+                 * the current lowest, this one becomes the lowest */
+                if (   cur_min_x == NULL
+                    || strLT(x         + sizeof(PL_collation_ix),
+                             cur_min_x + sizeof(PL_collation_ix)))
+                {
+                    strcpy(PL_strxfrm_min_char, cur_source);
+                    cur_min_x = x;
+                }
+                else {
+                    Safefree(x);
+                }
+            } /* end of loop through all bytes */
+
+            /* Unlikely, but possible, if there aren't any controls in the
+             * locale, arbitrarily use \001 */
+            if (cur_min_x == NULL) {
+                STRLEN x_len;   /* temporary */
+                cur_min_x = mem_collxfrm("\001", 1, &x_len);
+                /* cur_min_cp was already initialized to 1 */
+            }
+
+            Safefree(cur_min_x);
+        }
+
+        /* The worst case length for the replaced string would be if every
+         * character in it is NUL.  Multiply that by the length of each
+         * replacement, and allow for a trailing NUL */
+        cur_min_char_len = strlen(PL_strxfrm_min_char);
+        Newx(sans_nuls, (len * cur_min_char_len) + 1, char);
+        *sans_nuls = '\0';
+
+
+        /* Replace each NUL with the lowest collating control.  Loop until have
+         * exhausted all the NULs */
+        while (s + s_strlen < e) {
+            strcat(sans_nuls, s);
+
+            /* Do the actual replacement */
+            strcat(sans_nuls, PL_strxfrm_min_char);
+
+            /* Move past the input NUL */
+            s += s_strlen + 1;
+            s_strlen = strlen(s);
+        }
+
+        /* And add anything that trails the final NUL */
+        strcat(sans_nuls, s);
+
+        /* Switch so below we transform this modified string */
+        s = sans_nuls;
+        len = strlen(s);
+    }
+
     /* The first element in the output is the collation id, used by
      * sv_collxfrm(); then comes the space for the transformed string.  The
      * equation should give us a good estimate as to how much is needed */
@@ -1316,17 +1441,16 @@ Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen)
 
     /* Then the transformation of the input.  We loop until successful, or we
      * give up */
-    for (xin = 0; xin < len; ) {
-	Size_t xused;
-
 	for (;;) {
-	    xused = strxfrm(xbuf + xout, s + xin, xAlloc - xout);
+            STRLEN xused = strxfrm(xbuf + xout, s, xAlloc - xout);
 
             /* If the transformed string occupies less space than we told
              * strxfrm() was available, it means it successfully transformed
              * the whole string. */
-	    if ((STRLEN)xused < xAlloc - xout)
+	    if (xused < xAlloc - xout) {
+                xout += xused;
 		break;
+            }
 
 	    if (UNLIKELY(xused >= PERL_INT_MAX))
 		goto bad;
@@ -1340,19 +1464,20 @@ Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen)
 		goto bad;
 	}
 
-	xin += strlen(s + xin) + 1;
-	xout += xused;
+    *xlen = xout - sizeof(PL_collation_ix);
+
 
-	/* Embedded NULs are understood but silently skipped
-	 * because they make no sense in locale collation. */
+    if (s != input_string) {
+        Safefree(s);
     }
 
-    xbuf[xout] = '\0';
-    *xlen = xout - sizeof(PL_collation_ix);
     return xbuf;
 
   bad:
     Safefree(xbuf);
+    if (s != input_string) {
+        Safefree(s);
+    }
     *xlen = 0;
     return NULL;
 }
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 76f4972c08..11a089b989 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -27,6 +27,14 @@ here, but most should go in the L</Performance Enhancements> section.
 
 [ List each enhancement as a =head2 entry ]
 
+=head2 Better locale collation of strings containing embedded C<NUL>
+characters
+
+In locales that have multi-level character weights, these are now
+ignored at the higher priority ones.  There are still some gotchas in
+some strings, though.  See
+L<perllocale/Collation of strings containing embedded C<NUL> characters>.
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
diff --git a/pod/perllocale.pod b/pod/perllocale.pod
index 0c7e769111..d842a0781a 100644
--- a/pod/perllocale.pod
+++ b/pod/perllocale.pod
@@ -1567,13 +1567,14 @@ called, and whatever it does is what you get.
 
 =head2 Collation of strings containing embedded C<NUL> characters
 
-Perl handles C<NUL> characters in the middle of strings.  In many
-locales, control characters are ignored unless the strings otherwise
-compare equal.  Unlike other control characters, C<NUL> characters are
-never ignored.   For example, if given that C<"b"> sorts after
-C<"\001">, and C<"c"> sorts after C<"b">, C<"a\0c"> always sorts before
-C<"ab">.  This is true even in locales in which C<"ab"> sorts before
-C<"a\001c">.
+C<NUL> characters will sort the same as the lowest collating control
+character does, or to C<"\001"> in the unlikely event that there are no
+control characters at all in the locale.  In cases where the strings
+don't contain this non-C<NUL> control, the results will be correct, and
+in many locales, this control, whatever it might be, will rarely be
+encountered.  But there are cases where a C<NUL> should sort before this
+control, but doesn't.  If two strings do collate identically, the one
+containing the C<NUL> will sort to earlier.
 
 =head2 Broken systems
 
diff --git a/proto.h b/proto.h
index d16dd07ede..0819f26572 100644
--- a/proto.h
+++ b/proto.h
@@ -5785,9 +5785,9 @@ STATIC char*	S_stdize_locale(pTHX_ char* locs);
 PERL_CALLCONV int	Perl_magic_setcollxfrm(pTHX_ SV* sv, MAGIC* mg);
 #define PERL_ARGS_ASSERT_MAGIC_SETCOLLXFRM	\
 	assert(sv); assert(mg)
-PERL_CALLCONV char*	Perl_mem_collxfrm(pTHX_ const char* s, STRLEN len, STRLEN* xlen);
+PERL_CALLCONV char*	Perl_mem_collxfrm(pTHX_ const char* input_string, STRLEN len, STRLEN* xlen);
 #define PERL_ARGS_ASSERT_MEM_COLLXFRM	\
-	assert(s); assert(xlen)
+	assert(input_string); assert(xlen)
 /* PERL_CALLCONV char*	sv_collxfrm(pTHX_ SV *const sv, STRLEN *const nxp); */
 PERL_CALLCONV char*	Perl_sv_collxfrm_flags(pTHX_ SV *const sv, STRLEN *const nxp, I32 const flags);
 #define PERL_ARGS_ASSERT_SV_COLLXFRM_FLAGS	\
diff --git a/t/porting/libperl.t b/t/porting/libperl.t
index 00f2606738..161f7162d3 100644
--- a/t/porting/libperl.t
+++ b/t/porting/libperl.t
@@ -527,6 +527,13 @@ for my $symbol (sort keys %unexpected) {
       SKIP: {
         skip("uses sprintf for Gconvert in sv.o");
       }
+    }
+    elsif (   $symbol eq 'strcat'
+           && @o == 1 && $o[0] eq 'locale.o')
+    {
+      SKIP: {
+        skip("locale.o legitimately uses strcat");
+      }
     } else {
         is(@o, 0, "uses no $symbol (@o)");
     }