Add \o{} escape

This commit adds the new construct \o{} to express a character constant by its octal ordinal value, along with ancillary tests and documentation. A function to handle this is added to util.c, and it is called from the 3 parsing places it could occur. The function is a candidate for in-lining, though I doubt that it will ever be used frequently.
author: Karl Williamson <khw@khw-desktop.(none)> 2010-07-15 17:28:28 -0600
committer: David Golden <dagolden@cpan.org> 2010-07-17 21:50:48 -0400
commit: f0a2b745ce6c03aec6412d79ce0b782f20eddce4 (patch)
tree: d1786b1a4a80f6b848dca1ab4eba6e3ffd5dc5d1
parent: 8e4698ef1ed0da722532bfcc769ba22fe85c4b47 (diff)
download: perl-f0a2b745ce6c03aec6412d79ce0b782f20eddce4.tar.gz
16 files changed, 284 insertions, 34 deletions
diff --git a/embed.fnc b/embed.fnc
index 8493dd7bc7..37c7f2b98e 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -639,6 +639,7 @@ p	|OP*	|localize	|NN OP *o|I32 lex
 ApdR	|I32	|looks_like_number|NN SV *const sv
 Apd	|UV	|grok_bin	|NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
 EXpR	|char	|grok_bslash_c	|const char source|const bool output_warning
+EXpR	|char*	|grok_bslash_o	|NN const char* s|NN UV* uv|NN STRLEN* len|const bool output_warning
 Apd	|UV	|grok_hex	|NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
 Apd	|int	|grok_number	|NN const char *pv|STRLEN len|NULLOK UV *valuep
 ApdR	|bool	|grok_numeric_radix|NN const char **sp|NN const char *send
diff --git a/embed.h b/embed.h
index 8fb3cbe4bf..fffdedec1a 100644
--- a/embed.h
+++ b/embed.h
@@ -463,6 +463,7 @@
 #define grok_bin		Perl_grok_bin
 #if defined(PERL_CORE) || defined(PERL_EXT)
 #define grok_bslash_c		Perl_grok_bslash_c
+#define grok_bslash_o		Perl_grok_bslash_o
 #endif
 #define grok_hex		Perl_grok_hex
 #define grok_number		Perl_grok_number
@@ -2909,6 +2910,7 @@
 #define grok_bin(a,b,c,d)	Perl_grok_bin(aTHX_ a,b,c,d)
 #if defined(PERL_CORE) || defined(PERL_EXT)
 #define grok_bslash_c(a,b)	Perl_grok_bslash_c(aTHX_ a,b)
+#define grok_bslash_o(a,b,c,d)	Perl_grok_bslash_o(aTHX_ a,b,c,d)
 #endif
 #define grok_hex(a,b,c,d)	Perl_grok_hex(aTHX_ a,b,c,d)
 #define grok_number(a,b,c)	Perl_grok_number(aTHX_ a,b,c)
diff --git a/global.sym b/global.sym
index aa61a699a5..332381564d 100644
--- a/global.sym
+++ b/global.sym
@@ -283,6 +283,7 @@ Perl_vload_module
 Perl_looks_like_number
 Perl_grok_bin
 Perl_grok_bslash_c
+Perl_grok_bslash_o
 Perl_grok_hex
 Perl_grok_number
 Perl_grok_numeric_radix
diff --git a/pod/perl5133delta.pod b/pod/perl5133delta.pod
index 476427e025..d4db338fd9 100644
--- a/pod/perl5133delta.pod
+++ b/pod/perl5133delta.pod
@@ -28,6 +28,17 @@ here, but most should go in the L</Performance Enhancements> section.
 
 [ List each enhancement as a =head2 entry ]
 
+=head2 \o{...}
+
+The escape sequence C<"\o"> in double-quotish contexts is now defined.  It
+must be followed by braces enclosing an octal number of at least one digit.  It
+means the character whose ordinal value is that octal number.  This construct
+allows large octal ordinals beyond the current max of 0777 to be represented.
+It also allows you to specify a character in octal which can safely be
+concatenated with other regex snippets without danger of changing its meaning,
+and one which won't ever be confused with being a backreference to a regex
+capture group.  See L<perlre/Capture groups>
+
 =head2 C<\N{I<name>}> and C<charnames> enhancements
 
 C<\N{}> and C<charnames::vianame> now know about the abbreviated character
@@ -84,7 +95,9 @@ anomalous behavior than their use in all other double-quotish contexts.   Since
 all double-quotish contexts have the same behavior, namely to be equivalent to
 C<\x{100}> - C<\x{1FF}>, with no deprecation warning. Use of these values in the
 command line option C<"-0"> retains the current meaning to slurp input files
-whole; previously, this was documented only for C<"-0777">.
+whole; previously, this was documented only for C<"-0777">.  It is recommended,
+however, because of various ambiguities, to use the new L</\o{...}> construct
+to represent characters in octal.
 
 =head1 Deprecations
 
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 9e8a28735c..9f9fe4b553 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -2510,6 +2510,10 @@ comment) between the C<\N> and the C<{> in a regex with the C</x> modifier.
 This modifier does not change the requirement that the brace immediately follow
 the C<\N>.
 
+=item Missing braces on \o{}
+
+(F) A C<\o> must be followed immediately by a C<{> in double-quotish context.
+
 =item Missing comma after first argument to %s function
 
 (F) While certain functions allow you to specify a filehandle or an
@@ -2978,6 +2982,11 @@ to UTC.  If it's not, define the logical name
 F<SYS$TIMEZONE_DIFFERENTIAL> to translate to the number of seconds which
 need to be added to UTC to get local time.
 
+=item Non-octal character '%c'.  Resolved as "%s"
+
+(W digit)  In parsing an octal numeric constant, a character was unexpectedly
+encountered that isn't octal.  The resulting value is as indicated.
+
 =item Non-string passed as bitmask
 
 (W misc) A number has been passed as a bitmask argument to select().
@@ -3020,6 +3029,11 @@ versions of Perl are likely to eliminate this arbitrary limitation.  In
 the meantime, try using scientific notation (e.g. "1e6" instead of
 "1_000_000").
 
+=item Number with no digits
+
+(F) Perl was looking for a number but found nothing that looked like a number.
+This happens, for example with C<\o{}>, with no number between the braces.
+
 =item Octal number in vector unsupported
 
 (F) Numbers with a leading C<0> are not currently allowed in vectors.
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 5ea15a013a..2e00f0bc69 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -229,11 +229,11 @@ also work:
  \f          form feed             (FF)
  \a          alarm (bell)          (BEL)
  \e          escape (think troff)  (ESC)
- \033        octal char            (example: ESC)
  \cK         control char          (example: VT)
  \x{}, \x00  character whose ordinal is the given hexadecimal number
  \N{name}    named Unicode character
  \N{U+263D}  Unicode character     (example: FIRST QUARTER MOON)
+ \o{}, \000  character whose ordinal is the given octal number
  \l          lowercase next char (think vi)
  \u          uppercase next char (think vi)
  \L          lowercase till \E (think vi)
diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod
index 9d246bdc2e..d460f7f052 100644
--- a/pod/perlrebackslash.pod
+++ b/pod/perlrebackslash.pod
@@ -62,7 +62,7 @@ quoted constructs>.
 Those not usable within a bracketed character class (like C<[\da-z]>) are marked
 as C<Not in [].>
 
- \000              Octal escape sequence.
+ \000              Octal escape sequence.  See also \o{}.
  \1                Absolute backreference.  Not in [].
  \a                Alarm or bell.
  \A                Beginning of string.  Not in [].
@@ -86,6 +86,7 @@ as C<Not in [].>
  \n                (Logical) newline character.
  \N                Any character but newline.  Experimental.  Not in [].
  \N{}              Named or numbered (Unicode) character.
+ \o{}              Octal escape sequence.
  \p{}, \pP         Character with the given Unicode property.
  \P{}, \PP         Character without the given Unicode property.
  \Q                Quotemeta till \E.  Not in [].
@@ -207,33 +208,57 @@ match "as is".
 
 =head3 Octal escapes
 
-Octal escapes consist of a backslash followed by three octal digits
-matching the code point of the character you want to use.  (In some contexts,
-two or even one octal digits are also accepted, sometimes with a warning.) This
-allows for 512 characters (C<\000> up to C<\777>) that can be expressed this
-way.  Enough in pre-Unicode days,
-but most Unicode characters cannot be escaped this way.
+There are two forms of octal escapes.  Each is used to specify a character by
+its ordinal, specified in octal notation.
+
+One form, available starting in Perl 5.14 looks like C<\o{...}>, where the dots
+represent one or more octal digits.  It can be used for any Unicode character.
+
+It was introduced to avoid the potential problems with the other form,
+available in all Perls.  That form consists of a backslash followed by three
+octal digits.  One problem with this form is that it can look exactly like an
+old-style backreference (see
+L</Disambiguation rules between old-style octal escapes and backreferences>
+below.)  You can avoid this by making the first of the three digits always a
+zero, but that makes \077 the largest ordinal unambiguously specifiable by this
+form.
+
+In some contexts, a backslash followed by two or even one octal digits may be
+interpreted as an octal escape, sometimes with a warning, and because of some
+bugs, sometimes with surprising results.  Also, if you are creating a regex
+out of smaller snippets concatentated together, and you use fewer than three
+digits, the beginning of one snippet may be interpreted as adding digits to the
+ending of the snippet before it.  See L</Absolute referencing> for more
+discussion and examples of the snippet problem.
 
 Note that a character that is expressed as an octal escape is considered
 as a character without special meaning by the regex engine, and will match
 "as is".
 
-=head4 Examples (assuming an ASCII platform)
+To summarize, the C<\o{}> form is always safe to use, and the other form is
+safe to use for ordinals up through \077 when you use exactly three digits to
+specify them.
 
- $str = "Perl";
- $str =~ /\120/;    # Match, "\120" is "P".
- $str =~ /\120+/;   # Match, "\120" is "P", it is repeated at least once
- $str =~ /P\053/;   # No match, "\053" is "+" and taken literally.
+Mnemonic: I<0>ctal or I<o>ctal.
 
-=head4 Caveat
+=head4 Examples (assuming an ASCII platform)
 
-Octal escapes potentially clash with old-style backreferences (see L</Absolute
-referencing> below). They both consist of a backslash followed by numbers. So
-Perl has to use heuristics to determine whether it is a backreference or an
-octal escape. You can avoid ambiguity by using the C<\g> form for
-backreferences, and by beginning octal escapes with a "0".  (Since octal
-escapes are 3 digits, this latter method works only up to C<\077>.)  In the
-absence of C<\g>, Perl uses the following rules:
+ $str = "Perl";
+ $str =~ /\o{120}/;  # Match, "\120" is "P".
+ $str =~ /\120/;     # Same.
+ $str =~ /\o{120}+/; # Match, "\120" is "P", it's repeated at least once
+ $str =~ /\120+/;    # Same.
+ $str =~ /P\053/;    # No match, "\053" is "+" and taken literally.
+ /\o{23073}/         # Black foreground, white background smiling face.
+ /\o{4801234567}/    # Raises a warning, and yields chr(4)
+
+=head4 Disambiguation rules between old-style octal escapes and backreferences
+
+Octal escapes of the C<\000> form outside of bracketed character classes
+potentially clash with old-style backreferences.  (see L</Absolute referencing>
+below).  They both consist of a backslash followed by numbers.  So Perl has to
+use heuristics to determine whether it is a backreference or an octal escape.
+Perl uses the following rules to disambiguate:
 
 =over 4
 
@@ -258,18 +283,24 @@ the rest are matched as is.
     $pat .= ")" x 999;
  /^($pat)\1000$/;   #  Matches 'aa'; there are 1000 capture groups.
  /^$pat\1000$/;     #  Matches 'a@0'; there are 999 capture groups
-                    #    and \1000 is seen as \100 (a '@') and a '0'.
+                    #    and \1000 is seen as \100 (a '@') and a '0'
 
 =back
 
+You can the force a backreference interpretation always by using the C<\g{...}>
+form.  You can the force an octal interpretation always by using the C<\o{...}>
+form, or for numbers up through \077 (= 63 decimal), by using three digits,
+beginning with a "0".
+
 =head3 Hexadecimal escapes
 
-Hexadecimal escapes start with C<\x> and are then either followed by a
-two digit hexadecimal number, or a hexadecimal number of arbitrary length
-surrounded by curly braces. The hexadecimal number is the code point of
-the character you want to express.
+Like octal escapes, there are two forms of hexadecimal escapes, but both start
+with the same thing, C<\x>.  This is followed by either exactly two hexadecimal
+digits forming a number, or a hexadecimal number of arbitrary length surrounded
+by curly braces. The hexadecimal number is the code point of the character you
+want to express.
 
-Note that a character that is expressed as a hexadecimal escape is considered
+Note that a character that is expressed as one of these escapes is considered
 as a character without special meaning by the regex engine, and will match
 "as is".
 
diff --git a/pod/perlretut.pod b/pod/perlretut.pod
index eae266a407..f2187179c2 100644
--- a/pod/perlretut.pod
+++ b/pod/perlretut.pod
@@ -184,7 +184,8 @@ bytes.  Here are some examples of escapes:
     "1000\t2000" =~ m(0\t2)   # matches
     "1000\n2000" =~ /0\n20/   # matches
     "1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000"
-    "cat"   =~ /\143\x61\x74/ # matches in ASCII, but a weird way to spell cat
+    "cat"   =~ /\o{143}\x61\x74/ # matches in ASCII, but a weird way
+                                 # to spell cat
 
 If you've been around Perl a while, all this talk of escape sequences
 may seem familiar.  Similar escape sequences are used in double-quoted
@@ -1876,9 +1877,9 @@ much about Perl's internal representation of strings.  But they do need
 to know 1) how to represent Unicode characters in a regexp and 2) that
 a matching operation will treat the string to be searched as a sequence
 of characters, not bytes.  The answer to 1) is that Unicode characters
-greater than C<chr(255)> are represented using the C<\x{hex}> notation,
-because the \0 octal and \x hex (without curly braces) don't go further
-than 255.
+greater than C<chr(255)> are represented using the C<\x{hex}> notation, because
+\x hex (without curly braces) doesn't go further than 255.  (Starting in Perl
+5.14) if you're an octal fan, you can also use C<\o{oct}>.
 
     /\x{263a}/;  # match a Unicode smiley face :)
 
diff --git a/proto.h b/proto.h
index 6a5110ea27..1fc11806e8 100644
--- a/proto.h
+++ b/proto.h
@@ -1633,6 +1633,14 @@ PERL_CALLCONV UV	Perl_grok_bin(pTHX_ const char* start, STRLEN* len_p, I32* flag
 PERL_CALLCONV char	Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
 			__attribute__warn_unused_result__;
 
+PERL_CALLCONV char*	Perl_grok_bslash_o(pTHX_ const char* s, UV* uv, STRLEN* len, const bool output_warning)
+			__attribute__warn_unused_result__
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_2)
+			__attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_GROK_BSLASH_O	\
+	assert(s); assert(uv); assert(len)
+
 PERL_CALLCONV UV	Perl_grok_hex(pTHX_ const char* start, STRLEN* len_p, I32* flags, NV *result)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_2)
diff --git a/regcomp.c b/regcomp.c
index 72af569aae..74f996bcd6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -7362,6 +7362,7 @@ tryagain:
 	    register UV ender;
 	    register char *p;
 	    char *s;
+	    char *error_msg;
 	    STRLEN foldlen;
 	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
 
@@ -7462,6 +7463,26 @@ tryagain:
 			  ender = ASCII_TO_NATIVE('\007');
 			p++;
 			break;
+		    case 'o':
+			{
+			    STRLEN brace_len = len;
+			    if ((error_msg = grok_bslash_o(p,
+							   &ender,
+							   &brace_len,
+							   SIZE_ONLY))
+				!= NULL)
+			    {
+				vFAIL(error_msg);
+			    }
+			    p += brace_len;
+			    if (PL_encoding && ender < 0x100) {
+				goto recode_encoding;
+			    }
+			    if (ender > 0xff) {
+				RExC_utf8 = 1;
+			    }
+			    break;
+			}
 		    case 'x':
 			if (*++p == '{') {
 			    char* const e = strchr(p, '}');
@@ -7971,6 +7992,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
 parseit:
     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
+	char* error_msg;
 
     charclassloop:
 
@@ -8077,6 +8099,21 @@ parseit:
 	    case 'b':	value = '\b';			break;
 	    case 'e':	value = ASCII_TO_NATIVE('\033');break;
 	    case 'a':	value = ASCII_TO_NATIVE('\007');break;
+	    case 'o':
+		RExC_parse--;	/* function expects to be pointed at the 'o' */
+		if ((error_msg = grok_bslash_o(RExC_parse,
+					       &value,
+					       &numlen,
+					       SIZE_ONLY))
+		    != NULL)
+		{
+		    vFAIL(error_msg);
+		}
+		RExC_parse += numlen;
+		if (PL_encoding && value < 0x100) {
+		    goto recode_encoding;
+		}
+		break;
 	    case 'x':
 		if (*RExC_parse == '{') {
                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
index f85aa440c4..3f80ccce72 100644
--- a/t/lib/warnings/regcomp
+++ b/t/lib/warnings/regcomp
@@ -207,3 +207,33 @@ Useless (?-c) - don't use /gc modifier in regex; marked by <-- HERE in m/(?o-c <
 Useless (?o) - use /o modifier in regex; marked by <-- HERE in m/(?o <-- HERE gc)/ at - line 12.
 Useless (?g) - use /g modifier in regex; marked by <-- HERE in m/(?og <-- HERE c)/ at - line 12.
 Useless (?c) - use /gc modifier in regex; marked by <-- HERE in m/(?ogc <-- HERE )/ at - line 12.
+########
+# regcomp.c [S_regatom]
+$a = qr/\o{/;
+EXPECT
+Missing right brace on \o{ in regex; marked by <-- HERE in m/\ <-- HERE o{/ at - line 2.
+########
+# regcomp.c [S_regatom]
+$a = qr/\o/;
+EXPECT
+Missing braces on \o{} in regex; marked by <-- HERE in m/\ <-- HERE o/ at - line 2.
+########
+# regcomp.c [S_regatom]
+$a = qr/\o{}/;
+EXPECT
+Number with no digits in regex; marked by <-- HERE in m/\ <-- HERE o{}/ at - line 2.
+########
+# regcomp.c [S_regclass]
+$a = qr/[\o{]/;
+EXPECT
+Missing right brace on \o{ in regex; marked by <-- HERE in m/[\ <-- HERE o{]/ at - line 2.
+########
+# regcomp.c [S_regclass]
+$a = qr/[\o]/;
+EXPECT
+Missing braces on \o{} in regex; marked by <-- HERE in m/[\ <-- HERE o]/ at - line 2.
+########
+# regcomp.c [S_regclass]
+$a = qr/[\o{}]/;
+EXPECT
+Number with no digits in regex; marked by <-- HERE in m/[\ <-- HERE o{}]/ at - line 2.
diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke
index 4bb131f7b9..076270ce06 100644
--- a/t/lib/warnings/toke
+++ b/t/lib/warnings/toke
@@ -966,3 +966,26 @@ Use of := for an empty attribute list is deprecated at - line 36.
 Use of := for an empty attribute list is deprecated at - line 38.
 Use of := for an empty attribute list is deprecated at - line 41.
 Use of := for an empty attribute list is deprecated at - line 42.
+########
+# toke.c
+use warnings 'syntax' ;
+my $a = "\o";
+my $a = "\o{";
+my $a = "\o{}";
+no warnings 'syntax' ;
+my $a = "\o";
+my $a = "\o{";
+my $a = "\o{}";
+EXPECT
+Missing braces on \o{} at - line 3, within string
+Missing right brace on \o{ at - line 4, within string
+Number with no digits at - line 5, within string
+BEGIN not safe after errors--compilation aborted at - line 6.
+########
+# toke.c
+use warnings 'digit' ;
+my $a = "\o{1238456}";
+no warnings 'digit' ;
+my $a = "\o{1238456}";
+EXPECT
+Non-octal character '8'.  Resolved as "\o{123}" at - line 3.
diff --git a/t/op/qq.t b/t/op/qq.t
index 7b75b9b7ef..01366086b1 100644
--- a/t/op/qq.t
+++ b/t/op/qq.t
@@ -5,7 +5,7 @@ BEGIN {
     @INC = '../lib';
 }
 
-print q(1..26
+print q(1..29
 );
 
 # This is() function is written to avoid ""
@@ -64,6 +64,9 @@ is ("\x{10FFFD}", chr 1114109);
 is ("\400", chr 0x100);
 is ("\600", chr 0x180);
 is ("\777", chr 0x1FF);
+is ("a\o{120}b", "a" . chr(0x50) . "b");
+is ("a\o{400}b", "a" . chr(0x100) . "b");
+is ("a\o{1000}b", "a" . chr(0x200) . "b");
 
 # These kludged tests should change when we remove the temporary fatal error
 # in util.c for "\c{".  And, the warning there should probably not be
diff --git a/t/re/re_tests b/t/re/re_tests
index fc29fb6bd5..36a2f4cee5 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1460,5 +1460,11 @@ abc\N{def	-	c	-	\\N{NAME} must be resolved by the lexer
 [a\400]	\x{100}	y	$&	\x{100}
 [b\600]	\x{180}	y	$&	\x{180}
 [c\777]	\x{1FF}	y	$&	\x{1FF}
+\o{120}	\x{50}	y	$&	\x{50}
+\o{400}	\x{100}	y	$&	\x{100}
+\o{1000}	\x{200}	y	$&	\x{200}
+[a\o{120}]	\x{50}	y	$&	\x{50}
+[a\o{400}]	\x{100}	y	$&	\x{100}
+[a\o{1000}]	\x{200}	y	$&	\x{200}
 
 # vim: softtabstop=0 noexpandtab
diff --git a/toke.c b/toke.c
index b7b33e87b1..75fb327148 100644
--- a/toke.c
+++ b/toke.c
@@ -2879,6 +2879,20 @@ S_scan_const(pTHX_ char *start)
 		}
 		goto NUM_ESCAPE_INSERT;
 
+	    /* eg. \o{24} indicates the octal constant \024 */
+	    case 'o':
+		{
+		    STRLEN len;
+
+		    char* error = grok_bslash_o(s, &uv, &len, 1);
+		    s += len;
+		    if (error) {
+			yyerror(error);
+			continue;
+		    }
+		    goto NUM_ESCAPE_INSERT;
+		}
+
 	    /* eg. \x24 indicates the hex constant 0x24 */
 	    case 'x':
 		++s;
diff --git a/util.c b/util.c
index b3b385e2cc..6fdc6534a9 100644
--- a/util.c
+++ b/util.c
@@ -3904,7 +3904,7 @@ Perl_report_evil_fh(pTHX_ const GV *gv, const IO *io, I32 op)
 char
 Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
 {
-    
+
     U8 result;
 
     if (! isASCII(source)) {
@@ -3935,6 +3935,72 @@ Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
     return result;
 }
 
+char *
+Perl_grok_bslash_o(pTHX_ const char *s, UV *uv, STRLEN *len, const bool output_warning)
+{
+
+/*  Documentation to be supplied when interface nailed down finally
+ *  This returns NULL on success, otherwise a pointer to an internal constant
+ *  error message.  On input:
+ *	s   points to a string that begins with o, and the previous character was
+ *	    a backslash.
+ *	uv  points to a UV that will hold the output value
+ *	len will point to the next character in the string past the end of this
+ *	    construct
+ *	output_warning says whether to output any warning messages, or suppress
+ *	    them
+ */
+    char* e;
+    STRLEN numbers_len;
+    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+		| PERL_SCAN_DISALLOW_PREFIX
+		/* XXX Until the message is improved in grok_oct, handle errors
+		 * ourselves */
+	        | PERL_SCAN_SILENT_ILLDIGIT;
+
+    PERL_ARGS_ASSERT_GROK_BSLASH_O;
+
+
+    assert(*s == 'o');
+    s++;
+
+    if (*s != '{') {
+	*len = 1;	/* Move past the o */
+	return "Missing braces on \\o{}";
+    }
+
+    e = strchr(s, '}');
+    if (!e) {
+	*len = 2;	/* Move past the o{ */
+	return "Missing right brace on \\o{";
+    }
+
+    /* Return past the '}' no matter what is inside the braces */
+    *len = e - s + 2;	/* 2 = 1 for the o + 1 for the '}' */
+
+    s++;    /* Point to first digit */
+
+    numbers_len = e - s;
+    if (numbers_len == 0) {
+	return "Number with no digits";
+    }
+
+    *uv = NATIVE_TO_UNI(grok_oct(s, &numbers_len, &flags, NULL));
+    /* Note that if has non-octal, will ignore everything starting with that up
+     * to the '}' */
+
+    if (output_warning && numbers_len != (STRLEN) (e - s)) {
+	Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
+	/* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
+		       "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
+		       *(s + numbers_len),
+		       (int) numbers_len,
+		       s);
+    }
+
+    return NULL;
+}
+
 /* To workaround core dumps from the uninitialised tm_zone we get the
  * system to give us a reasonable struct to copy.  This fix means that
  * strftime uses the tm_zone and tm_gmtoff values returned by
author	Karl Williamson <khw@khw-desktop.(none)>	2010-07-15 17:28:28 -0600
committer	David Golden <dagolden@cpan.org>	2010-07-17 21:50:48 -0400
commit	f0a2b745ce6c03aec6412d79ce0b782f20eddce4 (patch)
tree	d1786b1a4a80f6b848dca1ab4eba6e3ffd5dc5d1
parent	8e4698ef1ed0da722532bfcc769ba22fe85c4b47 (diff)
download	perl-f0a2b745ce6c03aec6412d79ce0b782f20eddce4.tar.gz