diff options
-rw-r--r-- | MANIFEST | 2 | ||||
-rw-r--r-- | pod.lst | 2 | ||||
-rw-r--r-- | pod/perl.pod | 2 | ||||
-rw-r--r-- | pod/perlrebackslash.pod | 528 | ||||
-rw-r--r-- | pod/perlrecharclass.pod | 525 | ||||
-rw-r--r-- | vms/descrip_mms.template | 25 | ||||
-rw-r--r-- | win32/pod.mak | 8 |
7 files changed, 1084 insertions, 8 deletions
@@ -3149,6 +3149,8 @@ pod/perlpodspec.pod Perl plain old documentation format specification pod/perlport.pod Perl portability guide pod/perlpragma.pod Perl modules: writing a user pragma pod/perlreapi.pod Perl regular expression plugin interface +pod/perlrebackslash.pod Perl regular expression backslash sequences +pod/perlrecharclass.pod Perl regular expression character classes pod/perlref.pod Perl references, the rest of the story pod/perlreftut.pod Perl references short introduction pod/perlreguts.pod Perl regular expression engine internals @@ -60,6 +60,8 @@ h Reference Manual perldebug Perl debugging perlvar Perl predefined variables perlre Perl regular expressions, the rest of the story + perlrebackslash Perl regular expression backslash sequences + perlrecharclass Perl regular expression character classes perlreref Perl regular expressions quick reference perlref Perl references, the rest of the story perlform Perl formats diff --git a/pod/perl.pod b/pod/perl.pod index d6bbd61135..bf24ed86f8 100644 --- a/pod/perl.pod +++ b/pod/perl.pod @@ -77,6 +77,8 @@ For ease of access, the Perl manual has been split up into several sections. perldebug Perl debugging perlvar Perl predefined variables perlre Perl regular expressions, the rest of the story + perlrebackslash Perl regular expression backslash sequences + perlrecharclass Perl regular expression character classes perlreref Perl regular expressions quick reference perlref Perl references, the rest of the story perlform Perl formats diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod new file mode 100644 index 0000000000..7851bf9f6a --- /dev/null +++ b/pod/perlrebackslash.pod @@ -0,0 +1,528 @@ +=head1 NAME + +perlrebackslash - Perl Regular Expression Backslash Sequences and Escapes + +=head1 DESCRIPTION + +The top level documentation about Perl regular expressions +is found in L<perlre>. + +This document describes all backslash and escape sequences. 
After +explaining the role of the backslash, it lists all the sequences that have +a special meaning in Perl regular expressions (in alphabetical order), +then describes each of them. + +Most sequences are described in detail in different documents; the primary +purpose of this document is to have a quick reference guide describing all +backslash and escape sequences. + + +=head2 The backslash + +In a regular expression, the backslash can perform one of two tasks: +it either takes away the special meaning of the character following it +(for instance, C<\|> matches a vertical bar, it's not an alternation), +or it is the start of a backslash or escape sequence. + +The rules determining what it is are quite simple: if the character +following the backslash is a punctuation (non-word) character (that is, +anything that is not a letter, digit or underscore), then the backslash +just takes away the special meaning (if any) of the character following +it. + +If the character following the backslash is a letter or a digit, then the +sequence may be special; if so, it's listed below. A few letters have not +been used yet, and escaping them with a backslash is safe for now, but a +future version of Perl may assign a special meaning to them. However, if you +have warnings turned on, Perl will issue a warning if you use such a sequence +[1]. + +It is however guaranteed that backslash or escape sequences never have a +punctuation character following the backslash, not now, and not in a future +version of Perl 5. So it is safe to put a backslash in front of a non-word +character. + +Note that the backslash itself is special; if you want to match a backslash, +you have to escape the backslash with a backslash: C</\\/> matches a single +backslash. + +=over 4 + +=item [1] + +There is one exception.
If you use an alphanumerical character as the +delimiter of your pattern (which you probably shouldn't do for readability +reasons), you will have to escape the delimiter if you want to match +it. Perl won't warn then. See also L<perlop/Gory details of parsing +quoted constructs>. + +=back + + +=head2 All the sequences and escapes + + \000 Octal escape sequence. + \1 Absolute backreference. + \a Alarm or bell. + \A Beginning of string. + \b Word/non-word boundary. (Backspace in a char class). + \B Not a word/non-word boundary. + \cX Control-X (X can be any ASCII character). + \C Single octet, even under UTF-8. + \d Character class for digits. + \D Character class for non-digits. + \e Escape character. + \E Turn off \Q, \L and \U processing. + \f Form feed. + \g{}, \g1 Named, absolute or relative backreference. + \G Pos assertion. + \h Character class for horizontal white space. + \H Character class for non horizontal white space. + \k{}, \k<>, \k'' Named backreference. + \K Keep the stuff left of \K. + \l Lowercase next character. + \L Lowercase till \E. + \n (Logical) newline character. + \N{} Named (Unicode) character. + \p{}, \pP Character with a Unicode property. + \P{}, \PP Character without a Unicode property. + \Q Quotemeta till \E. + \r Return character. + \R Generic new line. + \s Character class for white space. + \S Character class for non white space. + \t Tab character. + \u Uppercase next character. + \U Uppercase till \E. + \v Character class for vertical white space. + \V Character class for non vertical white space. + \w Character class for word characters. + \W Character class for non-word characters. + \x{}, \x00 Hexadecimal escape sequence. + \X Extended Unicode "combining character sequence". + \z End of string. + \Z End of string, or before a trailing newline. + +=head2 Character Escapes + +=head3 Fixed characters + +A handful of characters have a dedicated I<character escape>.
The following +table shows them, along with their code points (in decimal and hex), their +ASCII name, the control escape (see below) and a short description. + + Seq. Code Point ASCII Cntr Description. + Dec Hex + \a 7 07 BEL \cG alarm or bell + \b 8 08 BS \cH backspace [1] + \e 27 1B ESC \c[ escape character + \f 12 0C FF \cL form feed + \n 10 0A LF \cJ line feed [2] + \r 13 0D CR \cM carriage return + \t 9 09 TAB \cI tab + +=over 4 + +=item [1] + +C<\b> is the backspace character only inside a character class. Outside a +character class, C<\b> is a word/non-word boundary. + +=item [2] + +C<\n> matches a logical newline. Perl will convert between C<\n> and your +OS's native newline character when reading from or writing to text files. + +=back + +=head4 Example + + $str =~ /\t/; # Matches if $str contains a (horizontal) tab. + +=head3 Control characters + +C<\c> is used to denote a control character; the character following C<\c> +is the name of the control character. For instance, C</\cM/> matches the +character I<control-M> (a carriage return, code point 13). The case of the +character following C<\c> doesn't matter: C<\cM> and C<\cm> match the same +character. + +Mnemonic: I<c>ontrol character. + +=head4 Example + + $str =~ /\cK/; # Matches if $str contains a vertical tab (control-K). + +=head3 Named characters + +All Unicode characters have a Unicode name, and characters in various scripts +have names as well. It is even possible to give your own names to characters. +You can use a character by name by using the C<\N{}> construct; the name of +the character goes between the curly braces. You do have to C<use charnames> +to load the names of the characters, otherwise Perl will complain that you use +a name it doesn't know about. For more details, see L<charnames>. + +Mnemonic: I<N>amed character. + +=head4 Example + + use charnames ':full'; # Loads the Unicode names.
+ $str =~ /\N{THAI CHARACTER SO SO}/; # Matches the Thai SO SO character + + use charnames 'Cyrillic'; # Loads Cyrillic names. + $str =~ /\N{ZHE}\N{KA}/; # Match "ZHE" followed by "KA". + +=head3 Octal escapes + +Octal escapes consist of a backslash followed by two or three octal digits +matching the code point of the character you want to use. This allows for +512 characters (C<\00> up to C<\777>) that can be expressed this way. +Enough in pre-Unicode days, but most Unicode characters cannot be escaped +this way. + +Note that a character that is expressed as an octal escape is considered +as a character without special meaning by the regex engine, and will match +"as is". + +=head4 Examples + + $str = "Perl"; + $str =~ /\120/; # Match, "\120" is "P". + $str =~ /\120+/; # Match, "\120" is "P", it is repeated at least once. + $str =~ /P\053/; # No match, "\053" is "+" and taken literally. + +=head4 Caveat + +Octal escapes potentially clash with backreferences. They both consist +of a backslash followed by digits. So Perl has to use heuristics to +determine whether it is a backreference or an octal escape. Perl uses +the following rules: + +=over 4 + +=item 1 + +If the backslash is followed by a single digit, it's a backreference. + +=item 2 + +If the first digit following the backslash is a 0, it's an octal escape. + +=item 3 + +If the number following the backslash is N (decimal), and Perl already has +seen N capture groups, Perl will consider this to be a backreference. +Otherwise, it will consider it to be an octal escape. Note that if N > 999, +Perl only takes the first three digits for the octal escape; the rest is +matched as is. + + my $pat = "(" x 999; + $pat .= "a"; + $pat .= ")" x 999; + /^($pat)\1000$/; # Matches 'aa'; there are 1000 capture groups. + /^$pat\1000$/; # Matches 'a@0'; there are 999 capture groups + # and \1000 is seen as \100 (a '@') and a '0'.
+ +=back + +=head3 Hexadecimal escapes + +Hexadecimal escapes start with C<\x> and are then either followed by +a two digit hexadecimal number, or a hexadecimal number of arbitrary length +surrounded by curly braces. The hexadecimal number is the code point of +the character you want to express. + +Note that a character that is expressed as a hexadecimal escape is considered +as a character without special meaning by the regex engine, and will match +"as is". + +Mnemonic: heI<x>adecimal. + +=head4 Examples + + $str = "Perl"; + $str =~ /\x50/; # Match, "\x50" is "P". + $str =~ /\x50+/; # Match, "\x50" is "P", it is repeated at least once. + $str =~ /P\x2B/; # No match, "\x2B" is "+" and taken literally. + + /\x{2603}\x{2602}/ # Snowman with an umbrella. + # The Unicode character 2603 is a snowman, + # the Unicode character 2602 is an umbrella. + /\x{263B}/ # Black smiling face. + /\x{263b}/ # Same, the hex digits A - F are case insensitive. + +=head2 Modifiers + +A number of backslash sequences have to do with changing the character, +or characters following them. C<\l> will lowercase the character following +it, while C<\u> will uppercase the character following it. (They perform +similar functionality as the functions C<lcfirst> and C<ucfirst>). + +To uppercase or lowercase several characters, one might want to use +C<\L> or C<\U>, which will lowercase/uppercase all characters following +them, until either the end of the pattern, or the next occurrence of +C<\E>, whichever comes first. They perform similar functionality as the +functions C<lc> and C<uc> do. + +C<\Q> is used to escape all characters following, up to the next C<\E> +or the end of the pattern. C<\Q> adds a backslash to any character that +isn't a letter, digit or underscore. This will ensure that any character +between C<\Q> and C<\E> is matched literally, and will not be interpreted +by the regexp engine. + +Mnemonic: I<L>owercase, I<U>ppercase, I<Q>uotemeta, I<E>nd.
+ +=head4 Examples + + $sid = "sid"; + $greg = "GrEg"; + $miranda = "(Miranda)"; + $str =~ /\u$sid/; # Matches 'Sid' + $str =~ /\L$greg/; # Matches 'greg' + $str =~ /\Q$miranda\E/; # Matches '(Miranda)', as if the pattern + # had been written as /\(Miranda\)/ + +=head2 Character classes + +Perl regular expressions have a large range of character classes. Some of +the character classes are written as a backslash sequence. We will briefly +discuss those here; full details of character classes can be found in +L<perlrecharclass>. + +C<\w> is a character class that matches any I<word> character (letters, +digits, underscore). C<\d> is a character class that matches any digit, +while the character class C<\s> matches any white space character. +New in perl 5.10 are the classes C<\h> and C<\v> which match horizontal +and vertical white space characters. + +The uppercase variants (C<\W>, C<\D>, C<\S>, C<\H>, and C<\V>) are +character classes that match any character that isn't a word character, +digit, white space, horizontal white space or vertical white space, +respectively. + +Mnemonics: I<w>ord, I<d>igit, I<s>pace, I<h>orizontal, I<v>ertical. + +=head3 Unicode classes + +C<\pP> (where C<P> is a single letter) and C<\p{Property}> are used to +match a character that matches the given Unicode property; properties +include things like "letter", or "Thai character". Capitalizing the +sequence to C<\PP> and C<\P{Property}> makes the sequence match a character +that doesn't match the given Unicode property. For more details, see +L<perlrecharclass/Backslashed sequences> and +L<perlunicode/Unicode Character Properties>. + +Mnemonic: I<p>roperty. + + +=head2 Referencing + +If capturing parentheses are used in a regular expression, we can refer +to the part of the source string that was matched, and match exactly the +same thing. (Full details are discussed in L<perlrecapture>). There are +three ways of referring to such a I<backreference>: absolutely, relatively, +and by name.
+ +=head3 Absolute referencing + +A backslash sequence that starts with a backslash and is followed by a +number is an absolute reference (but be aware of the caveat mentioned above). +If the number is I<N>, it refers to the Nth set of parentheses - whatever +has been matched by that set of parentheses has to be matched by the C<\N> +as well. + +=head4 Examples + + /(\w+) \1/; # Finds a duplicated word, (e.g. "cat cat"). + /(.)(.)\2\1/; # Match a four letter palindrome (e.g. "ABBA"). + + +=head3 Relative referencing + +New in perl 5.10 is a different way of referring to capture buffers: C<\g>. +C<\g> takes a number as argument, with the number in curly braces (the +braces are optional). If the number (N) does not have a sign, it's a reference +to the Nth capture group (so C<\g{2}> is equivalent to C<\2> - except that +C<\g> always refers to a capture group and will never be seen as an octal +escape). If the number is negative, the reference is relative, referring to +the Nth group before the C<\g{-N}>. + +The big advantage of C<\g{-N}> is that it makes it much easier to write +patterns with references that can be interpolated in larger patterns, +even if the larger pattern also contains capture groups. + +Mnemonic: I<g>roup. + +=head4 Examples + + /(A) # Buffer 1 + ( # Buffer 2 + (B) # Buffer 3 + \g{-1} # Refers to buffer 3 (B) + \g{-3} # Refers to buffer 1 (A) + ) + /x; # Matches "ABBA". + + my $qr = qr /(.)(.)\g{-2}\g{-1}/; # Matches 'abab', 'cdcd', etc. + /$qr$qr/ # Matches 'ababcdcd'. + +=head3 Named referencing + +Also new in perl 5.10 is the use of named capture buffers, which can be +referred to by name. This is done with C<\g{name}>, which is a +backreference to the capture buffer with the name I<name>. + +To be compatible with .Net regular expressions, C<\g{name}> may also be +written as C<\k{name}>, C<< \k<name> >> or C<\k'name'>.
+ +Note that C<\g{}> has the potential to be ambiguous, as it could be a named +reference, or an absolute or relative reference (if its argument is numeric). +However, names are not allowed to start with digits, nor are they allowed to +contain a hyphen, so there is no ambiguity. + +=head4 Examples + + /(?<word>\w+) \g{word}/ # Finds duplicated word, (e.g. "cat cat") + /(?<word>\w+) \k{word}/ # Same. + /(?<word>\w+) \k<word>/ # Same. + /(?<letter1>.)(?<letter2>.)\g{letter2}\g{letter1}/ + # Match a four letter palindrome (e.g. "ABBA") + +=head2 Assertions + +Assertions are conditions that have to be true -- they don't actually +match parts of the string. There are six assertions that are written as +backslash sequences. + +=over 4 + +=item \A + +C<\A> only matches at the beginning of the string. If the C</m> modifier +isn't used, then C</\A/> is equivalent to C</^/>. However, if the C</m> +modifier is used, then C</^/> also matches after internal newlines, but the +meaning of C</\A/> isn't changed by the C</m> modifier. C<\A> matches at the +beginning of the string regardless of whether the C</m> modifier is used. + +=item \z, \Z + +C<\z> and C<\Z> match at the end of the string. If the C</m> modifier isn't +used, then C</\Z/> is equivalent to C</$/>, that is, it matches at the +end of the string, or before the newline at the end of the string. If the +C</m> modifier is used, then C</$/> also matches before internal newlines, +but the meaning of C</\Z/> isn't changed by the C</m> modifier. C<\Z> matches +at the end of the string (or just before a trailing newline) regardless of +whether the C</m> modifier is used. + +C<\z> is just like C<\Z>, except that it will not match before a trailing +newline. C<\z> will only match at the end of the string - regardless of the +modifiers used, and not before a newline. + +=item \G + +C<\G> is usually only used in combination with the C</g> modifier.
If the +C</g> modifier is used (and the match is done in scalar context), Perl will +remember where in the source string the last match ended, and the next time, +it will start the match from where it ended the previous time. + +C<\G> matches the point where the previous match ended, or the beginning +of the string if there was no previous match. See also L<perlremodifiers>. + +Mnemonic: I<G>lobal. + +=item \b, \B + +C<\b> matches at any place between a word and a non-word character; C<\B> +matches at any place between characters where C<\b> doesn't match. C<\b> +and C<\B> assume there's a non-word character before the beginning and after +the end of the source string; so C<\b> will match at the beginning (or end) +of the source string if the source string begins (or ends) with a word +character. Otherwise, C<\B> will match. + +Mnemonic: I<b>oundary. + +=back + +=head4 Examples + + "cat" =~ /\Acat/; # Match. + "cat" =~ /cat\Z/; # Match. + "cat\n" =~ /cat\Z/; # Match. + "cat\n" =~ /cat\z/; # No match. + + "cat" =~ /\bcat\b/; # Matches. + "cats" =~ /\bcat\b/; # No match. + "cat" =~ /\bcat\B/; # No match. + "cats" =~ /\bcat\B/; # Match. + + while ("cat dog" =~ /(\w+)/g) { + print $1; # Prints 'catdog' + } + while ("cat dog" =~ /\G(\w+)/g) { + print $1; # Prints 'cat' + } + +=head2 Misc + +Here we document the backslash sequences that don't fall in one of the +categories above. They are: + +=over 4 + +=item \C + +C<\C> always matches a single octet, even if the source string is encoded +in UTF-8 format, and the character to be matched is a multi-octet character. +C<\C> was introduced in perl 5.6. + +Mnemonic: oI<C>tet. + +=item \K + +This is new in perl 5.10. Anything that is matched left of C<\K> is +not included in C<$&> - and will not be replaced if the pattern is +used in a substitution. This will allow you to write C<s/PAT1 \K PAT2/REPL/x> +instead of C<s/(PAT1) PAT2/${1}REPL/x> or C<s/(?<=PAT1) PAT2/REPL/x>. + +Mnemonic: I<K>eep. 
+ +=item \R + +C<\R> matches a I<generic newline>, that is, anything that is considered +a newline by Unicode. This includes all characters matched by C<\v> +(vertical white space), and the multi character sequence C<"\x0D\x0A"> +(carriage return followed by a line feed, aka the network newline, or +the newline used in Windows text files). C<\R> is equivalent to +C<< (?>\x0D\x0A|\v) >>. Since C<\R> can match more than one character, +it cannot be put inside a bracketed character class; C</[\R]/> is an error. +C<\R> was introduced in perl 5.10. + +Mnemonic: none really. C<\R> was picked because PCRE already uses C<\R>. + +=item \X + +This matches an extended Unicode I<combining character sequence>, and +is equivalent to C<< (?>\PM\pM*) >>. C<\PM> matches any character that is +not considered a Unicode mark character, while C<\pM> matches any character +that is considered a Unicode mark character; so C<\X> matches any non +mark character followed by zero or more mark characters. Mark characters +include (but are not restricted to) I<combining characters> and +I<vowel signs>. + +Mnemonic: eI<X>tended Unicode character. + +=back + +=head4 Examples + + "\x{256}" =~ /^\C\C$/; # Match as chr (0x256) takes 2 octets in UTF-8. + + $str =~ s/foo\Kbar/baz/g; # Change any 'bar' following a 'foo' to 'baz'. + $str =~ s/(.)\K\1//g; # Delete duplicated characters. + + "\n" =~ /^\R$/; # Match, \n is a generic newline. + "\r" =~ /^\R$/; # Match, \r is a generic newline. + "\r\n" =~ /^\R$/; # Match, \r\n is a generic newline. + + "P\x{0307}" =~ /^\X$/ # \X matches a P with a dot above. + +=cut diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod new file mode 100644 index 0000000000..afdf11627a --- /dev/null +++ b/pod/perlrecharclass.pod @@ -0,0 +1,525 @@ +=head1 NAME + +perlrecharclass - Perl Regular Expression Character Classes + +=head1 DESCRIPTION + +The top level documentation about Perl regular expressions +is found in L<perlre>.
+ +This manual page discusses the syntax and use of character +classes in Perl Regular Expressions. + +A character class is a way of denoting a set of characters, +in such a way that one character of the set is matched. +It's important to remember that matching a character class +consumes exactly one character in the source string. (The source +string is the string the regular expression is matched against.) + +There are three types of character classes in Perl regular +expressions: the dot, backslashed sequences, and the bracketed form. + +=head2 The dot + +The dot (or period), C<.>, is probably the most used, and certainly +the most well-known character class. By default, a dot matches any +character, except for the newline. The default can be changed to +add matching the newline with the I<single line> modifier: either +for the entire regular expression using the C</s> modifier, or +locally using C<(?s)>. + +Here are some examples: + + "a" =~ /./ # Match + "." =~ /./ # Match + "" =~ /./ # No match (dot has to match a character) + "\n" =~ /./ # No match (dot does not match a newline) + "\n" =~ /./s # Match (global 'single line' modifier) + "\n" =~ /(?s:.)/ # Match (local 'single line' modifier) + "ab" =~ /^.$/ # No match (dot matches one character) + + +=head2 Backslashed sequences + +Perl regular expressions contain many backslashed sequences that +constitute a character class. That is, they will match a single +character, if that character belongs to a specific set of characters +(defined by the sequence). A backslashed sequence is a sequence of +characters starting with a backslash. Not all backslashed sequences +are character classes; for a full list, see L<perlrebackslash>. + +Here's a list of the backslashed sequences, which are discussed in +more detail below. + + \d Match a digit character. + \D Match a non-digit character. + \w Match a "word" character. + \W Match a non-"word" character. + \s Match a white space character.
+ \S Match a non-white space character. + \h Match a horizontal white space character. + \H Match a character that isn't horizontal white space. + \v Match a vertical white space character. + \V Match a character that isn't vertical white space. + \pP, \p{Prop} Match a character matching a Unicode property. + \PP, \P{Prop} Match a character that doesn't match a Unicode property. + +=head3 Digits + +C<\d> matches a single character that is considered to be a I<digit>. +What is considered a digit depends on the internal encoding of +the source string. If the source string is in UTF-8 format, C<\d> +not only matches the digits '0' - '9', but also Arabic and Devanagari digits, +and digits from other languages. Otherwise, if there is a locale in effect, +it will match whatever characters the locale considers digits. Without +a locale, C<\d> matches the digits '0' to '9'. +See L</Locale, Unicode and UTF-8>. + +Any character that isn't matched by C<\d> will be matched by C<\D>. + +=head3 Word characters + +C<\w> matches a single I<word> character: an alphanumeric character +(that is, an alphabetic character, or a digit), or the underscore (C<_>). +What is considered a word character depends on the internal encoding +of the string. If it's in UTF-8 format, C<\w> matches those characters +that are considered word characters in the Unicode database. That is, it +not only matches ASCII letters, but also Thai letters, Greek letters, etc. +If the source string isn't in UTF-8 format, C<\w> matches those characters +that are considered word characters by the current locale. Without +a locale in effect, C<\w> matches the ASCII letters, digits and the +underscore. + +Any character that isn't matched by C<\w> will be matched by C<\W>. + +=head3 White space + +C<\s> matches any single character that is considered white space.
In the +ASCII range, C<\s> matches the horizontal tab (C<\t>), the new line +(C<\n>), the form feed (C<\f>), the carriage return (C<\r>), and the +space (the vertical tab, C<\cK> is not matched by C<\s>). The exact set +of characters matched by C<\s> depends on whether the source string is +in UTF-8 format. If it is, C<\s> matches what is considered white space +in the Unicode database. Otherwise, if there is a locale in effect, C<\s> +matches whatever is considered white space by the current locale. Without +a locale, C<\s> matches the five characters mentioned in the beginning +of this paragraph. Perhaps the most notable difference is that C<\s> +matches a non-breaking space only if the non-breaking space is in a +UTF-8 encoded string. + +Any character that isn't matched by C<\s> will be matched by C<\S>. + +C<\h> will match any character that is considered horizontal white space; +this includes the space and the tab characters. C<\H> will match any character +that is not considered horizontal white space. + +C<\v> will match any character that is considered vertical white space; +this includes the carriage return and line feed characters (newline). +C<\V> will match any character that is not considered vertical white space. + +C<\R> matches anything that can be considered a newline under Unicode +rules. It's not a character class, as it can match a multi-character +sequence. Therefore, it cannot be used inside a bracketed character +class. Details are discussed in L<perlrebackslash>. + +C<\h>, C<\H>, C<\v>, C<\V>, and C<\R> are new in perl 5.10. + +Note that unlike C<\s>, C<\d> and C<\w>, C<\h> and C<\v> always match +the same characters, regardless whether the source string is in UTF-8 +format or not. The set of characters they match is also not influenced +by locale. + +One might think that C<\s> is equivalent with C<[\h\v]>. This is not true. +The vertical tab (C<"\x0b">) is not matched by C<\s>, it is however +considered vertical white space. 
Furthermore, if the source string is +not in UTF-8 format, the next line (C<"\x85">) and the no-break space +(C<"\xA0">) are not matched by C<\s>, but are by C<\v> and C<\h> respectively. +If the source string is in UTF-8 format, both the next line and the +no-break space are matched by C<\s>. + +The following table is a complete listing of characters matched by +C<\s>, C<\h> and C<\v>. + +The first column gives the code point of the character (in hex format), +the second column gives the (Unicode) name. The third column indicates +by which class(es) the character is matched. + + 0x00009 CHARACTER TABULATION h s + 0x0000a LINE FEED (LF) vs + 0x0000b LINE TABULATION v + 0x0000c FORM FEED (FF) vs + 0x0000d CARRIAGE RETURN (CR) vs + 0x00020 SPACE h s + 0x00085 NEXT LINE (NEL) vs [1] + 0x000a0 NO-BREAK SPACE h s [1] + 0x01680 OGHAM SPACE MARK h s + 0x0180e MONGOLIAN VOWEL SEPARATOR h s + 0x02000 EN QUAD h s + 0x02001 EM QUAD h s + 0x02002 EN SPACE h s + 0x02003 EM SPACE h s + 0x02004 THREE-PER-EM SPACE h s + 0x02005 FOUR-PER-EM SPACE h s + 0x02006 SIX-PER-EM SPACE h s + 0x02007 FIGURE SPACE h s + 0x02008 PUNCTUATION SPACE h s + 0x02009 THIN SPACE h s + 0x0200a HAIR SPACE h s + 0x02028 LINE SEPARATOR vs + 0x02029 PARAGRAPH SEPARATOR vs + 0x0202f NARROW NO-BREAK SPACE h s + 0x0205f MEDIUM MATHEMATICAL SPACE h s + 0x03000 IDEOGRAPHIC SPACE h s + +=over 4 + +=item [1] + +NEXT LINE and NO-BREAK SPACE are only matched by C<\s> if the source string +is in UTF-8 format. + +=back + +It is worth noting that C<\d>, C<\w>, etc., match single characters, not +complete numbers or words. To match a number (that consists of digits), +use C<\d+>; to match a word, use C<\w+>. + + +=head3 Unicode Properties + +C<\pP> and C<\p{Prop}> are character classes to match characters that +fit given Unicode classes. One letter classes can be used in the C<\pP> +form, with the class name following the C<\p>; otherwise, the property +name is enclosed in braces, and follows the C<\p>.
For instance, a +match for a number can be written as C</\pN/> or as C</\p{Number}/>. +Lowercase letters are matched by the property I<LowercaseLetter> which +has the short form I<Ll>. They have to be written as C</\p{Ll}/> or +C</\p{LowercaseLetter}/>. C</\pLl/> is valid, but means something different. +It matches a two character string: a letter (Unicode property C<\pL>), +followed by a lowercase C<l>. + +For a list of possible properties, see +L<perlunicode/Unicode Character Properties>. It is also possible to +define your own properties. This is discussed in +L<perlunicode/User-Defined Character Properties>. + + +=head4 Examples + + "a" =~ /\w/ # Match, "a" is a 'word' character. + "7" =~ /\w/ # Match, "7" is a 'word' character as well. + "a" =~ /\d/ # No match, "a" isn't a digit. + "7" =~ /\d/ # Match, "7" is a digit. + " " =~ /\s/ # Match, a space is white space. + "a" =~ /\D/ # Match, "a" is a non-digit. + "7" =~ /\D/ # No match, "7" is not a non-digit. + " " =~ /\S/ # No match, a space is not non-white space. + + " " =~ /\h/ # Match, space is horizontal white space. + " " =~ /\v/ # No match, space is not vertical white space. + "\r" =~ /\v/ # Match, a return is vertical white space. + + "a" =~ /\pL/ # Match, "a" is a letter. + "a" =~ /\p{Lu}/ # No match, /\p{Lu}/ matches upper case letters. + + "\x{0e0b}" =~ /\p{Thai}/ # Match, \x{0e0b} is the character + # 'THAI CHARACTER SO SO', and that's in + # the Thai Unicode class. + "a" =~ /\P{Lao}/ # Match, as "a" is not a Laotian character. + + +=head2 Bracketed Character Classes + +The third form of character class you can use in Perl regular expressions +is the bracketed form. In its simplest form, it lists the characters +that may be matched inside square brackets, like this: C<[aeiou]>. +This matches one of C<a>, C<e>, C<i>, C<o> or C<u>. Just as with the other +character classes, exactly one character will be matched.
To match +a longer string consisting of characters mentioned in the character +class, follow the character class with a quantifier. For instance, +C<[aeiou]+> matches a string of one or more lowercase ASCII vowels. + +Repeating a character in a character class has no +effect; it's considered to be in the set only once. + +Examples: + + "e" =~ /[aeiou]/ # Match, as "e" is listed in the class. + "p" =~ /[aeiou]/ # No match, "p" is not listed in the class. + "ae" =~ /^[aeiou]$/ # No match, a character class only matches + # a single character. + "ae" =~ /^[aeiou]+$/ # Match, due to the quantifier. + +=head3 Special Characters Inside a Bracketed Character Class + +Most characters that are meta characters in regular expressions (that +is, characters that carry a special meaning like C<*> or C<(>) lose +their special meaning and can be used inside a character class without +the need to escape them. For instance, C<[()]> matches either an opening +parenthesis, or a closing parenthesis, and the parens inside the character +class don't group or capture. + +Characters that may carry a special meaning inside a character class are: +C<\>, C<^>, C<->, C<[> and C<]>, and are discussed below. They can be +escaped with a backslash, although this is sometimes not needed, in which +case the backslash may be omitted. + +The sequence C<\b> is special inside a bracketed character class. While +outside the character class C<\b> is an assertion indicating a point +that does not have either two word characters or two non-word characters +on either side, inside a bracketed character class, C<\b> matches a +backspace character. + +A C<[> is not special inside a character class, unless it's the start +of a POSIX character class (see below). It normally does not need escaping. + +A C<]> is either the end of a POSIX character class (see below), or it +signals the end of the bracketed character class. Normally it needs +escaping if you want to include a C<]> in the set of characters.
+However, if the C<]> is the I<first> (or the second if the first +character is a caret) character of a bracketed character class, it +does not denote the end of the class (as you cannot have an empty class) +and is considered part of the set of characters that can be matched without +escaping. + +Examples: + + "+" =~ /[+?*]/ # Match, "+" in a character class is not special. + "\cH" =~ /[\b]/ # Match, \b inside a character class + # is equivalent to a backspace. + "]" =~ /[][]/ # Match, as the character class contains + # both [ and ]. + "[]" =~ /[[]]/ # Match, the pattern contains a character class + # containing just [, and the character class is + # followed by a ]. + +=head3 Character Ranges + +It is not uncommon to want to match a range of characters. Luckily, instead +of listing all the characters in the range, one may use the hyphen (C<->). +If inside a bracketed character class you have two characters separated +by a hyphen, it's treated as if all the characters between the two are in +the class. For instance, C<[0-9]> matches any ASCII digit, and C<[a-m]> +matches any lowercase letter from the first half of the ASCII alphabet. + +Note that the two characters on either side of the hyphen are not +necessarily both letters or both digits. Any character is possible, +although not advisable. C<['-?]> contains a range of characters, but +most people will not know which characters that will be. Furthermore, +such ranges may lead to portability problems if the code has to run on +a platform that uses a different character set, such as EBCDIC. + +If a hyphen in a character class cannot be part of a range, for instance +because it is the first or the last character of the character class, +or if it immediately follows a range, the hyphen isn't special, and will be +considered a character that may be matched. 
You have to escape the hyphen +with a backslash if you want to have a hyphen in your set of characters to +be matched, and its position in the class is such that it can be considered +part of a range. + +Examples: + + [a-z] # Matches a character that is a lower case ASCII letter. + [a-fz] # Matches any letter between 'a' and 'f' (inclusive) or the + # letter 'z'. + [-z] # Matches either a hyphen ('-') or the letter 'z'. + [a-f-m] # Matches any letter between 'a' and 'f' (inclusive), the + # hyphen ('-'), or the letter 'm'. + ['-?] # Matches any of the characters '()*+,-./0123456789:;<=>? + # (But not on an EBCDIC platform). + + +=head3 Negation + +It is also possible to instead list the characters you do not want to +match. You can do so by using a caret (C<^>) as the first character in the +character class. For instance, C<[^a-z]> matches a character that is not a +lowercase ASCII letter. + +This syntax makes the caret a special character inside a bracketed character +class, but only if it is the first character of the class. So if you want +to have the caret as one of the characters you want to match, you either +have to escape the caret, or not list it first. + +Examples: + + "e" =~ /[^aeiou]/ # No match, the 'e' is listed. + "x" =~ /[^aeiou]/ # Match, as 'x' isn't a lowercase vowel. + "^" =~ /[^^]/ # No match, matches anything that isn't a caret. + "^" =~ /[x^]/ # Match, caret is not special here. + +=head3 Backslash Sequences + +You can put a backslash sequence character class inside a bracketed character +class, and it will act just as if you put all the characters matched by +the backslash sequence inside the character class. For instance, +C<[a-f\d]> will match any digit, or any of the lowercase letters between +'a' and 'f' inclusive. + +Examples: + + /[\p{Thai}\d]/ # Matches a character that is either a Thai + # character, or a digit. + /[^\p{Arabic}()]/ # Matches a character that is neither an Arabic + # character, nor a parenthesis. 
+ +Backslash sequence character classes cannot form one of the endpoints +of a range. + +=head3 Posix Character Classes + +Posix character classes have the form C<[:class:]>, where I<class> is the +name, and C<[:> and C<:]> are the delimiters. Posix character classes appear +I<inside> bracketed character classes, and are a convenient and descriptive +way of listing a group of characters. Be careful about the syntax, + + # Correct: + $string =~ /[[:alpha:]]/ + + # Incorrect (will warn): + $string =~ /[:alpha:]/ + +The latter pattern would be a character class consisting of a colon, +and the letters C<a>, C<l>, C<p> and C<h>. + +Perl recognizes the following POSIX character classes: + + alpha Any alphabetical character. + alnum Any alphanumerical character. + ascii Any ASCII character. + blank A GNU extension, equal to a space or a horizontal tab (C<\t>). + cntrl Any control character. + digit Any digit, equivalent to C<\d>. + graph Any printable character, excluding a space. + lower Any lowercase character. + print Any printable character, including a space. + punct Any punctuation character. + space Any white space character. C<\s> plus the vertical tab (C<\cK>). + upper Any uppercase character. + word Any "word" character, equivalent to C<\w>. + xdigit Any hexadecimal digit, '0' - '9', 'a' - 'f', 'A' - 'F'. + +The exact set of characters matched depends on whether the source string +is internally in UTF-8 format or not. See L</Locale, Unicode and UTF-8>. + +Most POSIX character classes have C<\p> counterparts. The difference +is that the C<\p> classes will always match according to the Unicode +properties, regardless of whether the string is in UTF-8 format or not. 
+ +The following table shows the relation between POSIX character classes +and the Unicode properties: + + [[:...:]] \p{...} backslash + + alpha IsAlpha + alnum IsAlnum + ascii IsASCII + blank + cntrl IsCntrl + digit IsDigit \d + graph IsGraph + lower IsLower + print IsPrint + punct IsPunct + space IsSpace + IsSpacePerl \s + upper IsUpper + word IsWord + xdigit IsXDigit + +Some character classes may have a non-obvious name: + +=over 4 + +=item cntrl + +Any control character. Usually, control characters don't produce output +as such, but instead control the terminal somehow: for example newline +and backspace are control characters. All characters with C<ord()> less +than 32 are usually classified as control characters (in ASCII, the ISO +Latin character sets, and Unicode), as is the character with the C<ord()> value +of 127 (C<DEL>). + +=item graph + +Any character that is I<graphical>, that is, visible. This class consists +of all the alphanumerical characters and all punctuation characters. + +=item print + +All printable characters, which is the set of all the graphical characters +plus the space. + +=item punct + +Any punctuation (special) character. + +=back + +=head4 Negation + +A Perl extension to the POSIX character class is the ability to +negate it. This is done by prefixing the class name with a caret (C<^>). +Some examples: + + POSIX Unicode Backslash + [[:^digit:]] \P{IsDigit} \D + [[:^space:]] \P{IsSpace} \S + [[:^word:]] \P{IsWord} \W + +=head4 [= =] and [. .] + +Perl will recognize the POSIX character classes C<[=class=]>, and +C<[.class.]>, but does not (yet?) support these constructs. Use of +such a construct will lead to an error. + + +=head4 Examples + + /[[:digit:]]/ # Matches a character that is a digit. + /[01[:lower:]]/ # Matches a character that is either a + # lowercase letter, or '0' or '1'. + /[[:digit:][:^xdigit:]]/ # Matches a character that can be anything, + # but the letters 'a' to 'f' in either case. 
+ # This is because the character class contains + # all digits, and anything that isn't a + # hex digit, resulting in a class containing + # all characters, but the letters 'a' to 'f' + # and 'A' to 'F'. + + +=head2 Locale, Unicode and UTF-8 + +Some of the character classes have a somewhat different behaviour depending +on the internal encoding of the source string, and the locale that is +in effect. + +C<\w>, C<\d>, C<\s> and the POSIX character classes (and their negations, +including C<\W>, C<\D>, C<\S>) suffer from this behaviour. + +The rule is that if the source string is in UTF-8 format, the character +classes match according to the Unicode properties. If the source string +isn't, then the character classes match according to whatever locale is +in effect. If there is no locale, they match the ASCII defaults +(52 letters, 10 digits and underscore for C<\w>, 0 to 9 for C<\d>, etc). + +This usually means that if you are matching against characters whose C<ord()> +values are between 128 and 255 inclusive, your character class may match +or not depending on the current locale, and whether the source string is +in UTF-8 format. The string will be in UTF-8 format if it contains +characters whose C<ord()> value exceeds 255. But a string may be in UTF-8 +format without it having such characters. + +For portability reasons, it may be better to not use C<\w>, C<\d>, C<\s> +or the POSIX character classes, and use the Unicode properties instead. + +=head4 Examples + + $str = "\xDF"; # $str is not in UTF-8 format. + $str =~ /^\w/; # No match, as $str isn't in UTF-8 format. + $str .= "\x{0e0b}"; # Now $str is in UTF-8 format. + $str =~ /^\w/; # Match! $str is now in UTF-8 format. + chop $str; + $str =~ /^\w/; # Still a match! $str remains in UTF-8 format. 
+ +=cut diff --git a/vms/descrip_mms.template b/vms/descrip_mms.template index 166c9b5057..d6c740ed62 100644 --- a/vms/descrip_mms.template +++ b/vms/descrip_mms.template @@ -408,14 +408,15 @@ pod17 = [.lib.pods]perlmodinstall.pod [.lib.pods]perlmodlib.pod [.lib.pods]perlm pod18 = [.lib.pods]perlnewmod.pod [.lib.pods]perlnumber.pod [.lib.pods]perlobj.pod [.lib.pods]perlop.pod [.lib.pods]perlopenbsd.pod pod19 = [.lib.pods]perlopentut.pod [.lib.pods]perlos2.pod [.lib.pods]perlos390.pod [.lib.pods]perlos400.pod [.lib.pods]perlothrtut.pod pod20 = [.lib.pods]perlpacktut.pod [.lib.pods]perlplan9.pod [.lib.pods]perlpod.pod [.lib.pods]perlpodspec.pod [.lib.pods]perlport.pod -pod21 = [.lib.pods]perlpragma.pod [.lib.pods]perlqnx.pod [.lib.pods]perlre.pod [.lib.pods]perlreapi.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod -pod22 = [.lib.pods]perlreguts.pod [.lib.pods]perlrequick.pod [.lib.pods]perlreref.pod [.lib.pods]perlretut.pod [.lib.pods]perlriscos.pod -pod23 = [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod [.lib.pods]perlsolaris.pod [.lib.pods]perlstyle.pod [.lib.pods]perlsub.pod [.lib.pods]perlsymbian.pod -pod24 = [.lib.pods]perlsyn.pod [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod [.lib.pods]perltooc.pod -pod25 = [.lib.pods]perltoot.pod [.lib.pods]perltrap.pod [.lib.pods]perltru64.pod [.lib.pods]perltw.pod [.lib.pods]perlunicode.pod [.lib.pods]perlunifaq.pod -pod26 = [.lib.pods]perluniintro.pod [.lib.pods]perlunitut.pod [.lib.pods]perlutil.pod [.lib.pods]perluts.pod [.lib.pods]perlvar.pod [.lib.pods]perlvmesa.pod -pod27 = [.lib.pods]perlvms.pod [.lib.pods]perlvos.pod [.lib.pods]perlwin32.pod [.lib.pods]perlxs.pod [.lib.pods]perlxstut.pod -pod = $(pod0) $(pod1) $(pod2) $(pod3) $(pod4) $(pod5) $(pod6) $(pod7) $(pod8) $(pod9) $(pod10) $(pod11) $(pod12) $(pod13) $(pod14) $(pod15) $(pod16) $(pod17) $(pod18) $(pod19) $(pod20) $(pod21) $(pod22) $(pod23) $(pod24) $(pod25) $(pod26) $(pod27) +pod21 = 
[.lib.pods]perlpragma.pod [.lib.pods]perlqnx.pod [.lib.pods]perlre.pod [.lib.pods]perlreapi.pod [.lib.pods]perlrebackslash.pod +pod22 = [.lib.pods]perlrecharclass.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlreguts.pod [.lib.pods]perlrequick.pod +pod23 = [.lib.pods]perlreref.pod [.lib.pods]perlretut.pod [.lib.pods]perlriscos.pod [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod [.lib.pods]perlsolaris.pod +pod24 = [.lib.pods]perlstyle.pod [.lib.pods]perlsub.pod [.lib.pods]perlsymbian.pod [.lib.pods]perlsyn.pod [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod +pod25 = [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod [.lib.pods]perltooc.pod [.lib.pods]perltoot.pod [.lib.pods]perltrap.pod [.lib.pods]perltru64.pod +pod26 = [.lib.pods]perltw.pod [.lib.pods]perlunicode.pod [.lib.pods]perlunifaq.pod [.lib.pods]perluniintro.pod [.lib.pods]perlunitut.pod +pod27 = [.lib.pods]perlutil.pod [.lib.pods]perluts.pod [.lib.pods]perlvar.pod [.lib.pods]perlvmesa.pod [.lib.pods]perlvms.pod [.lib.pods]perlvos.pod +pod28 = [.lib.pods]perlwin32.pod [.lib.pods]perlxs.pod [.lib.pods]perlxstut.pod +pod = $(pod0) $(pod1) $(pod2) $(pod3) $(pod4) $(pod5) $(pod6) $(pod7) $(pod8) $(pod9) $(pod10) $(pod11) $(pod12) $(pod13) $(pod14) $(pod15) $(pod16) $(pod17) $(pod18) $(pod19) $(pod20) $(pod21) $(pod22) $(pod23) $(pod24) $(pod25) $(pod26) $(pod27) $(pod28) # Would be useful to automate the generation of this rule from pod/buildtoc # Plus its corresponding delete in the clean target. 
@@ -1185,6 +1186,14 @@ makeppport : $(MINIPERL_EXE) $(ARCHDIR)Config.pm @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] +[.lib.pods]perlrebackslash.pod : [.pod]perlrebackslash.pod + @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] + Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] + +[.lib.pods]perlrecharclass.pod : [.pod]perlrecharclass.pod + @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] + Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] + [.lib.pods]perlref.pod : [.pod]perlref.pod @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] diff --git a/win32/pod.mak b/win32/pod.mak index 1a5da4b5d8..3bcc27711c 100644 --- a/win32/pod.mak +++ b/win32/pod.mak @@ -103,6 +103,8 @@ POD = \ perlpragma.pod \ perlre.pod \ perlreapi.pod \ + perlrebackslash.pod \ + perlrecharclass.pod \ perlref.pod \ perlreftut.pod \ perlreguts.pod \ @@ -219,6 +221,8 @@ MAN = \ perlpragma.man \ perlre.man \ perlreapi.man \ + perlrebackslash.man \ + perlrecharclass.man \ perlref.man \ perlreftut.man \ perlreguts.man \ @@ -335,6 +339,8 @@ HTML = \ perlpragma.html \ perlre.html \ perlreapi.html \ + perlrebackslash.html \ + perlrecharclass.html \ perlref.html \ perlreftut.html \ perlreguts.html \ @@ -451,6 +457,8 @@ TEX = \ perlpragma.tex \ perlre.tex \ perlreapi.tex \ + perlrebackslash.tex \ + perlrecharclass.tex \ perlref.tex \ perlreftut.tex \ perlreguts.tex \ |