diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2009-09-17 00:40:49 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2009-09-17 00:40:49 +0000 |
commit | 63c7ca40d8320a5b4ecb373de69d65db49bc1122 (patch) | |
tree | c4fb9f4b08172b674685029da8b802d19d31d10e /doc | |
parent | 6023a79deb6a9833542ee1a15666a0589b05fa1c (diff) | |
download | ruby-63c7ca40d8320a5b4ecb373de69d65db49bc1122.tar.gz |
* doc/re.rb: New document for Ruby's fork of Oniguruma.
written by Run Paint Run Run [ruby-core:25420]
* re.c: import document in doc/re.rb.
* .document: add doc/re.rb.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24973 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'doc')
-rw-r--r-- | doc/re.rb | 584 |
1 files changed, 584 insertions, 0 deletions
diff --git a/doc/re.rb b/doc/re.rb new file mode 100644 index 0000000000..6eabbb5c5a --- /dev/null +++ b/doc/re.rb @@ -0,0 +1,584 @@ +module Doc +# Regular expressions (<i>regexp</i>s) are patterns which describe the +# contents of a string. They're used for testing whether a string contains a +# given pattern, or extracting the portions that match. They are created +# with the <tt>/</tt><i>pat</i><tt>/</tt> and +# <tt>%r{</tt><i>pat</i><tt>}</tt> literals or the <tt>Regexp.new</tt> +# constructor. +# +# A regexp is usually delimited with forward slashes (<tt>/</tt>). For +# example: +# +# /hay/ =~ 'haystack' #=> 0 +# /y/.match('haystack') #=> #<MatchData "y"> +# +# If a string contains the pattern it is said to <i>match</i>. A literal +# string matches itself. +# +# # 'haystack' does not contain the pattern 'needle', so doesn't match. +# /needle/.match('haystack') #=> nil +# # 'haystack' does contain the pattern 'hay', so it matches +# /hay/.match('haystack') #=> #<MatchData "hay"> +# +# Specifically, <tt>/st/</tt> requires that the string contains the letter +# _s_ followed by the letter _t_, so it matches _haystack_, also. +# +# == Metacharacters and Escapes +# +# The following are <i>metacharacters</i> <tt>(</tt>, <tt>)</tt>, +# <tt>[</tt>, <tt>]</tt>, <tt>{</tt>, <tt>}</tt>, <tt>.</tt>, <tt>?</tt>, +# <tt>+</tt>, <tt>*</tt>. They have a specific meaning when appearing in a +# pattern. To match them literally they must be backslash-escaped. To match +# a backslash literally backslash-escape that: <tt>\\\\\\</tt>. +# +# /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> #<MatchData "1 + 2 = 3?"> +# +# Patterns behave like double-quoted strings so can contain the same +# backslash escapes. +# +# /\s\u{6771 4eac 90fd}/.match("Go to 東京都") +# #=> #<MatchData " 東京都"> +# +# Arbitrary Ruby expressions can be embedded into patterns with the +# <tt>#{...}</tt> construct. +# +# place = "東京都" +# /#{place}/.match("Go to 東京都") +# #=> #<MatchData "東京都"> +# +# == Character Classes +# +# A <i>character class</i> is delimited with square brackets (<tt>[</tt>, +# <tt>]</tt>) and lists characters that may appear at that point in the +# match. <tt>/[ab]/</tt> means _a_ or _b_, as opposed to <tt>/ab/</tt> which +# means _a_ followed by _b_. +# +# /W[aeiou]rd/.match("Word") #=> #<MatchData "Word"> +# +# Within a character class the hyphen (<tt>-</tt>) is a metacharacter +# denoting an inclusive range of characters. <tt>[abcd]</tt> is equivalent +# to <tt>[a-d]</tt>. A range can be followed by another range, so +# <tt>[abcdwxyz]</tt> is equivalent to <tt>[a-dw-z]</tt>. The order in which +# ranges or individual characters appear inside a character class is +# irrelevant. +# +# /[0-9a-f]/.match('9f') #=> #<MatchData "9"> +# /[9f]/.match('9f') #=> #<MatchData "9"> +# +# If the first character of a character class is a caret (<tt>^</tt>) the +# class is inverted: it matches any character _except_ those named. +# +# /[^a-eg-z]/.match('f') #=> #<MatchData "f"> +# +# A character class may contain another character class. By itself this +# isn't useful because <tt>[a-z[0-9]]</tt> describes the same set as +# <tt>[a-z0-9]</tt>. However, character classes also support the <tt>&&</tt> +# operator which performs set intersection on its arguments. The two can be +# combined as follows: +# +# /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z)) +# # This is equivalent to: +# /[abh-w]/ +# +# The following metacharacters also behave like character classes: +# +# * <tt>/./</tt> - Any character except a newline. +# * <tt>/./m</tt> - Any character (the +m+ modifier enables multiline mode) +# * <tt>/\w/</tt> - A word character (<tt>[a-zA-Z0-9_]</tt>) +# * <tt>/\W/</tt> - A non-word character (<tt>[^a-zA-Z0-9_]</tt>) +# * <tt>/\d/</tt> - A digit character (<tt>[0-9]</tt>) +# * <tt>/\D/</tt> - A non-digit character (<tt>[^0-9]</tt>) +# * <tt>/\h/</tt> - A hexdigit character (<tt>[0-9a-fA-F]</tt>) +# * <tt>/\H/</tt> - A non-hexdigit character (<tt>[^0-9a-fA-F]</tt>) +# * <tt>/\s/</tt> - A whitespace character: <tt>/[ \t\r\n\f]/</tt> +# * <tt>/\S/</tt> - A non-whitespace character: <tt>/[^ \t\r\n\f]/</tt> +# +# POSIX <i>bracket expressions</i> are also similar to character classes. +# They provide a portable alternative to the above, with the added benefit +# that they encompass non-ASCII characters. For instance, <tt>/\d/</tt> +# matches only the ASCII decimal digits (0-9); whereas <tt>/[[:digit:]]/</tt> +# matches any character in the Unicode _Nd_ category. +# +# * <tt>/[[:alnum:]]/</tt> - Alphabetic and numeric character +# * <tt>/[[:alpha:]]/</tt> - Alphabetic character +# * <tt>/[[:blank:]]/</tt> - Space or tab +# * <tt>/[[:cntrl:]]/</tt> - Control character +# * <tt>/[[:digit:]]/</tt> - Digit +# * <tt>/[[:graph:]]/</tt> - Non-blank character (excludes spaces, control +# characters, and similar) +# * <tt>/[[:lower:]]/</tt> - Lowercase alphabetical character +# * <tt>/[[:print:]]/</tt> - Like [:graph:], but includes the space character +# * <tt>/[[:punct:]]/</tt> - Punctuation character +# * <tt>/[[:space:]]/</tt> - Whitespace character (<tt>[:blank:]</tt>, newline, +# carriage return, etc.) +# * <tt>/[[:upper:]]/</tt> - Uppercase alphabetical +# * <tt>/[[:xdigit:]]/</tt> - Digit allowed in a hexadecimal number (i.e., +# 0-9a-fA-F) +# +# Ruby also supports the following non-POSIX character classes: +# +# * <tt>/[[:word:]]/</tt> - A character in one of the following Unicode +# general categories _Letter_, _Mark_, _Number_, +# <i>Connector_Punctuation<i/i> +# * <tt>/[[:ascii:]]/</tt> - A character in the ASCII character set +# +# # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO" +# /[[:digit:]]/.match("\u06F2") #=> #<MatchData "\u{06F2}"> +# /[[:upper:]][[:lower:]]/.match("Hello") #=> #<MatchData "He"> +# /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> #<MatchData "A6"> +# +# == Repetition +# +# The constructs described so far match a single character. They can be +# followed by a repetition metacharacter to specify how many times they need +# to occur. Such metacharacters are called <i>quantifiers</i>. +# +# * <tt>*</tt> - Zero or more times +# * <tt>+</tt> - One or more times +# * <tt>?</tt> - Zero or one times (optional) +# * <tt>{</tt><i>n</i><tt>}</tt> - Exactly <i>n</i> times +# * <tt>{</tt><i>n</i><tt>,}</tt> - <i>n</i> or more times +# * <tt>{,</tt><i>m</i><tt>}</tt> - <i>m</i> or less times +# * <tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt> - At least <i>n</i> and +# at most <i>m</i> times +# +# # At least one uppercase character ('H'), at least one lowercase +# # character ('e'), two 'l' characters, then one 'o' +# "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> #<MatchData "Hello"> +# +# Repetition is <i>greedy</i> by default: as many occurrences as possible +# are matched while still allowing the overall match to succeed. By +# contrast, <i>lazy</i> matching makes the minimal amount of matches +# necessary for overall success. A greedy metacharacter can be made lazy by +# following it with <tt>?</tt>. +# +# # Both patterns below match the string. The fist uses a greedy +# # quantifier so '.+' matches '<a><b>'; the second uses a lazy +# # quantifier so '.+?' matches '<a>'. +# /<.+>/.match("<a><b>") #=> #<MatchData "<a><b>"> +# /<.+?>/.match("<a><b>") #=> #<MatchData "<a>"> +# +# A quantifier followed by <tt>+</tt> matches <i>possessively</i>: once it +# has matched it does not backtrack. They behave like greedy quantifiers, +# but having matched they refuse to "give up" their match even if this +# jeopardises the overall match. +# +# == Capturing +# +# Parentheses can be used for <i>capturing</i>. The text enclosed by the +# <i>n</i><sup>th</sup> group of parentheses can be subsequently referred to +# with <i>n</i>. Within a pattern use the <i>backreference</i> +# <tt>\</tt><i>n</i>; outside of the pattern use +# <tt>MatchData[</tt><i>n</i><tt>]</tt>. +# +# # 'at' is captured by the first group of parentheses, then referred to +# # later with \1 +# /[csh](..) [csh]\1 in/.match("The cat sat in the hat") +# #=> #<MatchData "cat sat in" 1:"at"> +# # Regexp#match returns a MatchData object which makes the captured +# # text available with its #[] method. +# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at' +# +# Capture groups can be referred to by name when defined with the +# <tt>(?<</tt><i>name</i><tt>>)</tt> or <tt>(?'</tt><i>name</i><tt>')</tt> +# constructs. +# +# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67") +# => #<MatchData "$3.67" dollars:"3" cents:"67"> +# /\$(?<dollars>\d+)\.(?<cents>\d+)/.match("$3.67")[:dollars] #=> "3" +# +# Named groups can be backreferenced with <tt>\k<</tt><i>name</i><tt>></tt>, +# where _name_ is the group name. +# +# /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy') +# #=> #<MatchData "ototo" vowel:"o"> +# +# *Note*: A regexp can't use named backreferences and numbered +# backreferences simultaneously. +# +# When named capture groups are used with a literal regexp on the left-hand +# side of an expression and the <tt>=~</tt> operator, the captured text is +# also assigned to local variables with corresponding names. +# +# /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ "$3.67" #=> 0 +# dollars #=> "3" +# +# == Grouping +# +# Parentheses also <i>group</i> the terms they enclose, allowing them to be +# quantified as one <i>atomic</i> whole. +# +# # The pattern below matches a vowel followed by 2 word characters: +# # 'aen' +# /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> #<MatchData "aen"> +# # Whereas the following pattern matches a vowel followed by a word +# # character, twice, i.e. <tt>[aeiou]\w[aeiou]\w</tt>: 'enor'. +# /([aeiou]\w){2}/.match("Caenorhabditis elegans") +# #=> #<MatchData "enor" 1:"or"> +# +# The <tt>(?:</tt>...<tt>)</tt> construct provides grouping without +# capturing. That is, it combines the terms it contains into an atomic whole +# without creating a backreference. This benefits performance at the slight +# expense of readabilty. +# +# # The group of parentheses captures 'n' and the second 'ti'. The +# # second group is referred to later with the backreference \2 +# /I(n)ves(ti)ga\2ons/.match("Investigations") +# #=> #<MatchData "Investigations" 1:"n" 2:"ti"> +# # The first group of parentheses is now made non-capturing with '?:', +# # so it still matches 'n', but doesn't create the backreference. Thus, +# # the backreference \1 now refers to 'ti'. +# /I(?:n)ves(ti)ga\1ons/.match("Investigations") +# #=> #<MatchData "Investigations" 1:"ti"> +# +# === Atomic Grouping +# +# Grouping can be made <i>atomic</i> with +# <tt>(?></tt><i>pat</i><tt>)</tt>. This causes the subexpression <i>pat</i> +# to be matched independently of the rest of the expression such that what +# it matches becomes fixed for the remainder of the match, unless the entire +# subexpression must be abandoned and subsequently revisited. In this +# way <i>pat</i> is treated as a non-divisible whole. Atomic grouping is +# typically used to optimise patterns so as to prevent the regular +# expression engine from backtracking needlesly. +# +# # The <tt>"</tt> in the pattern below matches the first character of +# # the string, then <tt>.*</tt> matches <i>Quote"</i>. This causes the +# # overall match to fail, so the text matched by <tt>.*</tt> is +# # backtracked by one position, which leaves the final character of the +# # string available to match <tt>"</tt> +# /".*"/.match('"Quote"') #=> #<MatchData "\"Quote\""> +# # If <tt>.*</tt> is grouped atomically, it refuses to backtrack +# # <i>Quote"</i>, even though this means that the overall match fails +# /"(?>.*)"/.match('"Quote"') #=> nil +# +# == Subexpression Calls +# +# The <tt>\g<</tt><i>name</i><tt>></tt> syntax matches the previous +# subexpression named _name_, which can be a group name or number, again. +# This differs from backreferences in that it re-executes the group rather +# than simply trying to re-match the same text. +# +# # Matches a <i>(</i> character and assigns it to the <tt>paren</tt> +# # group, tries to call that the <tt>paren</tt> sub-expression again +# # but fails, then matches a literal <i>)</i>. +# /\A(?<paren>\(\g<paren>*\))*\z/ =~ '()' +# +# +# /\A(?<paren>\(\g<paren>*\))*\z/ =~ '(())' #=> 0 +# # ^1 +# # ^2 +# # ^3 +# # ^4 +# # ^5 +# # ^6 +# # ^7 +# # ^8 +# # ^9 +# # ^10 +# +# 1. Matches at the beginning of the string, i.e. before the first +# character. +# 2. Enters a named capture group called <tt>paren</tt> +# 3. Matches a literal <i>(</i>, the first character in the string +# 4. Calls the <tt>paren</tt> group again, i.e. recurses back to the +# second step +# 5. Re-enters the <tt>paren</tt> group +# 6. Matches a literal <i>(</i>, the second character in the +# string +# 7. Try to call <tt>paren</tt> a third time, but fail because +# doing so would prevent an overall successful match +# 8. Match a literal <i>)</i>, the third character in the string. +# Marks the end of the second recursive call +# 9. Match a literal <i>)</i>, the fourth character in the string +# 10. Match the end of the string +# +# == Alternation +# +# The vertical bar metacharacter (<tt>|</tt>) combines two expressions into +# a single one that matches either of the expressions. Each expression is an +# <i>alternative</i>. +# +# /\w(and|or)\w/.match("Feliformia") #=> #<MatchData "form" 1:"or"> +# /\w(and|or)\w/.match("furandi") #=> #<MatchData "randi" 1:"and"> +# /\w(and|or)\w/.match("dissemblance") #=> nil +# +# == Character Properties +# +# The <tt>\p{}</tt> construct matches characters with the named property, +# much like POSIX bracket classes. +# +# * <tt>/\p{Alnum}/</tt> - Alphabetic and numeric character +# * <tt>/\p{Alpha}/</tt> - Alphabetic character +# * <tt>/\p{Blank}/</tt> - Space or tab +# * <tt>/\p{Cntrl}/</tt> - Control character +# * <tt>/\p{Digit}/</tt> - Digit +# * <tt>/\p{Graph}/</tt> - Non-blank character (excludes spaces, control +# characters, and similar) +# * <tt>/\p{Lower}/</tt> - Lowercase alphabetical character +# * <tt>/\p{Print}/</tt> - Like <tt>\p{Graph}</tt>, but includes the space character +# * <tt>/\p{Punct}/</tt> - Punctuation character +# * <tt>/\p{Space}/</tt> - Whitespace character (<tt>[:blank:]</tt>, newline, +# carriage return, etc.) +# * <tt>/\p{Upper}/</tt> - Uppercase alphabetical +# * <tt>/\p{XDigit}/</tt> - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F) +# * <tt>/\p{Word}/</tt> - A member of one of the following Unicode general +# category <i>Letter</i>, <i>Mark</i>, <i>Number</i>, +# <i>Connector\_Punctuation</i> +# * <tt>/\p{ASCII}/</tt> - A character in the ASCII character set +# * <tt>/\p{Any}/</tt> - Any Unicode character (including unassigned +# characters) +# * <tt>/\p{Assigned}/</tt> - An assigned character +# +# A Unicode character's <i>General Category</i> value can also be matched +# with <tt>\p{</tt><i>Ab</i><tt>}</tt> where <i>Ab</i> is the category's +# abbreviation as described below: +# +# * <tt>/\p{L}/</tt> - 'Letter' +# * <tt>/\p{Ll}/</tt> - 'Letter: Lowercase' +# * <tt>/\p{Lm}/</tt> - 'Letter: Mark' +# * <tt>/\p{Lo}/</tt> - 'Letter: Other' +# * <tt>/\p{Lt}/</tt> - 'Letter: Titlecase' +# * <tt>/\p{Lu}/</tt> - 'Letter: Uppercase +# * <tt>/\p{Lo}/</tt> - 'Letter: Other' +# * <tt>/\p{M}/</tt> - 'Mark' +# * <tt>/\p{Mn}/</tt> - 'Mark: Nonspacing' +# * <tt>/\p{Mc}/</tt> - 'Mark: Spacing Combining' +# * <tt>/\p{Me}/</tt> - 'Mark: Enclosing' +# * <tt>/\p{N}/</tt> - 'Number' +# * <tt>/\p{Nd}/</tt> - 'Number: Decimal Digit' +# * <tt>/\p{Nl}/</tt> - 'Number: Letter' +# * <tt>/\p{No}/</tt> - 'Number: Other' +# * <tt>/\p{P}/</tt> - 'Punctuation' +# * <tt>/\p{Pc}/</tt> - 'Punctuation: Connector' +# * <tt>/\p{Pd}/</tt> - 'Punctuation: Dash' +# * <tt>/\p{Ps}/</tt> - 'Punctuation: Open' +# * <tt>/\p{Pe}/</tt> - 'Punctuation: Close' +# * <tt>/\p{Pi}/</tt> - 'Punctuation: Initial Quote' +# * <tt>/\p{Pf}/</tt> - 'Punctuation: Final Quote' +# * <tt>/\p{Po}/</tt> - 'Punctuation: Other' +# * <tt>/\p{S}/</tt> - 'Symbol' +# * <tt>/\p{Sm}/</tt> - 'Symbol: Math' +# * <tt>/\p{Sc}/</tt> - 'Symbol: Currency' +# * <tt>/\p{Sc}/</tt> - 'Symbol: Currency' +# * <tt>/\p{Sk}/</tt> - 'Symbol: Modifier' +# * <tt>/\p{So}/</tt> - 'Symbol: Other' +# * <tt>/\p{Z}/</tt> - 'Separator' +# * <tt>/\p{Zs}/</tt> - 'Separator: Space' +# * <tt>/\p{Zl}/</tt> - 'Separator: Line' +# * <tt>/\p{Zp}/</tt> - 'Separator: Paragraph' +# * <tt>/\p{C}/</tt> - 'Other' +# * <tt>/\p{Cc}/</tt> - 'Other: Control' +# * <tt>/\p{Cf}/</tt> - 'Other: Format' +# * <tt>/\p{Cn}/</tt> - 'Other: Not Assigned' +# * <tt>/\p{Co}/</tt> - 'Other: Private Use' +# * <tt>/\p{Cs}/</tt> - 'Other: Surrogate' +# +# Lastly, <tt>\p{}</tt> matches a character's Unicode <i>script</i>. The +# following scripts are supported: <i>Arabic</i>, <i>Armenian</i>, +# <i>Balinese</i>, <i>Bengali</i>, <i>Bopomofo</i>, <i>Braille</i>, +# <i>Buginese</i>, <i>Buhid</i>, <i>Canadian_Aboriginal</i>, <i>Carian</i>, +# <i>Cham</i>, <i>Cherokee</i>, <i>Common</i>, <i>Coptic</i>, +# <i>Cuneiform</i>, <i>Cypriot</i>, <i>Cyrillic</i>, <i>Deseret</i>, +# <i>Devanagari</i>, <i>Ethiopic</i>, <i>Georgian</i>, <i>Glagolitic</i>, +# <i>Gothic</i>, <i>Greek</i>, <i>Gujarati</i>, <i>Gurmukhi</i>, <i>Han</i>, +# <i>Hangul</i>, <i>Hanunoo</i>, <i>Hebrew</i>, <i>Hiragana</i>, +# <i>Inherited</i>, <i>Kannada</i>, <i>Katakana</i>, <i>Kayah_Li</i>, +# <i>Kharoshthi</i>, <i>Khmer</i>, <i>Lao</i>, <i>Latin</i>, <i>Lepcha</i>, +# <i>Limbu</i>, <i>Linear_B</i>, <i>Lycian</i>, <i>Lydian</i>, +# <i>Malayalam</i>, <i>Mongolian</i>, <i>Myanmar</i>, <i>New_Tai_Lue</i>, +# <i>Nko</i>, <i>Ogham</i>, <i>Ol_Chiki</i>, <i>Old_Italic</i>, +# <i>Old_Persian</i>, <i>Oriya</i>, <i>Osmanya</i>, <i>Phags_Pa</i>, +# <i>Phoenician</i>, <i>Rejang</i>, <i>Runic</i>, <i>Saurashtra</i>, +# <i>Shavian</i>, <i>Sinhala</i>, <i>Sundanese</i>, <i>Syloti_Nagri</i>, +# <i>Syriac</i>, <i>Tagalog</i>, <i>Tagbanwa</i>, <i>Tai_Le</i>, +# <i>Tamil</i>, <i>Telugu</i>, <i>Thaana</i>, <i>Thai</i>, <i>Tibetan</i>, +# <i>Tifinagh</i>, <i>Ugaritic</i>, <i>Vai</i>, and <i>Yi</i>. +# +# # Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and +# # belongs to the Arabic script. +# /\p{Arabic}/.match("\u06E9") #=> #<MatchData "\u06E9"> +# +# All character properties can be inverted by prefixing their name with a +# caret (<tt>^</tt>). +# +# # Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so +# # this match succeeds +# /\p{^Ll}/.match("A") #=> #<MatchData "A"> +# +# == Anchors +# +# Anchors are metacharacter that match the zero-width positions between +# characters, <i>anchoring</i> the match to a specific position. +# +# * <tt>^</tt> - Matches beginning of line +# * <tt>$</tt> - Matches end of line +# * <tt>\A</tt> - Matches beginning of string. +# * <tt>\Z</tt> - Matches end of string. If string ends with a newline, +# it matches just before newline +# * <tt>\z</tt> - Matches end of string +# * <tt>\G</tt> - Matches point where last match finished +# * <tt>\b</tt> - Matches word boundaries when outside brackets; backspace +# (0x08) inside brackets +# * <tt>\B</tt> - Matches non-word boundaries +# * <tt>(?=</tt><i>pat</i><tt>)</tt> - <i>Positive lookahead</i> assertion: +# ensures that the following characters match <i>pat</i>, but doesn't +# include those characters in the matched text +# * <tt>(?!</tt><i>pat</i><tt>)</tt> - <i>Negative lookahead</i> assertion: +# ensures that the following characters do not match <i>pat</i>, but +# doesn't include those characters in the matched text +# * <tt>(?<=</tt><i>pat</i><tt>)</tt> - <i>Positive lookbehind</i> +# assertion: ensures that the preceding characters match <i>pat</i>, but +# doesn't include those characters in the matched text +# * <tt>(?<!</tt><i>pat</i><tt>)</tt> - <i>Negative lookbehind</i> +# assertion: ensures that the preceding characters do not match +# <i>pat</i>, but doesn't include those characters in the matched text +# +# # If a pattern isn't anchored it can begin at any point in the string +# /real/.match("surrealist") #=> #<MatchData "real"> +# # Anchoring the pattern to the beginning of the string forces the +# # match to start there. 'real' doesn't occur at the beginning of the +# # string, so now the match fails +# /\Areal/.match("surrealist") #=> nil +# # The match below fails because although 'Demand' contains 'and', the +# pattern does not occur at a word boundary. +# /\band/.match("Demand") +# # Whereas in the following example 'and' has been anchored to a +# # non-word boundary so instead of matching the first 'and' it matches +# # from the fourth letter of 'demand' instead +# /\Band.+/.match("Supply and demand curve") #=> #<MatchData "and curve"> +# # The pattern below uses positive lookahead and positive lookbehind to +# # match text appearing in <b></b> tags without including the tags in the +# # match +# /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favours the <b>bold</b>") +# #=> #<MatchData "bold"> +# +# == Options +# +# The end delimiter for a regexp can be followed by one or more single-letter +# options which control how the pattern can match. +# +# * <tt>/pat/i</tt> - Ignore case +# * <tt>/pat/m</tt> - Treat a newline as a character matched by <tt>.</tt> +# * <tt>/pat/x</tt> - Ignore whitespace and comments in the pattern +# * <tt>/pat/o</tt> - Perform <tt>#{}</tt> interpolation only once +# +# <tt>i</tt>, <tt>m</tt>, and <tt>x</tt> can also be applied on the +# subexpression level with the +# <tt>(?</tt><i>on</i><tt>-</tt><i>off</i><tt>)</tt> construct, which +# enables options <i>on</i>, and disables options <i>off</i> for the +# expression enclosed by the parentheses. +# +# /a(?i:b)c/.match('aBc') #=> #<MatchData "aBc"> +# /a(?i:b)c/.match('abc') #=> #<MatchData "abc"> +# +# == Free-Spacing Mode and Comments +# +# As mentioned above, the <tt>x</tt> option enables <i>free-spacing</i> +# mode. Literal white space inside the pattern is ignored, and the +# octothorpe (<tt>#</tt>) character introduces a comment until the end of +# the line. This allows the components of the pattern to be organised in a +# potentially more readable fashion. +# +# # A contrived pattern to match a number with optional decimal places +# float_pat = /\A +# [[:digit:]]+ # 1 or more digits before the decimal point +# (\. # Decimal point +# [[:digit:]]+ # 1 or more digits after the decimal point +# )? # The decimal point and following digits are optional +# \Z/x +# float_pat.match('3.14') #=> #<MatchData "3.14" 1:".14"> +# +# *Note*: To match whitespace in an <tt>x</tt> pattern use an escape such as +# <tt>\s</tt> or <tt>\p{Space}</tt>. +# +# Comments can be included in a non-<tt>x</tt> pattern with the +# <tt>(?#</tt><i>comment</i><tt>)</tt> construct, where <i>comment</i> is +# arbitrary text ignored by the regexp engine. +# +# == Encoding +# +# Regular expressions are assumed to use the source encoding. This can be +# overridden with one of the following modifiers. +# +# * <tt>/</tt><i>pat</i><tt>/u</tt> - UTF-8 +# * <tt>/</tt><i>pat</i><tt>/e</tt> - EUC-JP +# * <tt>/</tt><i>pat</i><tt>/s</tt> - Windows-31J +# * <tt>/</tt><i>pat</i><tt>/n</tt> - ASCII-8BIT +# +# A regexp can be matched against a string when they either share an +# encoding, or the regexp's encoding is _US-ASCII_ and the string's encoding +# is ASCII-compatible. +# +# If a match between incompatible encodings is attempted an +# <tt>Encoding::CompatibilityError</tt> exception is raised. +# +# The <tt>Regexp#fixed_encoding?</tt> predicate indicates whether the regexp +# has a <i>fixed</i> encoding, that is one incompatible with ASCII. A +# regexp's encoding can be explicitly fixed by supplying +# <tt>Regexp::FIXEDENCODING</tt> as the second argument of +# <tt>Regexp.new</tt>: +# +# r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING) +# r =~"a\u3042" +# #=> Encoding::CompatibilityError: incompatible encoding regexp match +# (ISO-8859-1 regexp with UTF-8 string) +# +# == Performance +# +# Certain pathological combinations of constructs can lead to abysmally bad +# performance. +# +# Consider a string of 25 <i>a</i>s, a <i>d</i>, 4 <i>a</i>s, and a +# <i>c</i>. +# +# s = 'a' * 25 + 'd' 'a' * 4 + 'c' +# #=> "aaaaaaaaaaaaaaaaaaaaaaaaadadadadac" +# +# The following patterns match instantly as you would expect: +# +# /(b|a)/ =~ s #=> 0 +# /(b|a+)/ =~ s #=> 0 +# /(b|a+)*\/ =~ s #=> 0 +# +# However, the following pattern takes appreciably longer: +# +# /(b|a+)*c/ =~ s #=> 32 +# +# This happens because an atom in the regexp is quantified by both an +# immediate <tt>+</tt> and an enclosing <tt>*</tt> with nothing to +# differentiate which is in control of any particular character. The +# nondeterminism that results produces super-linear performance. (Consult +# <i>Mastering Regular Expressions</i> (3rd ed.), pp 222, by +# <i>Jeffery Friedl</i>, for an in-depth analysis). This particular case +# can be fixed by use of atomic grouping, which prevents the unnecessary +# backtracking: +# +# (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start) +# #=> 24.702736882 +# (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start) +# #=> 0.000166571 +# +# A similar case is typified by the following example, which takes +# approximately 60 seconds to execute for me: +# +# # Match a string of 29 <i>a</i>s against a pattern of 29 optional +# # <i>a</i>s followed by 29 mandatory <i>a</i>s. +# Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29 +# +# The 29 optional <i>a</i>s match the string, but this prevents the 29 +# mandatory <i>a</i>s that follow from matching. Ruby must then backtrack +# repeatedly so as to satisfy as many of the optional matches as it can +# while still matching the mandatory 29. It is plain to us that none of the +# optional matches can succeed, but this fact unfortunately eludes Ruby. +# +# One approach for improving performance is to anchor the match to the +# beginning of the string, thus significantly reducing the amount of +# backtracking needed. +# +# Regexp.new('\A' 'a?' * 29 + 'a' * 29).match('a' * 29) +# #=> #<MatchData "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"> +# +# + class Regexp; end +end |