diff options
author | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2014-09-23 11:35:51 +0000 |
---|---|---|
committer | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2014-09-23 11:35:51 +0000 |
commit | fd8438eb9b6bec69a456b69a7dece77aadc06a36 (patch) | |
tree | b0f09f3d92934ea3ad0570599c861891cf360362 | |
parent | cf3d2f48e3a1281a47cd544cfd2457b8342037f9 (diff) | |
download | pcre2-fd8438eb9b6bec69a456b69a7dece77aadc06a36.tar.gz |
Documentation scripts
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@79 6239d852-aaf2-0410-a92c-79f79f948069
-rwxr-xr-x | 132html | 313 | ||||
-rwxr-xr-x | CheckMan | 67 | ||||
-rwxr-xr-x | CleanTxt | 113 | ||||
-rwxr-xr-x | Detrail | 35 | ||||
-rwxr-xr-x | PrepareRelease | 265 | ||||
-rw-r--r-- | doc/html/README.txt | 1 | ||||
-rw-r--r-- | doc/html/index.html | 177 | ||||
-rw-r--r-- | doc/html/pcre2api.html | 2659 | ||||
-rw-r--r-- | doc/html/pcre2callout.html | 270 | ||||
-rw-r--r-- | doc/html/pcre2demo.html | 443 | ||||
-rw-r--r-- | doc/html/pcre2test.html | 1199 | ||||
-rw-r--r-- | doc/html/pcre2unicode.html | 270 | ||||
-rw-r--r-- | doc/index.html.src | 177 | ||||
-rw-r--r-- | doc/pcre2.txt | 2903 | ||||
-rw-r--r-- | doc/pcre2api.3 | 2 | ||||
-rw-r--r-- | doc/pcre2demo.3 | 441 | ||||
-rw-r--r-- | doc/pcre2test.1 | 8 | ||||
-rw-r--r-- | doc/pcre2test.txt | 1073 | ||||
-rw-r--r-- | src/pcre2demo.c | 2 |
19 files changed, 10412 insertions, 6 deletions
@@ -0,0 +1,313 @@
+#! /usr/bin/perl -w
+
+# Script to turn PCRE2 man pages into HTML
+
+
+# Subroutine to handle font changes and other escapes
+
+sub do_line {
+my($s) = $_[0];
+
+$s =~ s/</&#60;/g;                   # Deal with < and >
+$s =~ s/>/&#62;/g;
+$s =~ s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
+$s =~ s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;
+$s =~ s"\\e"\\"g;
+$s =~ s/(?<=Copyright )\(c\)/&copy;/g;
+$s;
+}
+
+# Subroutine to ensure not in a paragraph
+
+sub end_para {
+if ($inpara)
+  {
+  print TEMP "</PRE>\n" if ($inpre);
+  print TEMP "</P>\n";
+  }
+$inpara = $inpre = 0;
+$wrotetext = 0;
+}
+
+# Subroutine to start a new paragraph
+
+sub new_para {
+&end_para();
+print TEMP "<P>\n";
+$inpara = 1;
+}
+
+
+# Main program
+
+$innf = 0;
+$inpara = 0;
+$inpre = 0;
+$wrotetext = 0;
+$toc = 0;
+$ref = 1;
+
+while ($#ARGV >= 0 && $ARGV[0] =~ /^-/)
+  {
+  $toc = 1 if $ARGV[0] eq "-toc";
+  shift;
+  }
+
+# Initial output to STDOUT
+
+print <<End ;
+<html>
+<head>
+<title>$ARGV[0] specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>$ARGV[0] man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+End
+
+print "<ul>\n" if ($toc);
+
+open(TEMP, ">/tmp/$$") || die "Can't open /tmp/$$ for output\n";
+
+while (<STDIN>)
+  {
+  # Handle lines beginning with a dot
+
+  if (/^\./)
+    {
+    # Some of the PCRE2 man pages used to contain instances of .br. However,
+    # they should have all been removed because they cause trouble in some
+    # (other) automated systems that translate man pages to HTML. Complain if
+    # we find .br or .in (another macro that is deprecated).
+
+    if (/^\.br/ || /^\.in/)
+      {
+      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
+      print STDERR "*** $_\n";
+      die "*** Processing abandoned\n";
+      }
+
+    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
+
+    elsif (/^\.nf/)
+      {
+      $innf = 1;
+      }
+
+    elsif (/^\.fi/)
+      {
+      $innf = 0;
+      }
+
+    # Handling .sp is subtle. If it is inside a literal section, do nothing if
+    # the next line is a non literal text line; similarly, if not inside a
+    # literal section, do nothing if a literal follows, unless we are inside
+    # a .nf/.fi section. The point being that the <pre> and </pre> that delimit
+    # literal sections will do the spacing. Always skip if no previous output.
+
+    elsif (/^\.sp/)
+      {
+      if ($wrotetext)
+        {
+        $_ = <STDIN>;
+        if ($inpre)
+          {
+          print TEMP "\n" if (/^[\s.]/);
+          }
+        else
+          {
+          print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
+          }
+        redo;    # Now process the lookahead line we just read
+        }
+      }
+    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
+      {
+      &new_para();
+      }
+    elsif (/^\.SH\s*("?)(.*)\1/)
+      {
+      # Ignore the NAME section
+      if ($2 =~ /^NAME\b/)
+        {
+        <STDIN>;
+        next;
+        }
+
+      &end_para();
+      my($title) = &do_line($2);
+      if ($toc)
+        {
+        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
+          $ref, $ref, $title);
+        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
+          $ref, $title);
+        $ref++;
+        }
+      else
+        {
+        print TEMP "<br><b>\n$title\n</b><br>\n";
+        }
+      }
+    elsif (/^\.SS\s*("?)(.*)\1/)
+      {
+      &end_para();
+      my($title) = &do_line($2);
+      print TEMP "<br><b>\n$title\n</b><br>\n";
+      }
+    elsif (/^\.B\s*(.*)/)
+      {
+      &new_para() if (!$inpara);
+      $_ = &do_line($1);
+      s/"(.*?)"/$1/g;
+      print TEMP "<b>$_</b>\n";
+      $wrotetext = 1;
+      }
+    elsif (/^\.I\s*(.*)/)
+      {
+      &new_para() if (!$inpara);
+      $_ = &do_line($1);
+      s/"(.*?)"/$1/g;
+      print TEMP "<i>$_</i>\n";
+      $wrotetext = 1;
+      }
+
+    # A comment that starts "HREF" takes the next line as a name that
+    # is turned into a hyperlink, using the text given, which might be
+    # in a special font. If it ends in () or (digits) or punctuation, they
+    # aren't part of the link.
+
+    elsif (/^\.\\"\s*HREF/)
+      {
+      $_=<STDIN>;
+      chomp;
+      $_ = &do_line($_);
+      $_ =~ s/\s+$//;
+      $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
+      print TEMP "<a href=\"$1.html\">$_</a>\n";
+      }
+
+    # A comment that starts "HTML" inserts literal HTML
+
+    elsif (/^\.\\"\s*HTML\s*(.*)/)
+      {
+      print TEMP $1;
+      }
+
+    # A comment that starts < inserts that HTML at the end of the
+    # *next* input line - so as not to get a newline between them.
+
+    elsif (/^\.\\"\s*(<.*>)/)
+      {
+      my($markup) = $1;
+      $_=<STDIN>;
+      chomp;
+      $_ = &do_line($_);
+      $_ =~ s/\s+$//;
+      print TEMP "$_$markup\n";
+      }
+
+    # A comment that starts JOIN joins the next two lines together, with one
+    # space between them. Then that line is processed. This is used in some
+    # displays where two lines are needed for the "man" version. JOINSH works
+    # the same, except that it assumes this is a shell command, so removes
+    # continuation backslashes.
+
+    elsif (/^\.\\"\s*JOIN(SH)?/)
+      {
+      my($one,$two);
+      $one = <STDIN>;
+      $two = <STDIN>;
+      $one =~ s/\s*\\e\s*$// if (defined($1));
+      chomp($one);
+      $two =~ s/^\s+//;
+      $_ = "$one $two";
+      redo;             # Process the joined lines
+      }
+
+    # .EX/.EE are used in the pcre2demo page to bracket the entire program,
+    # which is unmodified except for turning backslash into "\e".
+
+    elsif (/^\.EX\s*$/)
+      {
+      print TEMP "<PRE>\n";
+      while (<STDIN>)
+        {
+        last if /^\.EE\s*$/;
+        s/\\e/\\/g;
+        s/&/&amp;/g;
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        print TEMP;
+        }
+      }
+
+    # Ignore anything not recognized
+
+    next;
+    }
+
+  # Line does not begin with a dot. Replace blank lines with new paragraphs
+
+  if (/^\s*$/)
+    {
+    &end_para() if ($wrotetext);
+    next;
+    }
+
+  # Convert font changes and output an ordinary line. Ensure that indented
+  # lines are marked as literal.
+
+  $_ = &do_line($_);
+  &new_para() if (!$inpara);
+
+  if (/^\s/)
+    {
+    if (!$inpre)
+      {
+      print TEMP "<pre>\n";
+      $inpre = 1;
+      }
+    }
+  elsif ($inpre)
+    {
+    print TEMP "</pre>\n";
+    $inpre = 0;
+    }
+
+  # Add <br> to the end of a non-literal line if we are within .nf/.fi
+
+  $_ .= "<br>\n" if (!$inpre && $innf);
+
+  print TEMP;
+  $wrotetext = 1;
+  }
+
+# The TOC, if present, will have been written - terminate it
+
+print "</ul>\n" if ($toc);
+
+# Copy the remainder to the standard output
+
+close(TEMP);
+open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
+
+print while (<TEMP>);
+
+print <<End ;
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+End
+
+close(TEMP);
+unlink("/tmp/$$");
+
+# End
diff --git a/CheckMan b/CheckMan
new file mode 100755
index 0000000..5686746
--- /dev/null
+++ b/CheckMan
@@ -0,0 +1,67 @@
+#! /usr/bin/perl
+
+# A script to scan PCRE2's man pages to check for typos in the control
+# sequences. I use only a small set of the available repertoire, so it is
+# straightforward to check that nothing else has slipped in by mistake. This
+# script should be called in the doc directory.
+
+$yield = 0;
+
+while (scalar(@ARGV) > 0)
+  {
+  $line = 0;
+  $file = shift @ARGV;
+
+  open (IN, "<", $file) || die "Failed to open $file\n";
+
+  while (<IN>)
+    {
+    $line++;
+    if (/^\s*$/)
+      {
+      print "Empty line $line of $file\n";
+      $yield = 1;
+      }
+    elsif (/^\./)
+      {
+      if (!/^\.\s*$|
+            ^\.B\s+\S|
+            ^\.TH\s\S|
+            ^\.SH\s\S|
+            ^\.SS\s\S|
+            ^\.TP(?:\s?\d+)?\s*$|
+            ^\.SM\s*$|
+            ^\.br\s*$|
+            ^\.rs\s*$|
+            ^\.sp\s*$|
+            ^\.nf\s*$|
+            ^\.fi\s*$|
+            ^\.P\s*$|
+            ^\.PP\s*$|
+            ^\.\\"(?:\ HREF)?\s*$|
+            ^\.\\"\sHTML\s<a\shref="[^"]+?">\s*$|
+            ^\.\\"\sHTML\s<a\sname="[^"]+?"><\/a>\s*$|
+            ^\.\\"\s<\/a>\s*$|
+            ^\.\\"\sJOINSH\s*$|
+            ^\.\\"\sJOIN\s*$/x
+         )
+        {
+        print "Bad control line $line of $file\n";
+        $yield = 1;
+        }
+      }
+    else
+      {
+      if (/\\[^ef]|\\f[^IBP]/)
+        {
+        print "Bad backslash in line $line of $file\n";
+        $yield = 1;
+        }
+      }
+    }
+
+  close(IN);
+  }
+
+exit $yield;
+# End
diff --git a/CleanTxt b/CleanTxt
new file mode 100755
index 0000000..1f42519
--- /dev/null
+++ b/CleanTxt
@@ -0,0 +1,113 @@
+#! /usr/bin/perl -w
+
+# Script to take the output of nroff -man and remove all the backspacing and
+# the page footers and the screen commands etc so that it is more usefully
+# readable online. In fact, in the latest nroff, intermediate footers don't
+# seem to be generated any more.
+
+$blankcount = 0;
+$lastwascut = 0;
+$firstheader = 1;
+
+# Input on STDIN; output to STDOUT.
+
+while (<STDIN>)
+  {
+  s/\x1b\[\d+m//g;   # Remove screen controls "ESC [ number m"
+  s/.\x8//g;         # Remove "char, backspace"
+
+  # Handle header lines. Retain only the first one we encounter, but remove
+  # the blank line that follows. Any others (e.g. at end of document) and the
+  # following blank line are dropped.
+
+  if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
+    {
+    if ($firstheader)
+      {
+      $firstheader = 0;
+      print;
+      $lastprinted = $_;
+      $lastwascut = 0;
+      }
+    $_=<STDIN>;       # Remove a blank that follows
+    next;
+    }
+
+  # Count runs of empty lines
+
+  if (/^\s*$/)
+    {
+    $blankcount++;
+    $lastwascut = 0;
+    next;
+    }
+
+  # If a chunk of lines has been cut out (page footer) and the next line
+  # has a different indentation, put back one blank line.
+
+  if ($lastwascut && $blankcount < 1 && defined($lastprinted))
+    {
+    ($a) = $lastprinted =~ /^(\s*)/;
+    ($b) = $_ =~ /^(\s*)/;
+    $blankcount++ if ($a ne $b);
+    }
+
+  # We get here only when we have a non-blank line in hand. If it was preceded
+  # by 3 or more blank lines, read the next 3 lines and see if they are blank.
+  # If so, remove all 7 lines, and remember that we have just done a cut.
+
+  if ($blankcount >= 3)
+    {
+    for ($i = 0; $i < 3; $i++)
+      {
+      $next[$i] = <STDIN>;
+      $next[$i] = "" if !defined $next[$i];
+      $next[$i] =~ s/\x1b\[\d+m//g;   # Remove screen controls "ESC [ number m"
+      $next[$i] =~ s/.\x8//g;         # Remove "char, backspace"
+      }
+
+    # Cut out chunks of the form <3 blanks><non-blank><3 blanks>
+
+    if ($next[0] =~ /^\s*$/ &&
+        $next[1] =~ /^\s*$/ &&
+        $next[2] =~ /^\s*$/)
+      {
+      $blankcount -= 3;
+      $lastwascut = 1;
+      }
+
+    # Otherwise output the saved blanks, the current, and the next three
+    # lines. Remember the last printed line.
+
+    else
+      {
+      for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
+      print;
+      for ($i = 0; $i < 3; $i++)
+        {
+        $next[$i] =~ s/.\x8//g;
+        print $next[$i];
+        $lastprinted = $_;
+        }
+      $lastwascut = 0;
+      $blankcount = 0;
+      }
+    }
+
+  # This non-blank line is not preceded by 3 or more blank lines. Output
+  # any blanks there are, and the line. Remember it. Force two blank lines
+  # before headings.
+
+  else
+    {
+    $blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
+      defined($lastprinted);
+    for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
+    print;
+    $lastprinted = $_;
+    $lastwascut = 0;
+    $blankcount = 0;
+    }
+  }
+
+# End
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+
+# This is a script for removing trailing whitespace from lines in files that
+# are listed on the command line.
+
+# This subroutine does the work for one file.
+
+sub detrail {
+my($file) = $_[0];
+my($changed) = 0;
+open(IN, "<", $file) || die "Can't open $file for input";
+@lines = <IN>;
+close(IN);
+foreach (@lines)
+  {
+  if (/\s+\n$/)
+    {
+    s/\s+\n$/\n/;
+    $changed = 1;
+    }
+  }
+if ($changed)
+  {
+  open(OUT, ">", $file) || die "Can't open $file for output";
+  print OUT @lines;
+  close(OUT) || die "Can't close $file after writing";
+  }
+}
+
+# This is the main program
+
+$, = "";   # Output field separator
+for ($i = 0; $i < @ARGV; $i++) { &detrail($ARGV[$i]); }
+
+# End
diff --git a/PrepareRelease b/PrepareRelease
new file mode 100755
index 0000000..c92d7f9
--- /dev/null
+++ b/PrepareRelease
@@ -0,0 +1,265 @@
+#! /bin/sh
+
+# Script to prepare the files for building a PCRE2 release. It does some
+# processing of the documentation, detrails files, and creates pcre2.h.generic
+# and config.h.generic (for use by builders who can't run ./configure).
+
+# You must run this script before running "make dist". If its first argument
+# is "doc", it stops after preparing the documentation. There are no other
+# arguments. The script makes use of the following files:
+
+# 132html     A Perl script that converts a .1 or .3 man page into HTML. It
+#             "knows" the relevant troff constructs that are used in the PCRE2
+#             man pages.
+
+# CheckMan    A Perl script that checks man pages for typos in the mark up.
+
+# CleanTxt    A Perl script that cleans up the output of "nroff -man" by
+#             removing backspaces and other redundant text so as to produce
+#             a readable .txt file.
+
+# Detrail     A Perl script that removes trailing spaces from files.
+
+# doc/index.html.src
+#             A file that is copied as index.html into the doc/html directory
+#             when the HTML documentation is built. It works like this so that
+#             doc/html can be deleted and re-created from scratch.
+
+# README & NON-AUTOTOOLS-BUILD
+#             These files are copied into the doc/html directory, with .txt
+#             extensions so that they can be hyperlinked from the HTML
+#             documentation, because some people just go to the HTML without
+#             looking for text files.
+
+
+# First, sort out the documentation. Remove pcre2demo.3 first because it won't
+# pass the markup check (it is created below, using markup that none of the
+# other pages use).
+
+cd doc || exit 1
+echo Processing documentation
+
+/bin/rm -f pcre2demo.3
+
+# Check the remaining man pages
+
+perl ../CheckMan *.1 *.3
+if [ $? != 0 ] ; then exit 1; fi
+
+# Make Text form of the documentation. It needs some mangling to make it
+# tidy for online reading. Concatenate all the .3 stuff, but omit the
+# individual function pages.
+
+cat <<End >pcre2.txt
+-----------------------------------------------------------------------------
+This file contains a concatenation of the PCRE2 man pages, converted to plain
+text format for ease of searching with a text editor, or for use on systems
+that do not have a man page processor. The small individual files that give
+synopses of each function in the library have not been included. Neither has
+the pcre2demo program. There are separate text files for the pcre2grep and
+pcre2test commands.
+-----------------------------------------------------------------------------
+
+
+End
+
+echo "Making pcre2.txt"
+for file in pcre2api pcre2callout pcre2unicode ; do
+
+#for file in pcre pcre16 pcre32 pcrebuild pcrematching \
+#            pcrecompat pcrepattern pcresyntax pcrejit pcrepartial \
+#            pcreprecompile pcreperform pcreposix pcrecpp pcresample \
+#            pcrelimits pcrestack ; do
+
+  echo " Processing $file.3"
+  nroff -c -man $file.3 >$file.rawtxt
+  perl ../CleanTxt <$file.rawtxt >>pcre2.txt
+  /bin/rm $file.rawtxt
+  echo "------------------------------------------------------------------------------" >>pcre2.txt
+  if [ "$file" != "pcre2sample" ] ; then
+    echo " " >>pcre2.txt
+    echo " " >>pcre2.txt
+  fi
+done
+
+# The three commands
+for file in pcre2test ; do
+# for file in pcre2test pcre2grep pcre-config ; do
+  echo Making $file.txt
+  nroff -c -man $file.1 >$file.rawtxt
+  perl ../CleanTxt <$file.rawtxt >$file.txt
+  /bin/rm $file.rawtxt
+done
+
+
+# Make pcre2demo.3 from the pcre2demo.c source file
+
+echo "Making pcre2demo.3"
+perl <<"END" >pcre2demo.3
+  open(IN, "../src/pcre2demo.c") || die "Failed to open src/pcre2demo.c\n";
+  open(OUT, ">pcre2demo.3") || die "Failed to open pcre2demo.3\n";
+  print OUT ".\\\" Start example.\n" .
+            ".de EX\n" .
+            ". nr mE \\\\n(.f\n" .
+            ". nf\n" .
+            ". nh\n" .
+            ". ft CW\n" .
+            "..\n" .
+            ".\n" .
+            ".\n" .
+            ".\\\" End example.\n" .
+            ".de EE\n" .
+            ". ft \\\\n(mE\n" .
+            ". fi\n" .
+            ". hy \\\\n(HY\n" .
+            "..\n" .
+            ".\n" .
+            ".EX\n" ;
+  while (<IN>)
+    {
+    s/\\/\\e/g;
+    print OUT;
+    }
+  print OUT ".EE\n";
+  close(IN);
+  close(OUT);
+END
+if [ $? != 0 ] ; then exit 1; fi
+
+
+# Make HTML form of the documentation.
+
+echo "Making HTML documentation"
+/bin/rm -f html/*
+cp index.html.src html/index.html
+cp ../README html/README.txt
+# cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt
+
+for file in *.1 ; do
+  base=`basename $file .1`
+  echo " Making $base.html"
+  perl ../132html -toc $base <$file >html/$base.html || exit 1
+done
+
+# Exclude table of contents for function summaries. It seems that expr
+# forces an anchored regex. Also exclude them for small pages that have
+# only one section.
+
+for file in *.3 ; do
+  base=`basename $file .3`
+  toc=-toc
+  if [ `expr $base : '.*_'` -ne 0 ] ; then toc="" ; fi
+  if [ "$base" = "pcre2sample" ] || \
+     [ "$base" = "pcre2stack" ] || \
+     [ "$base" = "pcre2compat" ] || \
+     [ "$base" = "pcre2limits" ] || \
+     [ "$base" = "pcre2perform" ] || \
+     [ "$base" = "pcre2unicode" ] ; then
+    toc=""
+  fi
+  echo " Making $base.html"
+  perl ../132html $toc $base <$file >html/$base.html
+  if [ $? != 0 ] ; then exit 1; fi
+done
+
+# End of documentation processing; stop if only documentation required.
+
+cd ..
+echo Documentation done
+if [ "$1" = "doc" ] ; then exit; fi
+
+# FIXME pro tem only do docs
+exit
+
+# These files are detrailed; do not detrail the test data because there may be
+# significant trailing spaces. Do not detrail RunTest.bat, because it has CRLF
+# line endings and the detrail script removes all trailing white space. The
+# configure files are also omitted from the detrailing. We don't bother with
+# those pcre[16|32]_xx files that just define COMPILE_PCRE16 and then #include the
+# common file, because they aren't going to change.
+
+files="\
+  Makefile.am \
+  Makefile.in \
+  configure.ac \
+  README \
+  LICENCE \
+  COPYING \
+  AUTHORS \
+  NEWS \
+  NON-UNIX-USE \
+  NON-AUTOTOOLS-BUILD \
+  INSTALL \
+  132html \
+  CleanTxt \
+  Detrail \
+  ChangeLog \
+  CMakeLists.txt \
+  RunGrepTest \
+  RunTest \
+  pcre-config.in \
+  libpcre.pc.in \
+  libpcre16.pc.in \
+  libpcre32.pc.in \
+  libpcreposix.pc.in \
+  libpcrecpp.pc.in \
+  config.h.in \
+  pcre_chartables.c.dist \
+  pcredemo.c \
+  pcregrep.c \
+  pcretest.c \
+  dftables.c \
+  pcreposix.c \
+  pcreposix.h \
+  pcre.h.in \
+  pcre_internal.h \
+  pcre_byte_order.c \
+  pcre_compile.c \
+  pcre_config.c \
+  pcre_dfa_exec.c \
+  pcre_exec.c \
+  pcre_fullinfo.c \
+  pcre_get.c \
+  pcre_globals.c \
+  pcre_jit_compile.c \
+  pcre_jit_test.c \
+  pcre_maketables.c \
+  pcre_newline.c \
+  pcre_ord2utf8.c \
+  pcre16_ord2utf16.c \
+  pcre32_ord2utf32.c \
+  pcre_printint.c \
+  pcre_refcount.c \
+  pcre_string_utils.c \
+  pcre_study.c \
+  pcre_tables.c \
+  pcre_valid_utf8.c \
+  pcre_version.c \
+  pcre_xclass.c \
+  pcre16_utf16_utils.c \
+  pcre32_utf32_utils.c \
+  pcre16_valid_utf16.c \
+  pcre32_valid_utf32.c \
+  pcre_scanner.cc \
+  pcre_scanner.h \
+  pcre_scanner_unittest.cc \
+  pcrecpp.cc \
+  pcrecpp.h \
+  pcrecpparg.h.in \
+  pcrecpp_unittest.cc \
+  pcre_stringpiece.cc \
+  pcre_stringpiece.h.in \
+  pcre_stringpiece_unittest.cc \
+  perltest.pl \
+  ucp.h \
+  makevp.bat \
+  pcre.def \
+  libpcre.def \
+  libpcreposix.def"
+
+echo Detrailing
+perl ./Detrail $files doc/p* doc/html/*
+
+echo Done
+
+#End
diff --git a/doc/html/README.txt b/doc/html/README.txt
new file mode 100644
index 0000000..7ad597a
--- /dev/null
+++ b/doc/html/README.txt
@@ -0,0 +1 @@
+This is a placeholder README file for a work in progress.
diff --git a/doc/html/index.html b/doc/html/index.html
new file mode 100644
index 0000000..4e264ec
--- /dev/null
+++ b/doc/html/index.html
@@ -0,0 +1,177 @@
+<html>
+<!-- This is a manually maintained file that is the root of the HTML version of
+     the PCRE2 documentation.
When the HTML documents are built from the man
+     page versions, the entire doc/html directory is emptied, this file is then
+     copied into doc/html/index.html, and the remaining files therein are
+     created by the 132html script.
+-->
+<head>
+<title>PCRE2 specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>Perl-compatible Regular Expressions (revised API: PCRE2)</h1>
+<p>
+The HTML documentation for PCRE2 consists of a number of pages that are listed
+below in alphabetical order. If you are new to PCRE2, please read the first one
+first.
+</p>
+
+<table>
+<tr><td><a href="pcre2.html">pcre2</a></td>
+    <td> Introductory page</td></tr>
+
+<tr><td><a href="pcre2-config.html">pcre2-config</a></td>
+    <td> Information about the installation configuration</td></tr>
+
+<tr><td><a href="pcre2api.html">pcre2api</a></td>
+    <td> PCRE2's native API</td></tr>
+
+<tr><td><a href="pcre2build.html">pcre2build</a></td>
+    <td> Building PCRE2</td></tr>
+
+<tr><td><a href="pcre2callout.html">pcre2callout</a></td>
+    <td> The <i>callout</i> facility</td></tr>
+
+<tr><td><a href="pcre2compat.html">pcre2compat</a></td>
+    <td> Compatibility with Perl</td></tr>
+
+<tr><td><a href="pcre2demo.html">pcre2demo</a></td>
+    <td> A demonstration C program that uses the PCRE2 library</td></tr>
+
+<tr><td><a href="pcre2grep.html">pcre2grep</a></td>
+    <td> The <b>pcre2grep</b> command</td></tr>
+
+<tr><td><a href="pcre2jit.html">pcre2jit</a></td>
+    <td> Discussion of the just-in-time optimization support</td></tr>
+
+<tr><td><a href="pcre2limits.html">pcre2limits</a></td>
+    <td> Details of size and other limits</td></tr>
+
+<tr><td><a href="pcre2matching.html">pcre2matching</a></td>
+    <td> Discussion of the two matching algorithms</td></tr>
+
+<tr><td><a href="pcre2partial.html">pcre2partial</a></td>
+    <td> Using PCRE2 for partial matching</td></tr>
+
+<tr><td><a href="pcre2pattern.html">pcre2pattern</a></td>
+    <td> Specification of the regular expressions supported by PCRE2</td></tr>
+
+<tr><td><a href="pcre2perform.html">pcre2perform</a></td>
+    <td> Some comments on performance</td></tr>
+
+<tr><td><a href="pcre2posix.html">pcre2posix</a></td>
+    <td> The POSIX API to the PCRE2 8-bit library</td></tr>
+
+<tr><td><a href="pcre2precompile.html">pcre2precompile</a></td>
+    <td> How to save and re-use compiled patterns</td></tr>
+
+<tr><td><a href="pcre2sample.html">pcre2sample</a></td>
+    <td> Discussion of the pcre2demo program</td></tr>
+
+<tr><td><a href="pcre2stack.html">pcre2stack</a></td>
+    <td> Discussion of PCRE2's stack usage</td></tr>
+
+<tr><td><a href="pcre2syntax.html">pcre2syntax</a></td>
+    <td> Syntax quick-reference summary</td></tr>
+
+<tr><td><a href="pcre2test.html">pcre2test</a></td>
+    <td> The <b>pcre2test</b> command for testing PCRE2</td></tr>
+
+<tr><td><a href="pcre2unicode.html">pcre2unicode</a></td>
+    <td> Discussion of Unicode and UTF-8/UTF-16/UTF-32 support</td></tr>
+</table>
+
+<p>
+There are also individual pages that summarize the interface for each function
+in the library. There is a single page for each triple of 8-bit/16-bit/32-bit
+functions.
+</p>
+
+<table>
+
+<tr><td><a href="pcre2_assign_jit_stack.html">pcre2_assign_jit_stack</a></td>
+    <td> Assign stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre2_compile.html">pcre2_compile</a></td>
+    <td> Compile a regular expression</td></tr>
+
+<tr><td><a href="pcre2_compile2.html">pcre2_compile2</a></td>
+    <td> Compile a regular expression (alternate interface)</td></tr>
+
+<tr><td><a href="pcre2_config.html">pcre2_config</a></td>
+    <td> Show build-time configuration options</td></tr>
+
+<tr><td><a href="pcre2_copy_named_substring.html">pcre2_copy_named_substring</a></td>
+    <td> Extract named substring into given buffer</td></tr>
+
+<tr><td><a href="pcre2_copy_substring.html">pcre2_copy_substring</a></td>
+    <td> Extract numbered substring into given buffer</td></tr>
+
+<tr><td><a href="pcre2_dfa_exec.html">pcre2_dfa_exec</a></td>
+    <td> Match a compiled pattern to a subject string
+    (DFA algorithm; <i>not</i> Perl compatible)</td></tr>
+
+<tr><td><a href="pcre2_exec.html">pcre2_exec</a></td>
+    <td> Match a compiled pattern to a subject string
+    (Perl compatible)</td></tr>
+
+<tr><td><a href="pcre2_free_study.html">pcre2_free_study</a></td>
+    <td> Free study data</td></tr>
+
+<tr><td><a href="pcre2_free_substring.html">pcre2_free_substring</a></td>
+    <td> Free extracted substring</td></tr>
+
+<tr><td><a href="pcre2_free_substring_list.html">pcre2_free_substring_list</a></td>
+    <td> Free list of extracted substrings</td></tr>
+
+<tr><td><a href="pcre2_fullinfo.html">pcre2_fullinfo</a></td>
+    <td> Extract information about a pattern</td></tr>
+
+<tr><td><a href="pcre2_get_named_substring.html">pcre2_get_named_substring</a></td>
+    <td> Extract named substring into new memory</td></tr>
+
+<tr><td><a href="pcre2_get_stringnumber.html">pcre2_get_stringnumber</a></td>
+    <td> Convert captured string name to number</td></tr>
+
+<tr><td><a href="pcre2_get_stringtable_entries.html">pcre2_get_stringtable_entries</a></td>
+    <td> Find table entries for given string name</td></tr>
+
+<tr><td><a href="pcre2_get_substring.html">pcre2_get_substring</a></td>
+    <td> Extract numbered substring into new memory</td></tr>
+
+<tr><td><a href="pcre2_get_substring_list.html">pcre2_get_substring_list</a></td>
+    <td> Extract all substrings into new memory</td></tr>
+
+<tr><td><a href="pcre2_jit_exec.html">pcre2_jit_exec</a></td>
+    <td> Fast path interface to JIT matching</td></tr>
+
+<tr><td><a href="pcre2_jit_stack_alloc.html">pcre2_jit_stack_alloc</a></td>
+    <td> Create a stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre2_jit_stack_free.html">pcre2_jit_stack_free</a></td>
+    <td> Free a JIT matching stack</td></tr>
+
+<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
+    <td> Build character tables in current locale</td></tr>
+
+<tr><td><a href="pcre2_pattern_to_host_byte_order.html">pcre2_pattern_to_host_byte_order</a></td>
+    <td> Convert compiled pattern to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_refcount.html">pcre2_refcount</a></td>
+    <td> Maintain reference count in compiled pattern</td></tr>
+
+<tr><td><a href="pcre2_study.html">pcre2_study</a></td>
+    <td> Study a compiled pattern</td></tr>
+
+<tr><td><a href="pcre2_utf16_to_host_byte_order.html">pcre2_utf16_to_host_byte_order</a></td>
+    <td> Convert UTF-16 string to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_utf32_to_host_byte_order.html">pcre2_utf32_to_host_byte_order</a></td>
+    <td> Convert UTF-32 string to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_version.html">pcre2_version</a></td>
+    <td> Return PCRE2 version and release date</td></tr>
+</table>
+
+</html>
+
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
new file mode 100644
index 0000000..dd95b4c
--- /dev/null
+++ b/doc/html/pcre2api.html
@@ -0,0 +1,2659 @@
+<html>
+<head>
+<title>pcre2api specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2api
man page</h1> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> +<p> +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +<br> +<ul> +<li><a name="TOC1" href="#SEC1">PCRE2 NATIVE API BASIC FUNCTIONS</a> +<li><a name="TOC2" href="#SEC2">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a> +<li><a name="TOC3" href="#SEC3">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a> +<li><a name="TOC4" href="#SEC4">PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS</a> +<li><a name="TOC5" href="#SEC5">PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS</a> +<li><a name="TOC6" href="#SEC6">PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS</a> +<li><a name="TOC7" href="#SEC7">PCRE2 NATIVE API JIT FUNCTIONS</a> +<li><a name="TOC8" href="#SEC8">PCRE2 NATIVE API AUXILIARY FUNCTIONS</a> +<li><a name="TOC9" href="#SEC9">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a> +<li><a name="TOC10" href="#SEC10">PCRE2 API OVERVIEW</a> +<li><a name="TOC11" href="#SEC11">NEWLINES</a> +<li><a name="TOC12" href="#SEC12">MULTITHREADING</a> +<li><a name="TOC13" href="#SEC13">PCRE2 CONTEXTS</a> +<li><a name="TOC14" href="#SEC14">CHECKING BUILD-TIME OPTIONS</a> +<li><a name="TOC15" href="#SEC15">COMPILING A PATTERN</a> +<li><a name="TOC16" href="#SEC16">COMPILATION ERROR CODES</a> +<li><a name="TOC17" href="#SEC17">JUST-IN-TIME (JIT) COMPILATION</a> +<li><a name="TOC18" href="#SEC18">LOCALE SUPPORT</a> +<li><a name="TOC19" href="#SEC19">INFORMATION ABOUT A COMPILED PATTERN</a> +<li><a name="TOC20" href="#SEC20">THE MATCH DATA BLOCK</a> +<li><a name="TOC21" href="#SEC21">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a> +<li><a name="TOC22" href="#SEC22">NEWLINE HANDLING WHEN MATCHING</a> +<li><a name="TOC23" href="#SEC23">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a> +<li><a name="TOC24" href="#SEC24">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a> 
+<li><a name="TOC25" href="#SEC25">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a> +<li><a name="TOC26" href="#SEC26">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a> +<li><a name="TOC27" href="#SEC27">DUPLICATE SUBPATTERN NAMES</a> +<li><a name="TOC28" href="#SEC28">FINDING ALL POSSIBLE MATCHES</a> +<li><a name="TOC29" href="#SEC29">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a> +<li><a name="TOC30" href="#SEC30">SEE ALSO</a> +<li><a name="TOC31" href="#SEC31">AUTHOR</a> +<li><a name="TOC32" href="#SEC32">REVISION</a> +</ul> +<P> +<b>#include <pcre2.h></b> +<br> +<br> +PCRE2 is a new API for PCRE. This document contains a description of all its +functions. See the +<a href="pcre2.html"><b>pcre2</b></a> +document for an overview of all the PCRE2 documentation. +</P> +<br><a name="SEC1" href="#TOC1">PCRE2 NATIVE API BASIC FUNCTIONS</a><br> +<P> +<b>pcre2_code *pcre2_compile(PCRE2_SPTR <i>pattern</i>, PCRE2_SIZE <i>length</i>,</b> +<b> uint32_t <i>options</i>, int *<i>errorcode</i>, PCRE2_SIZE *<i>erroroffset,</i></b> +<b> pcre2_compile_context *<i>ccontext</i>);</b> +<br> +<br> +<b>pcre2_code_free(pcre2_code *<i>code</i>);</b> +<br> +<br> +<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_match_data_create_from_pattern(pcre2_code *<i>code</i>,</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> +<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> +<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> +<b> pcre2_match_context *<i>mcontext</i>);</b> +<br> +<br> +<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> +<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> +<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> +<b> pcre2_match_context *<i>mcontext</i>,</b> +<b> int *<i>workspace</i>, PCRE2_SIZE 
<i>wscount</i>);</b> +<br> +<br> +<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b> +</P> +<br><a name="SEC2" href="#TOC1">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a><br> +<P> +<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b> +</P> +<br><a name="SEC3" href="#TOC1">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a><br> +<P> +<b>pcre2_general_context *pcre2_general_context_create(</b> +<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b> +<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b> +<br> +<br> +<b>pcre2_general_context *pcre2_general_context_copy(</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>void pcre2_general_context_free(pcre2_general_context *<i>gcontext</i>);</b> +</P> +<br><a name="SEC4" href="#TOC1">PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS</a><br> +<P> +<b>pcre2_compile_context *pcre2_compile_context_create(</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_compile_context *pcre2_compile_context_copy(</b> +<b> pcre2_compile_context *<i>ccontext</i>);</b> +<br> +<br> +<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b> +<br> +<br> +<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b> +<b> const unsigned char *<i>tables</i>);</b> +<br> +<br> +<b>int pcre2_set_newline_compile(pcre2_compile_context 
*<i>ccontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b> +<b> int (*<i>guard_function</i>)(uint32_t));</b> +</P> +<br><a name="SEC5" href="#TOC1">PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS</a><br> +<P> +<b>pcre2_match_context *pcre2_match_context_create(</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_match_context *pcre2_match_context_copy(</b> +<b> pcre2_match_context *<i>mcontext</i>);</b> +<br> +<br> +<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b> +<br> +<br> +<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b> +<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b> +<b> void *<i>callout_data</i>);</b> +<br> +<br> +<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +<b>int pcre2_set_recursion_memory_management(</b> +<b> pcre2_match_context *<i>mcontext</i>,</b> +<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b> +<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b> +</P> +<br><a name="SEC6" href="#TOC1">PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS</a><br> +<P> +<b>int pcre2_substring_copy_byname(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR *<i>buffer</i>, PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>int pcre2_substring_copy_bynumber(pcre2_match_data *<i>match_data</i>,</b> +<b> 
unsigned int <i>number</i>, PCRE2_UCHAR *<i>buffer</i>,</b> +<b> PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>void pcre2_substring_free(PCRE2_UCHAR *<i>buffer</i>);</b> +<br> +<br> +<b>int pcre2_substring_get_byname(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR **<i>bufferptr</i>, PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>int pcre2_substring_get_bynumber(pcre2_match_data *<i>match_data</i>,</b> +<b> unsigned int <i>number</i>, PCRE2_UCHAR **<i>bufferptr</i>,</b> +<b> PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>int pcre2_substring_length_byname(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_SIZE *<i>length</i>);</b> +<br> +<br> +<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b> +<b> unsigned int <i>number</i>, PCRE2_SIZE *<i>length</i>);</b> +<br> +<br> +<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b> +<br> +<br> +<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b> +<b> PCRE2_SPTR <i>name</i>);</b> +<br> +<br> +<b>void pcre2_substring_list_free(PCRE2_SPTR *<i>list</i>);</b> +<br> +<br> +<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b> +</P> +<br><a name="SEC7" href="#TOC1">PCRE2 NATIVE API JIT FUNCTIONS</a><br> +<P> +<b>int pcre2_jit_compile(pcre2_code *<i>code</i>, uint32_t <i>options</i>);</b> +<br> +<br> +<b>int pcre2_jit_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> +<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> +<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> +<b> pcre2_match_context *<i>mcontext</i>, pcre2_jit_stack *<i>jit_stack</i>);</b> +<br> +<br> +<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> 
+<b>pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *<i>gcontext</i>,</b> +<b> PCRE2_SIZE <i>startsize</i>, PCRE2_SIZE <i>maxsize</i>);</b> +<br> +<br> +<b>void pcre2_jit_stack_assign(const pcre2_code *<i>code</i>,</b> +<b> pcre2_jit_callback <i>callback_function</i>, void *<i>callback_data</i>);</b> +<br> +<br> +<b>void pcre2_jit_stack_free(pcre2_jit_stack *<i>jit_stack</i>);</b> +</P> +<br><a name="SEC8" href="#TOC1">PCRE2 NATIVE API AUXILIARY FUNCTIONS</a><br> +<P> +<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b> +<b> PCRE2_SIZE <i>bufflen</i>);</b> +<br> +<br> +<b>const unsigned char *pcre2_maketables(pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>int pcre2_pattern_info(const pcre2_code *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b> +<br> +<br> +<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>, PCRE2_SIZE <i>length</i>);</b> +</P> +<br><a name="SEC9" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br> +<P> +There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code +units, respectively. However, there is just one header file, <b>pcre2.h</b>. +This contains the function prototypes and other definitions for all three +libraries. One, two, or all three can be installed simultaneously. On Unix-like +systems the libraries are called <b>libpcre2-8</b>, <b>libpcre2-16</b>, and +<b>libpcre2-32</b>, and they can also co-exist with the original PCRE libraries. +</P> +<P> +Character strings are passed to and from a PCRE2 library as a sequence of +unsigned integers in code units of the appropriate width. 
Every PCRE2 function +comes in three different forms, one for each library, for example: +<pre> + <b>pcre2_compile_8()</b> + <b>pcre2_compile_16()</b> + <b>pcre2_compile_32()</b> +</pre> +There are also three different sets of data types: +<pre> + <b>PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32</b> + <b>PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32</b> +</pre> +The UCHAR types define unsigned code units of the appropriate widths. For +example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR types are +constant pointers to the equivalent UCHAR types, that is, they are pointers to +vectors of unsigned code units. +</P> +<P> +Many applications use only one code unit width. For their convenience, macros +are defined whose names are the generic forms such as <b>pcre2_compile()</b> and +PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to +generate the appropriate width-specific function and macro names. +PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it +to be 8, 16, or 32 before including <b>pcre2.h</b> in order to make use of the +generic names. +</P> +<P> +Applications that use more than one code unit width can be linked with more +than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before +including <b>pcre2.h</b>, and then use the real function names. Any code that is +to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is +unknown should also use the real function names. (Unfortunately, it is not +possible in C code to save and restore the value of a macro.) +</P> +<P> +If PCRE2_CODE_UNIT_WIDTH is not defined before including <b>pcre2.h</b>, a +compiler error occurs. +</P> +<P> +When using multiple libraries in an application, you must take care when +processing any particular pattern to use only functions from a single library. 
+For example, if you want to run a match using a pattern that was compiled with +<b>pcre2_compile_16()</b>, you must do so with <b>pcre2_match_16()</b>, not +<b>pcre2_match_8()</b>. +</P> +<P> +In the function summaries above, and in the rest of this document and other +PCRE2 documents, functions and data types are described using their generic +names, without the 8, 16, or 32 suffix. +</P> +<br><a name="SEC10" href="#TOC1">PCRE2 API OVERVIEW</a><br> +<P> +PCRE2 has its own native API, which is described in this document. There are +also some wrapper functions for the 8-bit library that correspond to the +POSIX regular expression API, but they do not give access to all the +functionality. They are described in the +<a href="pcre2posix.html"><b>pcre2posix</b></a> +documentation. Both these APIs define a set of C function calls. +</P> +<P> +The native API C data types, function prototypes, option values, and error +codes are defined in the header file <b>pcre2.h</b>, which contains definitions +of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the +library. Applications can use these to include support for different releases +of PCRE2. +</P> +<P> +In a Windows environment, if you want to statically link an application program +against a non-dll PCRE2 library, you must define PCRE2_STATIC before including +<b>pcre2.h</b>. +</P> +<P> +The functions <b>pcre2_compile()</b>, and <b>pcre2_match()</b> are used for +compiling and matching regular expressions in a Perl-compatible manner. A +sample program that demonstrates the simplest way of using them is provided in +the file called <i>pcre2demo.c</i> in the PCRE2 source distribution. A listing +of this program is given in the +<a href="pcre2demo.html"><b>pcre2demo</b></a> +documentation, and the +<a href="pcre2sample.html"><b>pcre2sample</b></a> +documentation describes how to compile and run it. 
+</P> +<P> +Just-in-time compiler support is an optional feature of PCRE2 that can be built +in appropriate hardware environments. It greatly speeds up the matching +performance of many patterns. Programs can request that it be used if +available, by calling <b>pcre2_jit_compile()</b> after a pattern has been +successfully compiled by <b>pcre2_compile()</b>. This does nothing if JIT +support is not available. +</P> +<P> +More complicated programs might need to make use of the specialist functions +<b>pcre2_jit_stack_alloc()</b>, <b>pcre2_jit_stack_free()</b>, and +<b>pcre2_jit_stack_assign()</b> in order to control the JIT code's memory usage. +</P> +<P> +JIT matching is automatically used by <b>pcre2_match()</b> if it is available. +There is also a direct interface for JIT matching, which gives improved +performance. The JIT-specific functions are discussed in the +<a href="pcre2jit.html"><b>pcre2jit</b></a> +documentation. +</P> +<P> +A second matching function, <b>pcre2_dfa_match()</b>, which is not +Perl-compatible, is also provided. This uses a different algorithm for the +matching. The alternative algorithm finds all possible matches (at a given +point in the subject), and scans the subject just once (unless there are +lookbehind assertions). However, this algorithm does not return captured +substrings. A description of the two matching algorithms and their advantages +and disadvantages is given in the +<a href="pcre2matching.html"><b>pcre2matching</b></a> +documentation. There is no JIT support for <b>pcre2_dfa_match()</b>. +</P> +<P> +In addition to the main compiling and matching functions, there are convenience +functions for extracting captured substrings from a subject string that is +matched by <b>pcre2_match()</b>. 
They are: +<pre> + <b>pcre2_substring_copy_byname()</b> + <b>pcre2_substring_copy_bynumber()</b> + <b>pcre2_substring_get_byname()</b> + <b>pcre2_substring_get_bynumber()</b> + <b>pcre2_substring_list_get()</b> + <b>pcre2_substring_length_byname()</b> + <b>pcre2_substring_length_bynumber()</b> + <b>pcre2_substring_nametable_scan()</b> + <b>pcre2_substring_number_from_name()</b> +</pre> +<b>pcre2_substring_free()</b> and <b>pcre2_substring_list_free()</b> are also +provided, to free the memory used for extracted strings. +</P> +<P> +There are functions for finding out information about a compiled pattern +(<b>pcre2_pattern_info()</b>) and about the configuration with which PCRE2 was +built (<b>pcre2_config()</b>). +<a name="newlines"></a></P> +<br><a name="SEC11" href="#TOC1">NEWLINES</a><br> +<P> +PCRE2 supports five different conventions for indicating line breaks in +strings: a single CR (carriage return) character, a single LF (linefeed) +character, the two-character sequence CRLF, any of the three preceding, or any +Unicode newline sequence. The Unicode newline sequences are the three just +mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, +U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS +(paragraph separator, U+2029). +</P> +<P> +Each of the first three conventions is used by at least one operating system as +its standard newline sequence. When PCRE2 is built, a default can be specified. +The default default is LF, which is the Unix standard. When PCRE2 is run, the +default can be overridden, either when a pattern is compiled, or when it is +matched. +</P> +<P> +The newline convention can be changed when calling <b>pcre2_compile()</b>, or it +can be specified by special text at the start of the pattern itself; this +overrides any other settings. See the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +page for details of the special character sequences. 
+</P> +<P> +In the PCRE2 documentation the word "newline" is used to mean "the character or +pair of characters that indicate a line break". The choice of newline +convention affects the handling of the dot, circumflex, and dollar +metacharacters, the handling of #-comments in /x mode, and, when CRLF is a +recognized line ending sequence, the match position advancement for a +non-anchored pattern. There is more detail about this in the +<a href="#matchoptions">section on <b>pcre2_match()</b> options</a> +below. +</P> +<P> +The choice of newline convention does not affect the interpretation of +the \n or \r escape sequences, nor does it affect what \R matches, which has +its own separate control. +</P> +<br><a name="SEC12" href="#TOC1">MULTITHREADING</a><br> +<P> +In a multithreaded application it is important to keep thread-specific data +separate from data that can be shared between threads. The PCRE2 library code +itself is thread-safe: it contains no static or global variables. The API is +designed to be fairly simple for non-threaded applications while at the same +time ensuring that multithreaded applications can use it. +</P> +<P> +There are several different blocks of data that are used to pass information +between the application and the PCRE2 libraries. +</P> +<P> +(1) A pointer to the compiled form of a pattern is returned to the user when +<b>pcre2_compile()</b> is successful. The data in the compiled pattern is fixed, +and does not change when the pattern is matched. Therefore, it is thread-safe, +that is, the same compiled pattern can be used by more than one thread +simultaneously. An application can compile all its patterns at the start, +before forking off multiple threads that use them. However, if the just-in-time +optimization feature is being used, it needs separate memory stack areas for +each thread. See the +<a href="pcre2jit.html"><b>pcre2jit</b></a> +documentation for more details. 
+</P> +<P> +(2) The next section below introduces the idea of "contexts" in which PCRE2 +functions are called. A context is nothing more than a collection of parameters +that control the way PCRE2 operates. Grouping a number of parameters together +in a context is a convenient way of passing them to a PCRE2 function without +using lots of arguments. The parameters that are stored in contexts are in some +sense "advanced features" of the API. Many straightforward applications will +not need to use contexts. +</P> +<P> +In a multithreaded application, if the parameters in a context are values that +are never changed, the same context can be used by all the threads. However, if +any thread needs to change any value in a context, it must make its own +thread-specific copy. +</P> +<P> +(3) The matching functions need a block of memory for working space and for +storing the results of a match. This includes details of what was matched, as +well as additional information such as the name of a (*MARK) setting. Each +thread must provide its own version of this memory. +</P> +<br><a name="SEC13" href="#TOC1">PCRE2 CONTEXTS</a><br> +<P> +Some PCRE2 functions have a lot of parameters, many of which are used only by +specialist applications, for example, those that use custom memory management +or non-standard character tables. To keep function argument lists at a +reasonable size, and at the same time to keep the API extensible, "uncommon" +parameters are passed to certain functions in a <b>context</b> instead of +directly. A context is just a block of memory that holds the parameter values. +Applications that do not need to adjust any of the context parameters can pass +NULL when a context pointer is required. +</P> +<P> +There are three different types of context: a general context that is relevant +for several PCRE2 operations, a compile-time context, and a match-time context. 
+</P> +<br><b> +The general context +</b><br> +<P> +At present, this context just contains pointers to (and data for) external +memory management functions that are called from several places in the PCRE2 +library. The context is named `general' rather than specifically `memory' +because in future other fields may be added. If you do not want to supply your +own custom memory management functions, you do not need to bother with a +general context. A general context is created by: +<b>pcre2_general_context *pcre2_general_context_create(</b> +<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b> +<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b> +<br> +<br> +The two function pointers specify custom memory management functions, whose +prototypes are: +<pre> + <b>void *private_malloc(PCRE2_SIZE, void *);</b> + <b>void private_free(void *, void *);</b> +</pre> +Whenever code in PCRE2 calls these functions, the final argument is the value +of <i>memory_data</i>. Either of the first two arguments of the creation +function may be NULL, in which case the system memory management functions +<i>malloc()</i> and <i>free()</i> are used. (This is not currently useful, as +there are no other fields in a general context, but in future there might be.) +The <i>private_malloc()</i> function is used (if supplied) to obtain memory for +storing the context, and all three values are saved as part of the context. +</P> +<P> +Whenever PCRE2 creates a data block of any kind, the block contains a pointer +to the <i>free()</i> function that matches the <i>malloc()</i> function that was +used. When the time comes to free the block, this function is called. 
+</P> +<P> +A general context can be copied by calling: +<b>pcre2_general_context *pcre2_general_context_copy(</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +The memory used for a general context should be freed by calling: +<b>void pcre2_general_context_free(pcre2_general_context *<i>gcontext</i>);</b> +<a name="compilecontext"></a></P> +<br><b> +The compile context +</b><br> +<P> +A compile context is required if you want to change the default values of any +of the following compile-time parameters: +<pre> + What \R matches (Unicode newlines or CR, LF, CRLF only); + PCRE2's character tables; + The newline character sequence; + The compile time nested parentheses limit; + An external function for stack checking. +</pre> +A compile context is also required if you are using custom memory management. +If none of these apply, just pass NULL as the context argument of +<i>pcre2_compile()</i>. +</P> +<P> +A compile context is created, copied, and freed by the following functions: +<b>pcre2_compile_context *pcre2_compile_context_create(</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_compile_context *pcre2_compile_context_copy(</b> +<b> pcre2_compile_context *<i>ccontext</i>);</b> +<br> +<br> +<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b> +<br> +<br> +A compile context is created with default values for its parameters. These can +be changed by calling the following functions, which return 0 on success, or +PCRE2_ERROR_BADDATA if invalid data is detected. +<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF, +or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line +ending sequence. The value of this parameter does not affect what is compiled; +it is just saved with the compiled pattern. 
The value is used by the JIT +compiler and by the two interpreted matching functions, <i>pcre2_match()</i> and +<i>pcre2_dfa_match()</i>. You can change the value when calling these functions, +but doing so disables the use of JIT. +<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b> +<b> const unsigned char *<i>tables</i>);</b> +<br> +<br> +The value must be the result of a call to <i>pcre2_maketables()</i>, whose only +argument is a general context. This function builds a set of character tables +in the current locale. +<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +This specifies which characters or character sequences are to be recognized as +newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only), +PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character +sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or +PCRE2_NEWLINE_ANY (any Unicode newline sequence). +</P> +<P> +When a pattern is compiled with the PCRE2_EXTENDED option, the value of this +parameter affects the recognition of white space and the end of internal +comments starting with #. The value is saved with the compiled pattern for +subsequent use by the JIT compiler and by the two interpreted matching +functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>. You can change the +value when calling these functions, but doing so disables the use of JIT. +<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +This parameter adjusts the limit, set when PCRE2 is built (default 250), on the +depth of parenthesis nesting in a pattern. This limit stops rogue patterns +using up too much system stack when being compiled. 
+<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b> +<b> int (*<i>guard_function</i>)(uint32_t));</b> +<br> +<br> +There is at least one application that runs PCRE2 in threads with very limited +system stack, where running out of stack is to be avoided at all costs. The +parenthesis limit above cannot take account of how much stack is actually +available. For a finer control, you can supply a function that is called +whenever <b>pcre2_compile()</b> starts to compile a parenthesized part of a +pattern. The argument to the function gives the current depth of nesting. The +function should return zero if all is well, or non-zero to force an error. +<a name="matchcontext"></a></P> +<br><b> +The match context +</b><br> +<P> +A match context is required if you want to change the default values of any +of the following match-time parameters: +<pre> + What \R matches (Unicode newlines or CR, LF, CRLF only); + A callout function; + The limit for calling <i>match()</i>; + The limit for calling <i>match()</i> recursively; + The newline character sequence; +</pre> +A match context is also required if you are using custom memory management. +If none of these apply, just pass NULL as the context argument of +<b>pcre2_match()</b>, <b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>. +Changing the newline value or what \R matches at match time disables the use +of JIT via <b>pcre2_match()</b>. +</P> +<P> +A match context is created, copied, and freed by the following functions: +<b>pcre2_match_context *pcre2_match_context_create(</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_match_context *pcre2_match_context_copy(</b> +<b> pcre2_match_context *<i>mcontext</i>);</b> +<br> +<br> +<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b> +<br> +<br> +A match context is created with default values for its parameters. 
These can +be changed by calling the following functions, which return 0 on success, or +PCRE2_ERROR_BADDATA if invalid data is detected. +<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF, +or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line +ending sequence. If you want to make use of JIT matching, you should not use +this function, but instead set the value in a compile context. +<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b> +<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b> +<b> void *<i>callout_data</i>);</b> +<br> +<br> +This sets up a "callout" function, which PCRE2 will call at specified points +during a matching operation. Details are given in the +<a href="pcre2callout.html"><b>pcre2callout</b></a> +documentation. +<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +The <i>match_limit</i> parameter provides a means of preventing PCRE2 from using +up too many resources when processing patterns that are not going to match, but +which have a very large number of possibilities in their search trees. The +classic example is a pattern that uses nested unlimited repeats. +</P> +<P> +Internally, <b>pcre2_match()</b> uses a function called <b>match()</b>, which it +calls repeatedly (sometimes recursively). The limit set by <i>match_limit</i> is +imposed on the number of times this function is called during a match, which +has the effect of limiting the amount of backtracking that can take place. For +patterns that are not anchored, the count restarts from zero for each position +in the subject string. This limit is not relevant to <b>pcre2_dfa_match()</b>, +which ignores it. 
+</P> +<P> +When <b>pcre2_match()</b> is called with a pattern that was successfully studied +with <b>pcre2_jit_compile()</b>, the way that the matching is executed is +entirely different. However, there is still the possibility of runaway matching +that goes on for a very long time, and so the <i>match_limit</i> value is also +used in this case (but in a different way) to limit how long the matching can +continue. +</P> +<P> +The default value for the limit can be set when PCRE2 is built; the default +default is 10 million, which handles all but the most extreme cases. If the +limit is exceeded, <b>pcre2_match()</b> returns PCRE2_ERROR_MATCHLIMIT. A value +for the match limit may also be supplied by an item at the start of a pattern +of the form +<pre> + (*LIMIT_MATCH=ddd) +</pre> +where ddd is a decimal number. However, such a setting is ignored unless ddd is +less than the limit set by the caller of <b>pcre2_match()</b> or, if no such +limit is set, less than the default. +<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +The <i>recursion_limit</i> parameter is similar to <i>match_limit</i>, but +instead of limiting the total number of times that <b>match()</b> is called, it +limits the depth of recursion. The recursion depth is a smaller number than the +total number of calls, because not all calls to <b>match()</b> are recursive. +This limit is of use only if it is set smaller than <i>match_limit</i>. +</P> +<P> +Limiting the recursion depth limits the amount of system stack that can be +used, or, when PCRE2 has been compiled to use memory on the heap instead of the +stack, the amount of heap memory that can be used. This limit is not relevant, +and is ignored, when matching is done using JIT compiled code or by the +<b>pcre2_dfa_match()</b> function. 
+</P> +<P> +The default value for <i>recursion_limit</i> can be set when PCRE2 is built; the +default default is the same value as the default for <i>match_limit</i>. If the +limit is exceeded, <b>pcre2_match()</b> returns PCRE2_ERROR_RECURSIONLIMIT. A +value for the recursion limit may also be supplied by an item at the start of a +pattern of the form +<pre> + (*LIMIT_RECURSION=ddd) +</pre> +where ddd is a decimal number. However, such a setting is ignored unless ddd is +less than the limit set by the caller of <b>pcre2_match()</b> or, if no such +limit is set, less than the default. +<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b> +<b> uint32_t <i>value</i>);</b> +<br> +<br> +This specifies which characters or character sequences are to be recognized as +newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only), +PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character +sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or +PCRE2_NEWLINE_ANY (any Unicode newline sequence). If you want to make use of +JIT matching, you should not use this function, but instead set the value in a +compile context. +<b>int pcre2_set_recursion_memory_management(</b> +<b> pcre2_match_context *<i>mcontext</i>,</b> +<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b> +<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b> +<br> +<br> +This function sets up two additional custom memory management functions for use +by <b>pcre2_match()</b> when PCRE2 is compiled to use the heap for remembering +backtracking data, instead of recursive function calls that use the system +stack. There is a discussion about PCRE2's stack usage in the +<a href="pcre2stack.html"><b>pcre2stack</b></a> +documentation. See the +<a href="pcre2build.html"><b>pcre2build</b></a> +documentation for details of how to build PCRE2. 
Using the heap for recursion +is a non-standard way of building PCRE2, for use in environments that have +limited stacks. Because of the greater use of memory management, +<b>pcre2_match()</b> runs more slowly. Functions that are different to the +general custom memory functions are provided so that special-purpose external +code can be used for this case, because the memory blocks are all the same +size. The blocks are retained by <b>pcre2_match()</b> until it is about to exit +so that they can be re-used when possible during the match. In the absence of +these functions, the normal custom memory management functions are used, if +supplied, otherwise the system functions. +</P> +<br><a name="SEC14" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br> +<P> +<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>, PCRE2_SIZE <i>length</i>);</b> +</P> +<P> +The function <b>pcre2_config()</b> makes it possible for a PCRE2 client to +discover which optional features have been compiled into the PCRE2 library. The +<a href="pcre2build.html"><b>pcre2build</b></a> +documentation has more details about these optional features. +</P> +<P> +The first argument for <b>pcre2_config()</b> specifies which information is +required. The second argument is a pointer to memory into which the information +is placed, with the final argument giving the length of this memory in bytes. +For calls that return numerical values, <i>where</i> should point to +appropriately aligned memory, with <i>length</i> set to at least the "sizeof" +the data type. +</P> +<P> +The returned value from <b>pcre2_config()</b> is zero on success, or the +negative error code PCRE2_ERROR_BADOPTION if the value in the first argument is +not recognized. The following information is available: +<pre> + PCRE2_CONFIG_BSR +</pre> +The output is an integer whose value indicates what character sequences the \R +escape sequence matches by default. 
A value of 0 means that \R matches any +Unicode line ending sequence; a value of 1 means that \R matches only CR, LF, +or CRLF. The default can be overridden when a pattern is compiled or matched. +<pre> + PCRE2_CONFIG_JIT +</pre> +The output is an integer that is set to one if support for just-in-time +compiling is available; otherwise it is set to zero. +<pre> + PCRE2_CONFIG_JITTARGET +</pre> +FIXME: this needs sorting out once JIT is implemented. +If JIT support is available, the string contains the name of the architecture +for which the JIT compiler is configured, for example "x86 32bit (little endian ++ unaligned)". If JIT support is not available, FIXME. +<pre> + PCRE2_CONFIG_LINKSIZE +</pre> +The output is an integer that contains the number of bytes used for internal +linkage in compiled regular expressions. When PCRE2 is configured, the value +can be set to 2, 3, or 4, with the default being 2. This is the value that is +returned by <b>pcre2_config()</b>. However, when the 16-bit library is compiled, +a value of 3 is rounded up to 4, and when the 32-bit library is compiled, +internal linkages always use 4 bytes, so the configured value is not relevant. +</P> +<P> +The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all +but the most massive patterns, since it allows the size of the compiled pattern +to be up to 64K code units. Larger values allow larger regular expressions to +be compiled by those two libraries, but at the expense of slower matching. +<pre> + PCRE2_CONFIG_MATCHLIMIT +</pre> +The output is an unsigned long integer that gives the default limit for the +number of internal matching function calls in a <b>pcre2_match()</b> execution. +Further details are given with <b>pcre2_match()</b> below. +<pre> + PCRE2_CONFIG_NEWLINE +</pre> +The output is an integer whose value specifies the default character sequence +that is recognized as meaning "newline". 
The values are: +<pre> + 1 Carriage return (CR) + 2 Linefeed (LF) + 3 Carriage return, linefeed (CRLF) + 4 Any Unicode line ending + 5 Any of CR, LF, or CRLF +</pre> +The default should normally correspond to the standard sequence for your +operating system. +<pre> + PCRE2_CONFIG_PARENSLIMIT +</pre> +The output is an unsigned long integer that gives the maximum depth of nesting +of parentheses (of any kind) in a pattern. This limit is imposed to cap the +amount of system stack used when a pattern is compiled. It is specified when +PCRE2 is built; the default is 250. This limit does not take into account the +stack that may already be used by the calling application. For finer control +over compilation stack usage, see <b>pcre2_set_compile_recursion_guard()</b>. +<pre> + PCRE2_CONFIG_RECURSIONLIMIT +</pre> +The output is an unsigned long integer that gives the default limit for the +depth of recursion when calling the internal matching function in a +<b>pcre2_match()</b> execution. Further details are given with +<b>pcre2_match()</b> below. +<pre> + PCRE2_CONFIG_STACKRECURSE +</pre> +The output is an integer that is set to one if internal recursion when running +<b>pcre2_match()</b> is implemented by recursive function calls that use the +system stack to remember their state. This is the usual way that PCRE2 is +compiled. The output is zero if PCRE2 was compiled to use blocks of data on the +heap instead of recursive function calls. +<pre> + PCRE2_CONFIG_UNICODE_VERSION +</pre> +The <i>where</i> argument should point to a buffer that is at least 24 code +units long. If PCRE2 has been compiled without Unicode support, this is filled +with the text "Unicode not supported". Otherwise, the Unicode version string +(for example, "7.0.0") is returned. The string is zero-terminated. +<pre> + PCRE2_CONFIG_UNICODE +</pre> +The output is an integer that is set to one if Unicode support is available; +otherwise it is set to zero. Unicode support implies UTF support. 
+<pre> + PCRE2_CONFIG_VERSION +</pre> +The <i>where</i> argument should point to a buffer that is at least 12 code +units long. It is filled with the PCRE2 version string, zero-terminated. +</P> +<br><a name="SEC15" href="#TOC1">COMPILING A PATTERN</a><br> +<P> +<b>pcre2_code *pcre2_compile(PCRE2_SPTR <i>pattern</i>, PCRE2_SIZE <i>length</i>,</b> +<b> uint32_t <i>options</i>, int *<i>errorcode</i>, PCRE2_SIZE *<i>erroroffset,</i></b> +<b> pcre2_compile_context *<i>ccontext</i>);</b> +<br> +<br> +<b>pcre2_code_free(pcre2_code *<i>code</i>);</b> +</P> +<P> +This function compiles a pattern, defined by a pointer to a string of code +units and a length, into an internal form. If the pattern is zero-terminated, +the length should be specified as PCRE2_ZERO_TERMINATED. The function returns a +pointer to a block of memory that contains the compiled pattern and related +data. The caller must free the memory by calling <b>pcre2_code_free()</b> when +it is no longer needed. +</P> +<P> +If the compile context argument <i>ccontext</i> is NULL, the memory is obtained +by calling <b>malloc()</b>. Otherwise, it is obtained from the same memory +function that was used for the compile context. +</P> +<P> +The <i>options</i> argument contains various bit settings that affect the +compilation. It should be zero if no options are required. The available +options are described below. Some of them (in particular, those that are +compatible with Perl, but some others as well) can also be set and unset from +within the pattern (see the detailed description in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +documentation). +</P> +<P> +For those options that can be different in different parts of the pattern, the +contents of the <i>options</i> argument specifies their settings at the start of +compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and +PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as well as +at compile time. 
+</P> +<P> +Other, less frequently required compile-time parameters (for example, the +newline setting) can be provided in a compile context (as described +<a href="#compilecontext">above).</a> +</P> +<P> +If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> returns +NULL immediately. Otherwise, if compilation of a pattern fails, +<b>pcre2_compile()</b> returns NULL, having set these variables to an error code +and an offset (number of code units) within the pattern, respectively. The +<b>pcre2_get_error_message()</b> function provides a textual message for each +error code. Compilation errors are positive numbers, but UTF formatting errors +are negative numbers. For an invalid UTF-8 or UTF-16 string, the offset is that +of the first code unit of the failing character. +</P> +<P> +Some errors are not detected until the whole pattern has been scanned; in these +cases, the offset passed back is the length of the pattern. Note that the +offset is in code units, not characters, even in a UTF mode. It may sometimes +point into the middle of a UTF-8 or UTF-16 character. +</P> +<P> +This code fragment shows a typical straightforward call to +<b>pcre2_compile()</b>: +<pre> + pcre2_code *re; + PCRE2_SIZE erroffset; + int errorcode; + re = pcre2_compile( + "^A.*Z", /* the pattern */ + PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */ + 0, /* default options */ + &errorcode, /* for error code */ + &erroffset, /* for error offset */ + NULL); /* no compile context */ +</pre> +The following names for option bits are defined in the <b>pcre2.h</b> header +file: +<pre> + PCRE2_ANCHORED +</pre> +If this bit is set, the pattern is forced to be "anchored", that is, it is +constrained to match only at the first matching point in the string that is +being searched (the "subject string"). This effect can also be achieved by +appropriate constructs in the pattern itself, which is the only way to do it in +Perl. 
+<pre> + PCRE2_ALLOW_EMPTY_CLASS +</pre> +By default, for compatibility with Perl, a closing square bracket that +immediately follows an opening one is treated as a data character for the +class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which +therefore contains no characters and so can never match. +<pre> + PCRE2_ALT_BSUX +</pre> +This option requests alternative handling of three escape sequences, which +makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set: +</P> +<P> +(1) \U matches an upper case "U" character; by default \U causes a compile +time error (Perl uses \U to upper case subsequent characters). +</P> +<P> +(2) \u matches a lower case "u" character unless it is followed by four +hexadecimal digits, in which case the hexadecimal number defines the code point +to match. By default, \u causes a compile time error (Perl uses it to upper +case the following character). +</P> +<P> +(3) \x matches a lower case "x" character unless it is followed by two +hexadecimal digits, in which case the hexadecimal number defines the code point +to match. By default, as in Perl, a hexadecimal number is always expected after +\x, but it may have zero, one, or two digits (so, for example, \xz matches a +binary zero character followed by z). +<pre> + PCRE2_AUTO_CALLOUT +</pre> +If this bit is set, <b>pcre2_compile()</b> automatically inserts callout items, +all with number 255, before each pattern item. For discussion of the callout +facility, see the +<a href="pcre2callout.html"><b>pcre2callout</b></a> +documentation. +<pre> + PCRE2_CASELESS +</pre> +If this bit is set, letters in the pattern match both upper and lower case +letters in the subject. It is equivalent to Perl's /i option, and it can be +changed within a pattern by a (?i) option setting. +<pre> + PCRE2_DOLLAR_ENDONLY +</pre> +If this bit is set, a dollar metacharacter in the pattern matches only at the +end of the subject string. 
Without this option, a dollar also matches +immediately before a newline at the end of the string (but not before any other +newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is +set. There is no equivalent to this option in Perl, and no way to set it within +a pattern. +<pre> + PCRE2_DOTALL +</pre> +If this bit is set, a dot metacharacter in the pattern matches any character, +including one that indicates a newline. However, it only ever matches one +character, even if newlines are coded as CRLF. Without this option, a dot does +not match when the current position in the subject is at a newline. This option +is equivalent to Perl's /s option, and it can be changed within a pattern by a +(?s) option setting. A negative class such as [^a] always matches newline +characters, independent of the setting of this option. +<pre> + PCRE2_DUPNAMES +</pre> +If this bit is set, names used to identify capturing subpatterns need not be +unique. This can be helpful for certain types of pattern when it is known that +only one instance of the named subpattern can ever be matched. There are more +details of named subpatterns below; see also the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +documentation. +<pre> + PCRE2_EXTENDED +</pre> +If this bit is set, most white space characters in the pattern are totally +ignored except when escaped or inside a character class. However, white space +is not allowed within sequences such as (?> that introduce various +parenthesized subpatterns, nor within numerical quantifiers such as {1,3}. +Ignorable white space is permitted between an item and a following quantifier +and between a quantifier and a following + that indicates possessiveness. +</P> +<P> +PCRE2_EXTENDED also causes characters between an unescaped # outside a +character class and the next newline, inclusive, to be ignored, which makes it +possible to include comments inside complicated patterns. 
Note that the end of +this type of comment is a literal newline sequence in the pattern; escape +sequences that happen to represent a newline do not count. PCRE2_EXTENDED is +equivalent to Perl's /x option, and it can be changed within a pattern by a +(?x) option setting. +</P> +<P> +Which characters are interpreted as newlines can be specified by a setting in +the compile context that is passed to <b>pcre2_compile()</b> or by a special +sequence at the start of the pattern, as described in the section entitled +<a href="pcre2pattern.html#newlines">"Newline conventions"</a> +in the <b>pcre2pattern</b> documentation. A default is defined when PCRE2 is +built. +<pre> + PCRE2_FIRSTLINE +</pre> +If this option is set, an unanchored pattern is required to match before or at +the first newline in the subject string, though the matched text may continue +over the newline. +<pre> + PCRE2_MATCH_UNSET_BACKREF +</pre> +If this option is set, a back reference to an unset subpattern group matches an +empty string (by default this causes the current matching alternative to fail). +A pattern such as (\1)(a) succeeds when this option is set (assuming it can +find an "a" in the subject), whereas it fails by default, for Perl +compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka +JavaScript). +<pre> + PCRE2_MULTILINE +</pre> +By default, for the purposes of matching "start of line" and "end of line", +PCRE2 treats the subject string as consisting of a single line of characters, +even if it actually contains newlines. The "start of line" metacharacter (^) +matches only at the start of the string, and the "end of line" metacharacter +($) matches only at the end of the string, or before a terminating newline +(except when PCRE2_DOLLAR_ENDONLY is set). Note, however, that unless +PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a +newline. This behaviour (for ^, $, and dot) is the same as Perl. 
+</P> +<P> +When PCRE2_MULTILINE is set, the "start of line" and "end of line" +constructs match immediately following or immediately before internal newlines +in the subject string, respectively, as well as at the very start and end. This +is equivalent to Perl's /m option, and it can be changed within a pattern by a +(?m) option setting. If there are no newlines in a subject string, or no +occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. +<pre> + PCRE2_NEVER_UCP +</pre> +This option locks out the use of Unicode properties for handling \B, \b, \D, +\d, \S, \s, \W, \w, and some of the POSIX character classes, as described +for the PCRE2_UCP option below. In particular, it prevents the creator of the +pattern from enabling this facility by starting the pattern with (*UCP). This +may be useful in applications that process patterns from external sources. The +option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error. +<pre> + PCRE2_NEVER_UTF +</pre> +This option locks out interpretation of the pattern as UTF-8, UTF-16, or +UTF-32, depending on which library is in use. In particular, it prevents the +creator of the pattern from switching to UTF interpretation by starting the +pattern with (*UTF). This may be useful in applications that process patterns +from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes +an error. +<pre> + PCRE2_NO_AUTO_CAPTURE +</pre> +If this option is set, it disables the use of numbered capturing parentheses in +the pattern. Any opening parenthesis that is not followed by ? behaves as if it +were followed by ?: but named parentheses can still be used for capturing (and +they acquire numbers in the usual way). There is no equivalent of this option +in Perl. +<pre> + PCRE2_NO_AUTO_POSSESS +</pre> +If this option is set, it disables "auto-possessification", which is an +optimization that, for example, turns a+b into a++b in order to avoid +backtracks into a+ that can never be successful. 
However, if callouts are in +use, auto-possessification means that some callouts are never taken. You can +set this option if you want the matching functions to do a full unoptimized +search and run all the callouts, but it is mainly provided for testing +purposes. +<pre> + PCRE2_NO_START_OPTIMIZE +</pre> +This is an option that acts at matching time; that is, it is really an option +for <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. If it is set at compile +time, it is remembered with the compiled pattern and assumed at matching time. +This is necessary if you want to use JIT execution, because the JIT compiler +needs to know whether or not this option is set. For details, see the +discussion of PCRE2_NO_START_OPTIMIZE in the section on <b>pcre2_match()</b> +options +<a href="#matchoptions">below.</a> +<pre> + PCRE2_NO_UTF_CHECK +</pre> +When PCRE2_UTF is set, the validity of the pattern as a UTF string is +automatically checked. There are discussions about the validity of +<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a> +<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a> +and +<a href="pcre2unicode.html#utf32strings">UTF-32 strings</a> +in the +<a href="pcre2unicode.html"><b>pcre2unicode</b></a> +document. +If an invalid UTF sequence is found, <b>pcre2_compile()</b> returns a negative +error code. +</P> +<P> +If you know that your pattern is valid, and you want to skip this check for +performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set, +the effect of passing an invalid UTF string as a pattern is undefined. It may +cause your program to crash or loop. Note that this option can also be passed +to <b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>, to suppress validity +checking of the subject string. +<pre> + PCRE2_UCP +</pre> +This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W, +\w, and some of the POSIX character classes. 
By default, only ASCII characters +are recognized, but if PCRE2_UCP is set, Unicode properties are used instead to +classify characters. More details are given in the section on +<a href="pcre2pattern.html#genericchartypes">generic character types</a> +in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +page. If you set PCRE2_UCP, matching one of the items it affects takes much +longer. The option is available only if PCRE2 has been compiled with UTF +support. +<pre> + PCRE2_UNGREEDY +</pre> +This option inverts the "greediness" of the quantifiers so that they are not +greedy by default, but become greedy if followed by "?". It is not compatible +with Perl. It can also be set by a (?U) option setting within the pattern. +<pre> + PCRE2_UTF +</pre> +This option causes PCRE2 to regard both the pattern and the subject strings +that are subsequently processed as strings of UTF characters instead of +single-code-unit strings. However, it is available only when PCRE2 is built to +include UTF support. If not, the use of this option provokes an error. Details +of how this option changes the behaviour of PCRE2 are given in the +<a href="pcre2unicode.html"><b>pcre2unicode</b></a> +page. +</P> +<br><a name="SEC16" href="#TOC1">COMPILATION ERROR CODES</a><br> +<P> +There are over 80 positive error codes that <b>pcre2_compile()</b> may return if +it finds an error in the pattern. There are also some negative error codes that +are used for invalid UTF strings. These are the same as given by +<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>, and are described in the +<a href="pcre2unicode.html"><b>pcre2unicode</b></a> +page. The <b>pcre2_get_error_message()</b> function can be called to obtain a +textual error message from any error code. 
+</P> +<br><a name="SEC17" href="#TOC1">JUST-IN-TIME (JIT) COMPILATION</a><br> +<P> +<b>int pcre2_jit_compile(pcre2_code *<i>code</i>, uint32_t <i>options</i>);</b> +<br> +<br> +<b>int pcre2_jit_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> +<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> +<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> +<b> pcre2_match_context *<i>mcontext</i>, pcre2_jit_stack *<i>jit_stack</i>);</b> +<br> +<br> +<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *<i>gcontext</i>,</b> +<b> PCRE2_SIZE <i>startsize</i>, PCRE2_SIZE <i>maxsize</i>);</b> +<br> +<br> +<b>void pcre2_jit_stack_assign(const pcre2_code *<i>code</i>,</b> +<b> pcre2_jit_callback <i>callback_function</i>, void *<i>callback_data</i>);</b> +<br> +<br> +<b>void pcre2_jit_stack_free(pcre2_jit_stack *<i>jit_stack</i>);</b> +</P> +<P> +These functions provide support for JIT compilation, which, if the just-in-time +compiler is available, further processes a compiled pattern into machine code +that executes much faster than the <b>pcre2_match()</b> interpretive matching +function. Full details are given in the +<a href="pcre2jit.html"><b>pcre2jit</b></a> +documentation. +</P> +<P> +JIT compilation is a heavyweight optimization. It can take some time for +patterns to be analyzed, and for one-off matches and simple patterns the +benefit of faster execution might be offset by a much slower compilation time. +Most, but not all patterns can be optimized by the JIT compiler. +<a name="localesupport"></a></P> +<br><a name="SEC18" href="#TOC1">LOCALE SUPPORT</a><br> +<P> +PCRE2 handles caseless matching, and determines whether characters are letters, +digits, or whatever, by reference to a set of tables, indexed by character code +point. 
When running in UTF-8 mode, or using the 16-bit or 32-bit libraries, +this applies only to characters with code points less than 256. By default, +higher-valued code points never match escapes such as \w or \d. However, if +PCRE2 is built with UTF support, all characters can be tested with \p and \P, +or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled; +this causes \w and friends to use Unicode property support instead of the +built-in tables. +</P> +<P> +The use of locales with Unicode is discouraged. If you are handling characters +with code points greater than 128, you should either use Unicode support, or +use locales, but not try to mix the two. +</P> +<P> +PCRE2 contains an internal set of character tables that are used by default. +These are sufficient for many applications. Normally, the internal tables +recognize only ASCII characters. However, when PCRE2 is built, it is possible +to cause the internal tables to be rebuilt in the default "C" locale of the +local system, which may cause them to be different. +</P> +<P> +The internal tables can be overridden by tables supplied by the application +that calls PCRE2. These may be created in a different locale from the default. +As more and more applications change to using Unicode, the need for this locale +support is expected to die away. +</P> +<P> +External tables are built by calling the <b>pcre2_maketables()</b> function, in +the relevant locale. The result can be passed to <b>pcre2_compile()</b> as often +as necessary, by creating a compile context and calling +<b>pcre2_set_character_tables()</b> to set the tables pointer therein. 
For +example, to build and use tables that are appropriate for the French locale +(where accented characters with values greater than 128 are treated as +letters), the following code could be used: +<pre> + setlocale(LC_CTYPE, "fr_FR"); + tables = pcre2_maketables(NULL); + ccontext = pcre2_compile_context_create(NULL); + pcre2_set_character_tables(ccontext, tables); + re = pcre2_compile(..., ccontext); +</pre> +The locale name "fr_FR" is used on Linux and other Unix-like systems; if you +are using Windows, the name for the French locale is "french". It is the +caller's responsibility to ensure that the memory containing the tables remains +available for as long as it is needed. +</P> +<P> +The pointer that is passed (via the compile context) to <b>pcre2_compile()</b> +is saved with the compiled pattern, and the same tables are used by +<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>. Thus, for any single pattern, +compilation and matching both happen in the same locale, but different patterns +can be processed in different locales. +<a name="infoaboutpattern"></a></P> +<br><a name="SEC19" href="#TOC1">INFORMATION ABOUT A COMPILED PATTERN</a><br> +<P> +<b>int pcre2_pattern_info(const pcre2_code *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b> +</P> +<P> +The <b>pcre2_pattern_info()</b> function returns information about a compiled +pattern. The first argument is a pointer to the compiled pattern. The second +argument specifies which piece of information is required, and the third +argument is a pointer to a variable to receive the data. 
The yield of the +function is zero for success, or one of the following negative numbers: +<pre> + PCRE2_ERROR_NULL the argument <i>code</i> was NULL + the argument <i>where</i> was NULL + PCRE2_ERROR_BADMAGIC the "magic number" was not found + PCRE2_ERROR_BADOPTION the value of <i>what</i> was invalid + PCRE2_ERROR_UNSET the requested field is not set +</pre> +The "magic number" is placed at the start of each compiled pattern as a simple +check against passing an arbitrary memory pointer. +Here is +a typical call of <b>pcre2_pattern_info()</b>, to obtain the length of the compiled +pattern: +<pre> + int rc; + size_t length; + rc = pcre2_pattern_info( + re, /* result of pcre2_compile() */ + PCRE2_INFO_SIZE, /* what is required */ + &length); /* where to put the data */ +</pre> +The possible values for the second argument are defined in <b>pcre2.h</b>, and +are as follows: +<pre> + PCRE2_INFO_ALLOPTIONS + PCRE2_INFO_ARGOPTIONS +</pre> +Return a copy of the pattern's options. The third argument should point to a +<b>uint32_t</b> variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that +were passed to <b>pcre2_compile()</b>, whereas PCRE2_INFO_ALLOPTIONS returns +the compile options as modified by any top-level option settings at the start +of the pattern itself. In other words, they are the options that will be in +force when matching starts. For example, if the pattern /(?im)abc(?-i)d/ is +compiled with the PCRE2_EXTENDED option, the result is PCRE2_CASELESS, +PCRE2_MULTILINE, and PCRE2_EXTENDED. +</P> +<P> +A pattern is automatically anchored by PCRE2 if all of its top-level +alternatives begin with one of the following: +<pre> + ^ unless PCRE2_MULTILINE is set + \A always + \G always + .* if PCRE2_DOTALL is set and there are no back references to the subpattern in which .* appears +</pre> +For such patterns, the PCRE2_ANCHORED bit is set in the options returned for +PCRE2_INFO_ALLOPTIONS. 
+<pre> + PCRE2_INFO_BACKREFMAX +</pre> +Return the number of the highest back reference in the pattern. The third +argument should point to an <b>uint32_t</b> variable. Zero is returned if there +are no back references. +<pre> + PCRE2_INFO_BSR +</pre> +The output is a uint32_t whose value indicates what character sequences the \R +escape sequence matches by default. A value of 0 means that \R matches any +Unicode line ending sequence; a value of 1 means that \R matches only CR, LF, +or CRLF. The default can be overridden when a pattern is matched. +<pre> + PCRE2_INFO_CAPTURECOUNT +</pre> +Return the number of capturing subpatterns in the pattern. The third argument +should point to an <b>uint32_t</b> variable. +<pre> + PCRE2_INFO_FIRSTCODETYPE +</pre> +Return information about the first code unit of any matched string, for a +non-anchored pattern. The third argument should point to an <b>uint32_t</b> +variable. +</P> +<P> +If there is a fixed first value, for example, the letter "c" from a pattern +such as (cat|cow|coyote), 1 is returned, and the character value can be +retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, and +if either +<br> +<br> +(a) the pattern was compiled with the PCRE2_MULTILINE option, and every branch +starts with "^", or +<br> +<br> +(b) every branch of the pattern starts with ".*" and PCRE2_DOTALL is not set +(if it were set, the pattern would be anchored), +<br> +<br> +2 is returned, indicating that the pattern matches only at the start of a +subject string or after any newline within the string. Otherwise 0 is +returned. For anchored patterns, 0 is returned. +<pre> + PCRE2_INFO_FIRSTCODEUNIT +</pre> +Return the value of the first code unit of any matched string in the situation +where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third +argument should point to an <b>uint32_t</b> variable. In the 8-bit library, the +value is always less than 256. In the 16-bit library the value can be up to +0xffff. 
In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, +and up to 0xffffffff when not using UTF-32 mode. +<pre> + PCRE2_INFO_FIRSTBITMAP +</pre> +In the absence of a single first code unit for a non-anchored pattern, +<b>pcre2_compile()</b> may construct a 256-bit table that defines a fixed set of +values for the first code unit in any match. For example, a pattern that starts +with [abc] results in a table with three bits set. When code unit values +greater than 255 are supported, the flag bit for 255 means "any code unit of +value 255 or above". If such a table was constructed, a pointer to it is +returned. Otherwise NULL is returned. The third argument should point to a +<b>const uint8_t *</b> variable. +<pre> + PCRE2_INFO_HASCRORLF +</pre> +Return 1 if the pattern contains any explicit matches for CR or LF characters, +otherwise 0. The third argument should point to an <b>uint32_t</b> variable. An +explicit match is either a literal CR or LF character, or \r or \n. +<pre> + PCRE2_INFO_JCHANGED +</pre> +Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise +0. The third argument should point to an <b>uint32_t</b> variable. (?J) and +(?-J) set and unset the local PCRE2_DUPNAMES option, respectively. +<pre> + PCRE2_INFO_JITSIZE +</pre> +If the compiled pattern was successfully processed by +<b>pcre2_jit_compile()</b>, return the size of the JIT compiled code, otherwise +return zero. The third argument should point to a <b>size_t</b> variable. +<pre> + PCRE2_INFO_LASTCODETYPE +</pre> +Returns 1 if there is a rightmost literal code unit that must exist in any +matched string, other than at its start. The third argument should point to an +<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is +returned, the code unit value itself can be retrieved using +PCRE2_INFO_LASTCODEUNIT. +</P> +<P> +For anchored patterns, a last literal value is recorded only if it follows +something of variable length. 
For example, for the pattern /^a\d+z\d+/ the +returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for +/^a\dz\d/ the returned value is 0. +<pre> + PCRE2_INFO_LASTCODEUNIT +</pre> +Return the value of the rightmost literal data unit that must exist in any +matched string, other than at its start, if such a value has been recorded. The +third argument should point to an <b>uint32_t</b> variable. If there is no such +value, 0 is returned. +<pre> + PCRE2_INFO_MATCHEMPTY +</pre> +Return 1 if the pattern can match an empty string, otherwise 0. The third +argument should point to an <b>uint32_t</b> variable. +<pre> + PCRE2_INFO_MATCHLIMIT +</pre> +If the pattern set a match limit by including an item of the form +(*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument +should point to an unsigned 32-bit integer. If no such value has been set, the +call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. +<pre> + PCRE2_INFO_MAXLOOKBEHIND +</pre> +Return the number of characters (not code units) in the longest lookbehind +assertion in the pattern. The third argument should point to an unsigned 32-bit +integer. This information is useful when doing multi-segment matching using the +partial matching facilities. Note that the simple assertions \b and \B +require a one-character lookbehind. \A also registers a one-character +lookbehind, though it does not actually inspect the previous character. This is +to ensure that at least one character from the old segment is retained when a +new segment is processed. Otherwise, if there are no lookbehinds in the +pattern, \A might match incorrectly at the start of a new segment. +<pre> + PCRE2_INFO_MINLENGTH +</pre> +If a minimum length for matching subject strings was computed, its value is +returned. Otherwise the returned value is 0. The value is a number of +characters, which in UTF mode may be different from the number of code units. 
+The third argument should point to an <b>uint32_t</b> variable. The value is a +lower bound to the length of any matching string. There may not be any strings +of that length that do actually match, but every string that does match is at +least that long. +<pre> + PCRE2_INFO_NAMECOUNT + PCRE2_INFO_NAMEENTRYSIZE + PCRE2_INFO_NAMETABLE +</pre> +PCRE2 supports the use of named as well as numbered capturing parentheses. The +names are just an additional way of identifying the parentheses, which still +acquire numbers. Several convenience functions such as +<b>pcre2_substring_get_byname()</b> are provided for extracting captured +substrings by name. It is also possible to extract the data directly, by first +converting the name to a number in order to access the correct pointers in the +output vector (described with <b>pcre2_match()</b> below). To do the conversion, +you need to use the name-to-number map, which is described by these three +values. +</P> +<P> +The map consists of a number of fixed-size entries. PCRE2_INFO_NAMECOUNT gives +the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each +entry; both of these return a <b>uint32_t</b> value. The entry size depends on +the length of the longest name. PCRE2_INFO_NAMETABLE returns a pointer to the +first entry of the table. This is a PCRE2_SPTR pointer to a block of code +units. In the 8-bit library, the first two bytes of each entry are the number +of the capturing parenthesis, most significant byte first. In the 16-bit +library, the pointer points to 16-bit data units, the first of which contains +the parenthesis number. In the 32-bit library, the pointer points to 32-bit +data units, the first of which contains the parenthesis number. The rest of the +entry is the corresponding name, zero terminated. +</P> +<P> +The names are in alphabetical order. 
If (?| is used to create multiple groups +with the same number, as described in the +<a href="pcre2pattern.html#dupsubpatternnumber">section on duplicate subpattern numbers</a> +in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +page, the groups may be given the same name, but there is only one entry in the +table. Different names for groups of the same number are not permitted. +</P> +<P> +Duplicate names for subpatterns with different numbers are permitted, but only +if PCRE2_DUPNAMES is set. They appear in the table in the order in which they +were found in the pattern. In the absence of (?| this is the order of +increasing number; when (?| is used this is not necessarily the case because +later subpatterns may have lower numbers. +</P> +<P> +As a simple example of the name/number table, consider the following pattern +after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white +space - including newlines - is ignored): +<pre> + (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) ) +</pre> +There are four named subpatterns, so the table has four entries, and each entry +in the table is eight bytes long. The table is as follows, with non-printing +bytes shown in hexadecimal, and undefined bytes shown as ??: +<pre> + 00 01 d a t e 00 ?? + 00 05 d a y 00 ?? ?? + 00 04 m o n t h 00 + 00 02 y e a r 00 ?? +</pre> +When writing code to extract data from named subpatterns using the +name-to-number map, remember that the length of the entries is likely to be +different for each compiled pattern. +<pre> + PCRE2_INFO_NEWLINE +</pre> +The output is a <b>uint32_t</b> whose value specifies the default character +sequence that will be recognized as meaning "newline" while matching. The +values are: +<pre> + 1 Carriage return (CR) + 2 Linefeed (LF) + 3 Carriage return, linefeed (CRLF) + 4 Any Unicode line ending + 5 Any of CR, LF, or CRLF +</pre> +The default can be overridden when a pattern is matched. 
+<pre> + PCRE2_INFO_RECURSIONLIMIT +</pre> +If the pattern set a recursion limit by including an item of the form +(*LIMIT_RECURSION=nnnn) at the start, the value is returned. The third +argument should point to an unsigned 32-bit integer. If no such value has been +set, the call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. +<pre> + PCRE2_INFO_SIZE +</pre> +Return the size of the compiled pattern in bytes (for all three libraries). The +third argument should point to a <b>size_t</b> variable. This value does not +include the size of the <b>pcre2_code</b> structure that is returned by +<b>pcre2_compile()</b>. The value that is used when <b>pcre2_compile()</b> is +getting memory in which to place the compiled data is the value returned by +this option plus the size of the <b>pcre2_code</b> structure. Processing a +pattern with the JIT compiler does not alter the value returned by this option. +<a name="matchdatablock"></a></P> +<br><a name="SEC20" href="#TOC1">THE MATCH DATA BLOCK</a><br> +<P> +<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>pcre2_match_data_create_from_pattern(pcre2_code *<i>code</i>,</b> +<b> pcre2_general_context *<i>gcontext</i>);</b> +<br> +<br> +<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b> +</P> +<P> +Information about successful and unsuccessful matches is placed in a match +data block, which is an opaque structure that is accessed by function calls. In +particular, the match data block contains a vector of offsets into the subject +string that define the matched part of the subject and any substrings that were +captured. This is known as the <i>ovector</i>. +</P> +<P> +Before calling <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b> you must create a +match data block by calling one of the creation functions above. 
For +<b>pcre2_match_data_create()</b>, the first argument is the number of pairs of +offsets in the <i>ovector</i>. One pair of offsets is required to identify the +string that matched the whole pattern, with another pair for each captured +substring. For example, a value of 4 creates enough space to record the +matched portion of the subject plus three captured substrings. +</P> +<P> +For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a +pointer to a compiled pattern. In this case the ovector is created to be +exactly the right size to hold all the substrings a pattern might capture. +</P> +<P> +The second argument of both these functions is a pointer to a general context, +which can specify custom memory management for obtaining the memory for the +match data block. If you are not using custom memory management, pass NULL. +</P> +<P> +A match data block can be used many times, with the same or different compiled +patterns. When it is no longer needed, it should be freed by calling +<b>pcre2_match_data_free()</b>. How to extract information from a match data +block after a match operation is described in the sections on +<a href="#matchedstrings">matched strings</a> +and +<a href="#matchotherdata">other match data</a> +below. +</P> +<br><a name="SEC21" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br> +<P> +<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> +<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> +<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> +<b> pcre2_match_context *<i>mcontext</i>);</b> +</P> +<P> +The function <b>pcre2_match()</b> is called to match a subject string against a +compiled pattern, which is passed in the <i>code</i> argument. 
You can call +<b>pcre2_match()</b> with the same <i>code</i> argument as many times as you +like, in order to find multiple matches in the subject string or to match +different subject strings with the same pattern. +</P> +<P> +This function is the main matching facility of the library, and it operates in +a Perl-like manner. For specialist use there is also an alternative matching +function, which is described +<a href="#dfamatch">below</a> +in the section about the <b>pcre2_dfa_match()</b> function. +</P> +<P> +Here is an example of a simple call to <b>pcre2_match()</b>: +<pre> + pcre2_match_data *md = pcre2_match_data_create(4, NULL); + int rc = pcre2_match( + re, /* result of pcre2_compile() */ + "some string", /* the subject string */ + 11, /* the length of the subject string */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, /* the match data block */ + NULL); /* a match context; NULL means use defaults */ +</pre> +If the subject string is zero-terminated, the length can be given as +PCRE2_ZERO_TERMINATED. A match context must be provided if certain less common +matching parameters are to be changed. For details, see the section on +<a href="#matchcontext">the match context</a> +above. +</P> +<br><b> +The string to be matched by <b>pcre2_match()</b> +</b><br> +<P> +The subject string is passed to <b>pcre2_match()</b> as a pointer in +<i>subject</i>, a length in <i>length</i>, and a starting offset in +<i>startoffset</i>. The length and offset are in code units, not characters. +That is, they are in bytes for the 8-bit library, 16-bit code units for the +16-bit library, and 32-bit code units for the 32-bit library, whether or not +UTF processing is enabled. +</P> +<P> +If <i>startoffset</i> is greater than the length of the subject, +<b>pcre2_match()</b> returns PCRE2_ERROR_BADOFFSET. When the starting offset is +zero, the search for a match starts at the beginning of the subject, and this +is by far the most common case. 
In UTF-8 or UTF-16 mode, the starting offset +must point to the start of a character, or to the end of the subject (in UTF-32 +mode, one code unit equals one character, so all offsets are valid). Like the +pattern string, the subject may contain binary zeroes. +</P> +<P> +A non-zero starting offset is useful when searching for another match in the +same subject by calling <b>pcre2_match()</b> again after a previous success. +Setting <i>startoffset</i> differs from passing over a shortened string and +setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of +lookbehind. For example, consider the pattern +<pre> + \Biss\B +</pre> +which finds occurrences of "iss" in the middle of words. (\B matches only if +the current position in the subject is not a word boundary.) When applied to +the string "Mississippi" the first call to <b>pcre2_match()</b> finds the first +occurrence. If <b>pcre2_match()</b> is called again with just the remainder of +the subject, namely "issippi", it does not match, because \B is always false at +the start of the subject, which is deemed to be a word boundary. However, if +<b>pcre2_match()</b> is passed the entire string again, but with +<i>startoffset</i> set to 4, it finds the second occurrence of "iss" because it +is able to look behind the starting point to discover that it is preceded by a +letter. +</P> +<P> +Finding all the matches in a subject is tricky when the pattern can match an +empty string. It is possible to emulate Perl's /g behaviour by first trying the +match again at the same offset, with the PCRE2_NOTEMPTY_ATSTART and +PCRE2_ANCHORED options, and then if that fails, advancing the starting offset +and trying an ordinary match again. There is some code that demonstrates how to +do this in the +<a href="pcre2demo.html"><b>pcre2demo</b></a> +sample program. 
In the most general case, you have to check to see if the +newline convention recognizes CRLF as a newline, and if so, and the current +character is CR followed by LF, advance the starting offset by two characters +instead of one. +</P> +<P> +If a non-zero starting offset is passed when the pattern is anchored, one +attempt to match at the given offset is made. This can only succeed if the +pattern does not require the match to be at the start of the subject. +<a name="matchoptions"></a></P> +<br><b> +Option bits for <b>pcre2_match()</b> +</b><br> +<P> +The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be +zero. The only bits that may be set are PCRE2_ANCHORED, +PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, +PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and +PCRE2_PARTIAL_SOFT. Their action is described below. +</P> +<P> +If the pattern was successfully processed by the just-in-time (JIT) compiler, +the only supported options for matching using the JIT code are PCRE2_NOTBOL, +PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, +PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. If an unsupported option is used, +JIT matching is disabled and the normal interpretive code in +<b>pcre2_match()</b> is run. +<pre> + PCRE2_ANCHORED +</pre> +The PCRE2_ANCHORED option limits <b>pcre2_match()</b> to matching at the first +matching position. If a pattern was compiled with PCRE2_ANCHORED, or turned out +to be anchored by virtue of its contents, it cannot be made unanchored at +matching time. Note that setting the option at match time disables JIT +matching. +<pre> + PCRE2_NOTBOL +</pre> +This option specifies that the first character of the subject string is not the +beginning of a line, so the circumflex metacharacter should not match before +it. Setting this without PCRE2_MULTILINE (at compile time) causes circumflex +never to match. 
This option affects only the behaviour of the circumflex +metacharacter. It does not affect \A. +<pre> + PCRE2_NOTEOL +</pre> +This option specifies that the end of the subject string is not the end of a +line, so the dollar metacharacter should not match it nor (except in multiline +mode) a newline immediately before it. Setting this without PCRE2_MULTILINE (at +compile time) causes dollar never to match. This option affects only the +behaviour of the dollar metacharacter. It does not affect \Z or \z. +<pre> + PCRE2_NOTEMPTY +</pre> +An empty string is not considered to be a valid match if this option is set. If +there are alternatives in the pattern, they are tried. If all the alternatives +match the empty string, the entire match fails. For example, if the pattern +<pre> + a?b? +</pre> +is applied to a string not beginning with "a" or "b", it matches an empty +string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not +valid, so PCRE2 searches further into the string for occurrences of "a" or "b". +<pre> + PCRE2_NOTEMPTY_ATSTART +</pre> +This is like PCRE2_NOTEMPTY, except that an empty string match that is not at +the start of the subject is permitted. If the pattern is anchored, such a match +can occur only if the pattern contains \K. +<pre> + PCRE2_NO_START_OPTIMIZE +</pre> +There are a number of optimizations that <b>pcre2_match()</b> uses at the start +of a match, in order to speed up the process. For example, if it is known that +an unanchored match must start with a specific character, it searches the +subject for that character, and fails immediately if it cannot find it, without +actually running the main matching function. This means that a special item +such as (*COMMIT) at the start of a pattern is not considered until after a +suitable starting point for the match has been found. Also, when callouts or +(*MARK) items are in use, these "start-up" optimizations can cause them to be +skipped if the pattern is never actually used. 
The start-up optimizations are +in effect a pre-scan of the subject that takes place before the pattern is run. +</P> +<P> +The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, +possibly causing performance to suffer, but ensuring that in cases where the +result is "no match", the callouts do occur, and that items such as (*COMMIT) +and (*MARK) are considered at every possible starting position in the subject +string. If PCRE2_NO_START_OPTIMIZE is set at compile time, it cannot be unset +at matching time. The use of PCRE2_NO_START_OPTIMIZE at matching time (that is, +passing it to <b>pcre2_match()</b>) disables JIT execution; in this situation, +matching is always done interpretively. +</P> +<P> +Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching operation. +Consider the pattern +<pre> + (*COMMIT)ABC +</pre> +When this is compiled, PCRE2 records the fact that a match must start with the +character "A". Suppose the subject string is "DEFABC". The start-up +optimization scans along the subject, finds "A" and runs the first match +attempt from there. The (*COMMIT) item means that the pattern must match at the +current starting position, which in this case, it does. However, if the same +match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the +subject string does not happen. The first match attempt is run starting from +"D" and when this fails, (*COMMIT) prevents any further matches being tried, so +the overall result is "no match". There are also other start-up optimizations. +For example, a minimum length for the subject may be recorded. Consider the +pattern +<pre> + (*MARK:A)(X|Y) +</pre> +The minimum length for a match is one character. If the subject is "ABC", there +will be attempts to match "ABC", "BC", and "C". An attempt to match an empty +string at the end of the subject does not take place, because PCRE2 knows that +the subject is now too short, and so the (*MARK) is never encountered. 
In this +case, the optimization does not affect the overall match result, which is still +"no match", but it does affect the auxiliary information that is returned. +<pre> + PCRE2_NO_UTF_CHECK +</pre> +When PCRE2_UTF is set at compile time, the validity of the subject as a UTF +string is checked by default when <b>pcre2_match()</b> is subsequently called. +The entire string is checked before any other processing takes place, and a +negative error code is returned if the check fails. There are several UTF error +codes for each code unit width, corresponding to different problems with the +code unit sequence. The value of <i>startoffset</i> is also checked, to ensure +that it points to the start of a character or to the end of the subject. There +are discussions about the validity of +<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a> +<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a> +and +<a href="pcre2unicode.html#utf32strings">UTF-32 strings</a> +in the +<a href="pcre2unicode.html"><b>pcre2unicode</b></a> +page. +</P> +<P> +If you know that your subject is valid, and you want to skip these checks for +performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling +<b>pcre2_match()</b>. You might want to do this for the second and subsequent +calls to <b>pcre2_match()</b> if you are making repeated calls to find all the +matches in a single subject string. +</P> +<P> +NOTE: When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid string +as a subject, or an invalid value of <i>startoffset</i>, is undefined. Your +program may crash or loop indefinitely. +<pre> + PCRE2_PARTIAL_HARD + PCRE2_PARTIAL_SOFT +</pre> +These options turn on the partial matching feature. A partial match occurs if +the end of the subject string is reached successfully, but there are not enough +subject characters to complete the match. 
If this happens when +PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set, matching continues by +testing any remaining alternatives. Only if no complete match can be found is +PCRE2_ERROR_PARTIAL returned instead of PCRE2_ERROR_NOMATCH. In other words, +PCRE2_PARTIAL_SOFT says that the caller is prepared to handle a partial match, +but only if no complete match can be found. +</P> +<P> +If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if +a partial match is found, <b>pcre2_match()</b> immediately returns +PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other +words, when PCRE2_PARTIAL_HARD is set, a partial match is considered to be more +important than an alternative complete match. +</P> +<P> +There is a more detailed discussion of partial and multi-segment matching, with +examples, in the +<a href="pcre2partial.html"><b>pcre2partial</b></a> +documentation. +</P> +<br><a name="SEC22" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br> +<P> +When PCRE2 is built, a default newline convention is set; this is usually the +standard convention for the operating system. The default can be overridden in +either a +<a href="#compilecontext">compile context</a> +or a +<a href="#matchcontext">match context.</a> +However, changing the newline convention at match time disables JIT matching. +During matching, the newline choice affects the behaviour of the dot, +circumflex, and dollar metacharacters. It may also alter the way the match +position is advanced after a match failure for an unanchored pattern. +</P> +<P> +When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set, +and a match attempt for an unanchored pattern fails when the current position +is at a CRLF sequence, and the pattern contains no explicit matches for CR or +LF characters, the match position is advanced by two characters instead of one, +in other words, to after the CRLF. 
+</P> +<P> +The above rule is a compromise that makes the most common cases work as +expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is +not set), it does not match the string "\r\nA" because, after failing at the +start, it skips both the CR and the LF before retrying. However, the pattern +[\r\n]A does match that string, because it contains an explicit CR or LF +reference, and so advances only by one character after the first failure. +</P> +<P> +An explicit match for CR or LF is either a literal appearance of one of those +characters in the pattern, or one of the \r or \n escape sequences. Implicit +matches such as [^X] do not count, nor does \s (which includes CR and LF in +the characters that it matches). +</P> +<P> +Notwithstanding the above, anomalous effects may still occur when CRLF is a +valid newline sequence and explicit \r or \n escapes appear in the pattern. +<a name="matchedstrings"></a></P> +<br><a name="SEC23" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br> +<P> +<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b> +</P> +<P> +In general, a pattern matches a certain portion of the subject, and in +addition, further substrings from the subject may be picked out by +parenthesized parts of the pattern. Following the usage in Jeffrey Friedl's +book, this is called "capturing" in what follows, and the phrase "capturing +subpattern" is used for a fragment of a pattern that picks out a substring. +PCRE2 supports several other kinds of parenthesized subpattern that do not +cause substrings to be captured. The <b>pcre2_pattern_info()</b> function can be +used to find out how many capturing subpatterns there are in a compiled +pattern. 
+</P> +<P> +The overall matched string and any captured substrings are returned to the +caller via a vector of PCRE2_SIZE values, called the <b>ovector</b>. This is +contained within the +<a href="#matchdatablock">match data block.</a> +You can obtain direct access to the ovector by calling +<b>pcre2_get_ovector_pointer()</b> to find its address, and +<b>pcre2_get_ovector_count()</b> to find the number of pairs of values it +contains. Alternatively, you can use the auxiliary functions for accessing +captured substrings +<a href="#extractbynumber">by number</a> +or +<a href="#extractbyname">by name</a> +(see below). +</P> +<P> +Within the ovector, the first in each pair of values is set to the offset of +the first code unit of a substring, and the second is set to the offset of the +first code unit after the end of a substring. These values are always code unit +offsets, not character offsets. That is, they are byte offsets in the 8-bit +library, 16-bit offsets in the 16-bit library, and 32-bit offsets in the 32-bit +library. +</P> +<P> +The first pair of offsets (that is, <i>ovector[0]</i> and <i>ovector[1]</i>) +identifies the portion of the subject string that was matched by the entire +pattern. The next pair is used for the first capturing subpattern, and so on. +The value returned by <b>pcre2_match()</b> is one more than the highest numbered +pair that has been set. For example, if two substrings have been captured, the +returned value is 3. If there are no capturing subpatterns, the return value +from a successful match is 1, indicating that just the first pair of offsets +has been set. +</P> +<P> +If a capturing subpattern is matched repeatedly within a single match +operation, it is the last portion of the string that it matched that is +returned. +</P> +<P> +If the ovector is too small to hold all the captured substring offsets, as much +as possible is filled in, and the function returns a value of zero. 
If neither +the actual string matched nor any captured substrings are of interest, +<b>pcre2_match()</b> may be called with a match data block whose ovector is of +zero length. However, if the pattern contains back references and the +<i>ovector</i> is not big enough to remember the related substrings, PCRE2 has +to get additional memory for use during matching. Thus it is usually advisable +to set up a match data block containing an ovector of reasonable size. +</P> +<P> +It is possible for capturing subpattern number <i>n+1</i> to match some part of +the subject when subpattern <i>n</i> has not been used at all. For example, if +the string "abc" is matched against the pattern (a|(z))(bc) the return from the +function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this +happens, both values in the offset pairs corresponding to unused subpatterns +are set to PCRE2_UNSET. +</P> +<P> +Offset values that correspond to unused subpatterns at the end of the +expression are also set to PCRE2_UNSET. For example, if the string "abc" is +matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched. +The return from the function is 2, because the highest used capturing +subpattern number is 1. The offsets for the second and third capturing +subpatterns (assuming the vector is large enough, of course) are set to +PCRE2_UNSET. +</P> +<P> +Elements in the ovector that do not correspond to capturing parentheses in the +pattern are never changed. That is, if a pattern contains <i>n</i> capturing +parentheses, no more than <i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by +<b>pcre2_match()</b>. The other elements retain whatever values they previously +had. 
+<a name="matchotherdata"></a></P> +<br><b> +Other information about the match +</b><br> +<P> +<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b> +<br> +<br> +<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b> +</P> +<P> +In addition to the offsets in the ovector, other information about a match is +retained in the match data block and can be retrieved by the above functions. +</P> +<P> +When a (*MARK) name is to be passed back, <b>pcre2_get_mark()</b> returns a +pointer to the zero-terminated name, which is within the compiled pattern. +Otherwise NULL is returned. A (*MARK) name may be available after a failed +match or a partial match, as well as after a successful one. +</P> +<P> +The other three functions yield values that give information about the part of +the subject string that was inspected during a successful match or a partial +match. Their results are undefined after a failed match. They return the +following values, respectively: +<br> +<br> +(1) The offset of the leftmost character that was inspected during the match. +This can be earlier than the point at which the match started if the pattern +contains lookbehind assertions or \b or \B at the start. +<br> +<br> +(2) The offset of the character that follows the rightmost character that was +inspected during the match. This can be after the end of the match if the +pattern contains lookahead assertions. +<br> +<br> +(3) The offset of the character at which the successful or partial match +started. This can be different to the value of <i>ovector[0]</i> if the pattern +contains the \K escape sequence. 
+</P> +<P> +For example, if the pattern (?<=abc)xx\Kyy(?=def) is matched against the +string "123abcxxyydef123", the resulting offsets are: +<pre> + ovector[0] 8 + ovector[1] 10 + leftchar 3 + rightchar 13 + startchar 6 +</pre> +The <b>allusedtext</b> modifier in <b>pcre2test</b> can be used to display a +longer string that shows the leftmost and rightmost characters in a match +instead of just the matched string. +<a name="errorlist"></a></P> +<br><b> +Error return values from <b>pcre2_match()</b> +</b><br> +<P> +If <b>pcre2_match()</b> fails, it returns a negative number. This can be +converted to a text string by calling <b>pcre2_get_error_message()</b>. Negative +error codes are also returned by other functions, and are documented with them. +The codes are given names in the header file. If UTF checking is in force and +an invalid UTF subject string is detected, one of a number of UTF-specific +negative error codes is returned. Details are given in the +<a href="pcre2unicode.html"><b>pcre2unicode</b></a> +page. The following are the other errors that may be returned by +<b>pcre2_match()</b>: +<pre> + PCRE2_ERROR_NOMATCH +</pre> +The subject string did not match the pattern. +<pre> + PCRE2_ERROR_PARTIAL +</pre> +The subject string did not match, but it did match partially. See the +<a href="pcre2partial.html"><b>pcre2partial</b></a> +documentation for details of partial matching. +<pre> + PCRE2_ERROR_BADMAGIC +</pre> +PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to +catch the case when it is passed a junk pointer. This is the error that is +returned when the magic number is not present. +<pre> + PCRE2_ERROR_BADMODE +</pre> +This error is given when a pattern that was compiled by the 8-bit library is +passed to a 16-bit or 32-bit library function, or vice versa. +<pre> + PCRE2_ERROR_BADOFFSET +</pre> +The value of <i>startoffset</i> was greater than the length of the subject. 
+<pre> + PCRE2_ERROR_BADOPTION +</pre> +An unrecognized bit was set in the <i>options</i> argument. +<pre> + PCRE2_ERROR_BADUTFOFFSET +</pre> +The UTF code unit sequence that was passed as a subject was checked and found +to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of +<i>startoffset</i> did not point to the beginning of a UTF character or the end +of the subject. +<pre> + PCRE2_ERROR_CALLOUT +</pre> +This error is never generated by <b>pcre2_match()</b> itself. It is provided for +use by callout functions that want to cause <b>pcre2_match()</b> to return a +distinctive error code. See the +<a href="pcre2callout.html"><b>pcre2callout</b></a> +documentation for details. +<pre> + PCRE2_ERROR_INTERNAL +</pre> +An unexpected internal error has occurred. This error could be caused by a bug +in PCRE2 or by overwriting of the compiled pattern. +<pre> + PCRE2_ERROR_JIT_BADOPTION +</pre> +This error is returned when a pattern that was successfully studied using JIT +is being matched, but the matching mode (partial or complete match) does not +correspond to any JIT compilation mode. When the JIT fast path function is +used, this error may be also given for invalid options. See the +<a href="pcre2jit.html"><b>pcre2jit</b></a> +documentation for more details. +<pre> + PCRE2_ERROR_JIT_STACKLIMIT +</pre> +This error is returned when a pattern that was successfully studied using JIT +is being matched, but the memory available for the just-in-time processing +stack is not large enough. See the +<a href="pcre2jit.html"><b>pcre2jit</b></a> +documentation for more details. +<pre> + PCRE2_ERROR_MATCHLIMIT +</pre> +The backtracking limit was reached. +<pre> + PCRE2_ERROR_NOMEMORY +</pre> +If a pattern contains back references, but the ovector is not big enough to +remember the referenced substrings, PCRE2 gets a block of memory at the start +of matching to use for this purpose. There are some other special cases where +extra memory is needed during matching. 
This error is given when memory cannot +be obtained. +<pre> + PCRE2_ERROR_NULL +</pre> +Either the <i>code</i>, <i>subject</i>, or <i>match_data</i> argument was passed +as NULL. +<pre> + PCRE2_ERROR_RECURSELOOP +</pre> +This error is returned when <b>pcre2_match()</b> detects a recursion loop within +the pattern. Specifically, it means that either the whole pattern or a +subpattern has been called recursively for the second time at the same position +in the subject string. Some simple patterns that might do this are detected and +faulted at compile time, but more complicated cases, in particular mutual +recursions between two different subpatterns, cannot be detected until run +time. +<pre> + PCRE2_ERROR_RECURSIONLIMIT +</pre> +The internal recursion limit was reached. +<a name="extractbynumber"></a></P> +<br><a name="SEC24" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br> +<P> +<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b> +<b> unsigned int <i>number</i>, PCRE2_SIZE *<i>length</i>);</b> +<br> +<br> +<b>int pcre2_substring_copy_bynumber(pcre2_match_data *<i>match_data</i>,</b> +<b> unsigned int <i>number</i>, PCRE2_UCHAR *<i>buffer</i>,</b> +<b> PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>int pcre2_substring_get_bynumber(pcre2_match_data *<i>match_data</i>,</b> +<b> unsigned int <i>number</i>, PCRE2_UCHAR **<i>bufferptr</i>,</b> +<b> PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>void pcre2_substring_free(PCRE2_UCHAR *<i>buffer</i>);</b> +</P> +<P> +Captured substrings can be accessed directly by using the ovector as described +<a href="#matchedstrings">above.</a> +For convenience, auxiliary functions are provided for extracting captured +substrings as new, separate, zero-terminated strings. The functions in this +section identify substrings by number. The next section describes similar +functions for extracting substrings by name. 
A substring that contains a binary +zero is correctly extracted and has a further zero added on the end, but the +result is not, of course, a C string. +</P> +<P> +You can find the length in code units of a captured substring without +extracting it by calling <b>pcre2_substring_length_bynumber()</b>. The first +argument is a pointer to the match data block, the second is the group number, +and the third is a pointer to a variable into which the length is placed. +</P> +<P> +The <b>pcre2_substring_copy_bynumber()</b> function copies one string into a +supplied buffer, whereas <b>pcre2_substring_get_bynumber()</b> copies it into +new memory, obtained using the same memory allocation function that was used +for the match data block. The first two arguments of these functions are a +pointer to the match data block and a capturing group number. A group number of +zero extracts the substring that matched the entire pattern, and higher values +extract the captured substrings. +</P> +<P> +The final arguments of <b>pcre2_substring_copy_bynumber()</b> are a pointer to +the buffer and a pointer to a variable that contains its length in code units. +This is updated to contain the actual number of code units used, excluding the +terminating zero. +</P> +<P> +For <b>pcre2_substring_get_bynumber()</b> the third and fourth arguments point +to variables that are updated with a pointer to the new memory and the number +of code units that comprise the substring, again excluding the terminating +zero. When the substring is no longer needed, the memory should be freed by +calling <b>pcre2_substring_free()</b>. +</P> +<P> +The return value from these functions is zero for success, or one of these +error codes: +<pre> + PCRE2_ERROR_NOMEMORY +</pre> +The buffer was too small for <b>pcre2_substring_copy_bynumber()</b>, or the +attempt to get memory failed for <b>pcre2_substring_get_bynumber()</b>. +<pre> + PCRE2_ERROR_NOSUBSTRING +</pre> +No substring with the given number was captured. 
This could be because there is +no capturing group of that number in the pattern, or because the group with +that number did not participate in the match, or because the ovector was too +small to capture that group. +</P> +<br><a name="SEC25" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br> +<P> +<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b> +<br> +<br> +<b>void pcre2_substring_list_free(PCRE2_SPTR *<i>list</i>);</b> +</P> +<P> +The <b>pcre2_substring_list_get()</b> function extracts all available substrings +and builds a list of pointers to them, and a second list that contains their +lengths (in code units), excluding a terminating zero that is added to each of +them. All this is done in a single block of memory that is obtained using the +same memory allocation function that was used to get the match data block. +</P> +<P> +The address of the memory block is returned via <i>listptr</i>, which is also +the start of the list of string pointers. The end of the list is marked by a +NULL pointer. The address of the list of lengths is returned via +<i>lengthsptr</i>. If your strings do not contain binary zeros and you do not +therefore need the lengths, you may supply NULL as the <i>lengthsptr</i> +argument to disable the creation of a list of lengths. The yield of the +function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the memory block +could not be obtained. When the list is no longer needed, it should be freed by +calling <b>pcre2_substring_list_free()</b>. +</P> +<P> +If this function encounters a substring that is unset, which can happen when +capturing subpattern number <i>n+1</i> matches some part of the subject, but +subpattern <i>n</i> has not been used at all, it returns an empty string.
This +can be distinguished from a genuine zero-length substring by inspecting the +appropriate offset in the ovector, which contains PCRE2_UNSET for unset +substrings. +<a name="extractbynname"></a></P> +<br><a name="SEC26" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br> +<P> +<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b> +<b> PCRE2_SPTR <i>name</i>);</b> +<br> +<br> +<b>int pcre2_substring_length_byname(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_SIZE *<i>length</i>);</b> +<br> +<br> +<b>int pcre2_substring_copy_byname(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR *<i>buffer</i>, PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>int pcre2_substring_get_byname(pcre2_match_data *<i>match_data</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR **<i>bufferptr</i>, PCRE2_SIZE *<i>bufflen</i>);</b> +<br> +<br> +<b>void pcre2_substring_free(PCRE2_UCHAR *<i>buffer</i>);</b> +</P> +<P> +To extract a substring by name, you first have to find the associated number. +For example, for this pattern: +<pre> + (a+)b(?<xxx>\d+)... +</pre> +the number of the subpattern called "xxx" is 2. If the name is known to be +unique (PCRE2_DUPNAMES was not set), you can find the number from the name by +calling <b>pcre2_substring_number_from_name()</b>. The first argument is the +compiled pattern, and the second is the name. The yield of the function is the +subpattern number, or PCRE2_ERROR_NOSUBSTRING if there is no subpattern of that +name. +</P> +<P> +Given the number, you can extract the substring directly, or use one of the +functions described in the previous section. For convenience, there are also +"byname" functions that correspond to the "bynumber" functions, the only +difference being that the second argument is a name instead of a number. +However, if PCRE2_DUPNAMES is set and there are duplicate names, +the behaviour may not be what you want (see the next section).
+</P> +<P> +<b>Warning:</b> If the pattern uses the (?| feature to set up multiple +subpatterns with the same number, as described in the +<a href="pcre2pattern.html#dupsubpatternnumber">section on duplicate subpattern numbers</a> +in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +page, you cannot use names to distinguish the different subpatterns, because +names are not included in the compiled code. The matching process uses only +numbers. For this reason, the use of different names for subpatterns of the +same number causes an error at compile time. +</P> +<br><a name="SEC27" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br> +<P> +<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b> +<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b> +</P> +<P> +When a pattern is compiled with the PCRE2_DUPNAMES option, names for +subpatterns are not required to be unique. Duplicate names are always allowed +for subpatterns with the same number, created by using the (?| feature. Indeed, +if such subpatterns are named, they are required to use the same names. +</P> +<P> +Normally, patterns with duplicate names are such that in any one match, only +one of the named subpatterns participates. An example is shown in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +documentation. +</P> +<P> +When duplicates are present, <b>pcre2_substring_copy_byname()</b> and +<b>pcre2_substring_get_byname()</b> return the first substring corresponding to +the given name that is set. If none are set, PCRE2_ERROR_NOSUBSTRING is +returned. The <b>pcre2_substring_number_from_name()</b> function returns one of +the numbers that are associated with the name, but it is not defined which it +is. +</P> +<P> +If you want to get full details of all captured substrings for a given name, +you must use the <b>pcre2_substring_nametable_scan()</b> function. The first +argument is the compiled pattern, and the second is the name. 
If the third and +fourth arguments are NULL, the function returns a group number (it is not +defined which). Otherwise, the third and fourth arguments must be pointers to +variables that are updated by the function. After it has run, they point to the +first and last entries in the name-to-number table for the given name, and the +function returns the length of each entry. In both cases, +PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. +</P> +<P> +The format of the name table is described in the section entitled +<i>Information about a pattern</i> +<a href="#infoaboutpattern">above.</a> +Given all the relevant entries for the name, you can extract each of their +numbers, and hence the captured data. +</P> +<br><a name="SEC28" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br> +<P> +The traditional matching function uses a similar algorithm to Perl, which stops +when it finds the first match, starting at a given point in the subject. If you +want to find all possible matches, or the longest possible match at a given +position, consider using the alternative matching function (see below) instead. +If you cannot use the alternative function, you can kludge it up by making use +of the callout facility, which is described in the +<a href="pcre2callout.html"><b>pcre2callout</b></a> +documentation. +</P> +<P> +What you have to do is to insert a callout right at the end of the pattern. +When your callout function is called, extract and save the current matched +substring. Then return 1, which forces <b>pcre2_match()</b> to backtrack and try +other alternatives. Ultimately, when it runs out of matches, +<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
+<a name="dfamatch"></a></P> +<br><a name="SEC29" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br> +<P> +<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> +<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> +<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> +<b> pcre2_match_context *<i>mcontext</i>,</b> +<b> int *<i>workspace</i>, PCRE2_SIZE <i>wscount</i>);</b> +</P> +<P> +The function <b>pcre2_dfa_match()</b> is called to match a subject string +against a compiled pattern, using a matching algorithm that scans the subject +string just once, and does not backtrack. This has different characteristics to +the normal algorithm, and is not compatible with Perl. Some of the features of +PCRE2 patterns are not supported. Nevertheless, there are times when this kind +of matching can be useful. For a discussion of the two matching algorithms, and +a list of features that <b>pcre2_dfa_match()</b> does not support, see the +<a href="pcre2matching.html"><b>pcre2matching</b></a> +documentation. +</P> +<P> +The arguments for the <b>pcre2_dfa_match()</b> function are the same as for +<b>pcre2_match()</b>, plus two extras. The ovector within the match data block +is used in a different way, and this is described below. The other common +arguments are used in the same way as for <b>pcre2_match()</b>, so their +description is not repeated here. +</P> +<P> +The two additional arguments provide workspace for the function. The workspace +vector should contain at least 20 elements. It is used for keeping track of +multiple paths through the pattern tree. More workspace is needed for patterns +and subjects where there are a lot of potential matches. 
+</P> +<P> +Here is an example of a simple call to <b>pcre2_dfa_match()</b>: +<pre> + int wspace[20]; + pcre2_match_data *md = pcre2_match_data_create(4, NULL); + int rc = pcre2_dfa_match( + re, /* result of pcre2_compile() */ + "some string", /* the subject string */ + 11, /* the length of the subject string */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, /* the match data block */ + NULL, /* a match context; NULL means use defaults */ + wspace, /* working space vector */ + 20); /* number of elements (NOT size in bytes) */ +</PRE> +</P> +<br><b> +Option bits for <b>pcre2_dfa_match()</b> +</b><br> +<P> +The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must +be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL, +PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, +PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, +PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are +exactly the same as for <b>pcre2_match()</b>, so their description is not +repeated here. +<pre> + PCRE2_PARTIAL_HARD + PCRE2_PARTIAL_SOFT +</pre> +These have the same general effect as they do for <b>pcre2_match()</b>, but the +details are slightly different. When PCRE2_PARTIAL_HARD is set for +<b>pcre2_dfa_match()</b>, it returns PCRE2_ERROR_PARTIAL if the end of the +subject is reached and there is still at least one matching possibility that +requires additional characters. This happens even if some complete matches have +already been found. When PCRE2_PARTIAL_SOFT is set, the return code +PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the +subject is reached, there have been no complete matches, but there is still at +least one matching possibility. The portion of the string that was inspected +when the longest partial match was found is set as the first matching string in +both cases.
There is a more detailed discussion of partial and multi-segment +matching, with examples, in the +<a href="pcre2partial.html"><b>pcre2partial</b></a> +documentation. +<pre> + PCRE2_DFA_SHORTEST +</pre> +Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as +soon as it has found one match. Because of the way the alternative algorithm +works, this is necessarily the shortest possible match at the first possible +matching point in the subject string. +<pre> + PCRE2_DFA_RESTART +</pre> +When <b>pcre2_dfa_match()</b> returns a partial match, it is possible to call it +again, with additional subject characters, and have it continue with the same +match. The PCRE2_DFA_RESTART option requests this action; when it is set, the +<i>workspace</i> and <i>wscount</i> options must reference the same vector as +before because data about the match so far is left in them after a partial +match. There is more discussion of this facility in the +<a href="pcre2partial.html"><b>pcre2partial</b></a> +documentation. +</P> +<br><b> +Successful returns from <b>pcre2_dfa_match()</b> +</b><br> +<P> +When <b>pcre2_dfa_match()</b> succeeds, it may have matched more than one +substring in the subject. Note, however, that all the matches from one run of +the function start at the same point in the subject. The shorter matches are +all initial substrings of the longer matches. For example, if the pattern +<pre> + <.*> +</pre> +is matched against the string +<pre> + This is <something> <something else> <something further> no more +</pre> +the three matched strings are +<pre> + <something> + <something> <something else> + <something> <something else> <something further> +</pre> +On success, the yield of the function is a number greater than zero, which is +the number of matched substrings. The offsets of the substrings are returned in +the ovector, and can be extracted in the same way as for <b>pcre2_match()</b>. 
+They are returned in reverse order of length; that is, the longest +matching string is given first. If there were too many matches to fit into +the ovector, the yield of the function is zero, and the vector is filled with +the longest matches. +</P> +<P> +NOTE: PCRE2's "auto-possessification" optimization usually applies to character +repeats at the end of a pattern (as well as internally). For example, the +pattern "a\d+" is compiled as if it were "a\d++" because there is no point in +backtracking into the repeated digits. For DFA matching, this means that only +one possible match is found. If you really do want multiple matches in such +cases, either use an ungreedy repeat ("a\d+?") or set the +PCRE2_NO_AUTO_POSSESS option when compiling. +</P> +<br><b> +Error returns from <b>pcre2_dfa_match()</b> +</b><br> +<P> +The <b>pcre2_dfa_match()</b> function returns a negative number when it fails. +Many of the errors are the same as for <b>pcre2_match()</b>, as described +<a href="#errorlist">above.</a> +There are in addition the following errors that are specific to +<b>pcre2_dfa_match()</b>: +<pre> + PCRE2_ERROR_DFA_UITEM +</pre> +This return is given if <b>pcre2_dfa_match()</b> encounters an item in the +pattern that it does not support, for instance, the use of \C or a back +reference. +<pre> + PCRE2_ERROR_DFA_UCOND +</pre> +This return is given if <b>pcre2_dfa_match()</b> encounters a condition item +that uses a back reference for the condition, or a test for recursion in a +specific group. These are not supported. +<pre> + PCRE2_ERROR_DFA_WSSIZE +</pre> +This return is given if <b>pcre2_dfa_match()</b> runs out of space in the +<i>workspace</i> vector. +<pre> + PCRE2_ERROR_DFA_RECURSE +</pre> +When a recursive subpattern is processed, the matching function calls itself +recursively, using private memory for the ovector and <i>workspace</i>. This +error is given if the internal ovector is not large enough. 
This should be +extremely rare, as a vector of size 1000 is used. +<pre> + PCRE2_ERROR_DFA_BADRESTART +</pre> +When <b>pcre2_dfa_match()</b> is called with the PCRE2_DFA_RESTART option, +some plausibility checks are made on the contents of the workspace, which +should contain data about the previous partial match. If any of these checks +fail, this error is given. +</P> +<br><a name="SEC30" href="#TOC1">SEE ALSO</a><br> +<P> +<b>pcre2build</b>(3), <b>pcre2libs</b>(3), <b>pcre2callout</b>(3), +<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3), +<b>pcre2demo</b>(3), <b>pcre2sample</b>(3), <b>pcre2stack</b>(3). +</P> +<br><a name="SEC31" href="#TOC1">AUTHOR</a><br> +<P> +Philip Hazel +<br> +University Computing Service +<br> +Cambridge CB2 3QH, England. +<br> +</P> +<br><a name="SEC32" href="#TOC1">REVISION</a><br> +<P> +Last updated: 16 September 2014 +<br> +Copyright © 1997-2014 University of Cambridge. +<br> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> diff --git a/doc/html/pcre2callout.html b/doc/html/pcre2callout.html new file mode 100644 index 0000000..c742f90 --- /dev/null +++ b/doc/html/pcre2callout.html @@ -0,0 +1,270 @@ +<html> +<head> +<title>pcre2callout specification</title> +</head> +<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB"> +<h1>pcre2callout man page</h1> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> +<p> +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong.
+<br> +<ul> +<li><a name="TOC1" href="#SEC1">SYNOPSIS</a> +<li><a name="TOC2" href="#SEC2">DESCRIPTION</a> +<li><a name="TOC3" href="#SEC3">MISSING CALLOUTS</a> +<li><a name="TOC4" href="#SEC4">THE CALLOUT INTERFACE</a> +<li><a name="TOC5" href="#SEC5">RETURN VALUES</a> +<li><a name="TOC6" href="#SEC6">AUTHOR</a> +<li><a name="TOC7" href="#SEC7">REVISION</a> +</ul> +<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br> +<P> +<b>#include <pcre2.h></b> +</P> +<P> +<b>int (*pcre2_callout)(pcre2_callout_block *);</b> +</P> +<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br> +<P> +PCRE2 provides a feature called "callout", which is a means of temporarily +passing control to the caller of PCRE2 in the middle of pattern matching. The +caller of PCRE2 provides an external function by putting its entry point in +a match context (see <b>pcre2_set_callout()</b> in the +<a href="pcre2api.html"><b>pcre2api</b></a> +documentation). +</P> +<P> +Within a regular expression, (?C) indicates the points at which the external +function is to be called. Different callout points can be identified by putting +a number less than 256 after the letter C. The default value is zero. +For example, this pattern has two callout points: +<pre> + (?C1)abc(?C2)def +</pre> +If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 +automatically inserts callouts, all with number 255, before each item in the +pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern +<pre> + A(\d{2}|--) +</pre> +it is processed as if it were +<br> +<br> +(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) +<br> +<br> +Notice that there is a callout before and after each parenthesis and +alternation bar. If the pattern contains a conditional group whose condition is +an assertion, an automatic callout is inserted immediately before the +condition.
Such a callout may also be inserted explicitly, for example: +<pre> + (?(?C9)(?=a)ab|de) +</pre> +This applies only to assertion conditions (because they are themselves +independent groups). +</P> +<P> +Automatic callouts can be used for tracking the progress of pattern matching. +The +<a href="pcre2test.html"><b>pcre2test</b></a> +program has a pattern qualifier (/auto_callout) that sets automatic callouts; +when it is used, the output indicates how the pattern is being matched. This is +useful information when you are trying to optimize the performance of a +particular pattern. +</P> +<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br> +<P> +You should be aware that, because of optimizations in the way PCRE2 compiles +and matches patterns, callouts sometimes do not happen exactly as you might +expect. +</P> +<P> +At compile time, PCRE2 "auto-possessifies" repeated items when it knows that +what follows cannot be part of the repeat. For example, a+[bc] is compiled as +if it were a++[bc]. The <b>pcre2test</b> output when this pattern is anchored +and then applied with automatic callouts to the string "aaaa" is: +<pre> + --->aaaa + +0 ^ ^ + +1 ^ a+ + +3 ^ ^ [bc] + No match +</pre> +This indicates that when matching [bc] fails, there is no backtracking into a+ +and therefore the callouts that would be taken for the backtracks do not occur. +You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS +to <b>pcre2_compile()</b>, or starting the pattern with (*NO_AUTO_POSSESS). If +this is done in <b>pcre2test</b> (using the /no_auto_possess qualifier), the +output changes to this: +<pre> + --->aaaa + +0 ^ ^ + +1 ^ a+ + +3 ^ ^ [bc] + +3 ^ ^ [bc] + +3 ^ ^ [bc] + +3 ^^ [bc] + No match +</pre> +This time, when matching [bc] fails, the matcher backtracks into a+ and tries +again, repeatedly, until a+ itself fails. +</P> +<P> +Other optimizations that provide fast "no match" results also affect callouts. 
+For example, if the pattern is +<pre> + ab(?C4)cd +</pre> +PCRE2 knows that any matching string must contain the letter "d". If the +subject string is "abyz", the lack of "d" means that matching doesn't ever +start, and the callout is never reached. However, with "abyd", though the +result is still no match, the callout is obeyed. +</P> +<P> +PCRE2 also knows the minimum length of a matching string, and will immediately +give a "no match" return without actually running a match if the subject is not +long enough, or, for unanchored patterns, if it has been scanned far enough. +</P> +<P> +You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE +option to the matching function, or by starting the pattern with +(*NO_START_OPT). This slows down the matching process, but does ensure that +callouts such as the example above are obeyed. +</P> +<br><a name="SEC4" href="#TOC1">THE CALLOUT INTERFACE</a><br> +<P> +During matching, when PCRE2 reaches a callout point, the external function that +is set in the match context is called (if it is set). This applies to both +normal and DFA matching. The only argument to the callout function is a pointer +to a <b>pcre2_callout_block</b> structure. This structure contains the following fields: +<pre> + uint32_t <i>version</i>; + uint32_t <i>callout_number</i>; + uint32_t <i>capture_top</i>; + uint32_t <i>capture_last</i>; + void *<i>callout_data</i>; + PCRE2_SIZE *<i>offset_vector</i>; + PCRE2_SPTR <i>mark</i>; + PCRE2_SPTR <i>subject</i>; + PCRE2_SIZE <i>subject_length</i>; + PCRE2_SIZE <i>start_match</i>; + PCRE2_SIZE <i>current_position</i>; + PCRE2_SIZE <i>pattern_position</i>; + PCRE2_SIZE <i>next_item_length</i>; +</pre> +The <i>version</i> field contains the version number of the block format. The +current version is 0. The version number will change in future if additional +fields are added, but the intention is never to remove any of the existing +fields.
+</P> +<P> +The <i>callout_number</i> field contains the number of the callout, as compiled +into the pattern (that is, the number after ?C for manual callouts, and 255 for +automatically generated callouts). +</P> +<P> +The <i>offset_vector</i> field is a pointer to the vector of capturing offsets +(the "ovector") that was passed to the matching function in the match data +block. When <b>pcre2_match()</b> is used, the contents can be inspected, in +order to extract substrings that have been matched so far, in the same way as +for extracting substrings after a match has completed. For the DFA matching +function, this field is not useful. +</P> +<P> +The <i>subject</i> and <i>subject_length</i> fields contain copies of the values +that were passed to the matching function. +</P> +<P> +The <i>start_match</i> field normally contains the offset within the subject at +which the current match attempt started. However, if the escape sequence \K +has been encountered, this value is changed to reflect the modified starting +point. If the pattern is not anchored, the callout function may be called +several times from the same point in the pattern for different starting points +in the subject. +</P> +<P> +The <i>current_position</i> field contains the offset within the subject of the +current match pointer. +</P> +<P> +When <b>pcre2_match()</b> is used, the <i>capture_top</i> field contains one +more than the number of the highest numbered captured substring so far. If no +substrings have been captured, the value of <i>capture_top</i> is one. This is +always the case when the DFA functions are used, because they do not support +captured substrings. +</P> +<P> +The <i>capture_last</i> field contains the number of the most recently captured +substring. However, when a recursion exits, the value reverts to what it was +outside the recursion, as do the values of all captured substrings. If no +substrings have been captured, the value of <i>capture_last</i> is 0.
This is +always the case for the DFA matching functions. +</P> +<P> +The <i>callout_data</i> field contains a value that is passed to a matching +function specifically so that it can be passed back in callouts. It is set in +the match context when the callout is set up by calling +<b>pcre2_set_callout()</b> (see the +<a href="pcre2api.html"><b>pcre2api</b></a> +documentation). +</P> +<P> +The <i>pattern_position</i> field contains the offset to the next item to be +matched in the pattern string. +</P> +<P> +The <i>next_item_length</i> field contains the length of the next item to be +matched in the pattern string. When the callout immediately precedes an +alternation bar, a closing parenthesis, or the end of the pattern, the length +is zero. When the callout precedes an opening parenthesis, the length is that +of the entire subpattern. +</P> +<P> +The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to +help in distinguishing between different automatic callouts, which all have the +same callout number. However, they are set for all callouts. +</P> +<P> +In callouts from <b>pcre2_match()</b> the <i>mark</i> field contains a pointer to +the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or +(*THEN) item in the match, or NULL if no such items have been passed. Instances +of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In +callouts from the DFA matching function this field always contains NULL. +</P> +<br><a name="SEC5" href="#TOC1">RETURN VALUES</a><br> +<P> +The external callout function returns an integer to PCRE2. If the value is +zero, matching proceeds as normal. If the value is greater than zero, matching +fails at the current point, but the testing of other matching possibilities +goes ahead, just as if a lookahead assertion had failed. If the value is less +than zero, the match is abandoned, and the matching function returns the +negative value. 
+</P> +<P> +Negative values should normally be chosen from the set of PCRE2_ERROR_xxx +values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match" +failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout +functions; it will never be used by PCRE2 itself. +</P> +<br><a name="SEC6" href="#TOC1">AUTHOR</a><br> +<P> +Philip Hazel +<br> +University Computing Service +<br> +Cambridge CB2 3QH, England. +<br> +</P> +<br><a name="SEC7" href="#TOC1">REVISION</a><br> +<P> +Last updated: 19 October 2014 +<br> +Copyright © 1997-2014 University of Cambridge. +<br> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> diff --git a/doc/html/pcre2demo.html b/doc/html/pcre2demo.html new file mode 100644 index 0000000..2d1d92b --- /dev/null +++ b/doc/html/pcre2demo.html @@ -0,0 +1,443 @@ +<html> +<head> +<title>pcre2demo specification</title> +</head> +<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB"> +<h1>pcre2demo man page</h1> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> +<p> +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +<br> +<ul> +</ul> +<PRE> +/************************************************* +* PCRE2 DEMONSTRATION PROGRAM * +*************************************************/ + +/* This is a demonstration program to illustrate a straightforward way of +calling the PCRE2 regular expression library from a C program. See the +pcre2sample documentation for a short discussion ("man pcre2sample" if you have +the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is +incompatible with the original PCRE API. + +There are actually three libraries, each supporting a different code unit +width. This demonstration program uses the 8-bit library. 
+ +In Unix-like environments, if PCRE2 is installed in your standard system +libraries, you should be able to compile this program using this command: + +gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo + +If PCRE2 is not installed in a standard place, it is likely to be installed +with support for the pkg-config mechanism. If you have pkg-config, you can +compile this program using this command: + +gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo + +If you do not have pkg-config, you may have to use this: + +gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ + -R/usr/local/lib -lpcre2-8 -o pcre2demo + +Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and +library files for PCRE2 are installed on your system. Only some operating +systems (Solaris is one) use the -R option. + +Building under Windows: + +If you want to statically link this program against a non-dll .a file, you must +define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment +the following line. */ + +/* #define PCRE2_STATIC */ + +/* This macro must be defined before including pcre2.h. For a program that uses +only one code unit width, it makes it possible to use generic function names +such as pcre2_compile(). */ + +#define PCRE2_CODE_UNIT_WIDTH 8 + +#include <stdio.h> +#include <string.h> +#include <pcre2.h> + + +/************************************************************************** +* Here is the program. The API includes the concept of "contexts" for * +* setting up unusual interface requirements for compiling and matching, * +* such as custom memory managers and non-standard newline definitions. * +* This program does not do any of this, so it makes no use of contexts, * +* always passing NULL where a context could be given. 
* +**************************************************************************/ +/* main(): compile the pattern given on the command line, run pcre2_match() on the subject, and print the numbered and named captures; with -g, repeat for further matches. Returns 0 on success and 1 on bad arguments, compile failure, or match failure (including no first match). */ +int main(int argc, char **argv) +{ +pcre2_code *re; +PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ +PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ +PCRE2_SPTR name_table; + +int crlf_is_newline; +int errornumber; +int find_all; +int i; +int namecount; +int name_entry_size; /* NOTE(review): namecount and name_entry_size are filled in by pcre2_pattern_info(); confirm int (not uint32_t) is the type the API writes */ +int rc; +int utf8; + +uint32_t option_bits; +uint32_t newline; + +PCRE2_SIZE erroroffset; +PCRE2_SIZE *ovector; + +size_t subject_length; +pcre2_match_data *match_data; + + + +/************************************************************************** +* First, sort out the command line. There is only one possible option at * +* the moment, "-g" to request repeated matching to find all occurrences, * +* like Perl's /g option. We set the variable find_all to a non-zero value * +* if the -g option is present. Apart from that, there must be exactly two * +* arguments. * +**************************************************************************/ + +find_all = 0; +for (i = 1; i < argc; i++) + { + if (strcmp(argv[i], "-g") == 0) find_all = 1; + else break; + } + +/* After the options, we require exactly two arguments, which are the pattern, +and the subject string. */ + +if (argc - i != 2) + { + printf("Two arguments required: a regex and a subject string\n"); + return 1; + } + +/* As pattern and subject are char arguments, they can be straightforwardly +cast to PCRE2_SPTR as we are working in 8-bit code units. */ + +pattern = (PCRE2_SPTR)argv[i]; +subject = (PCRE2_SPTR)argv[i+1]; +subject_length = strlen((char *)subject); + + +/************************************************************************* +* Now we are going to compile the regular expression pattern, and handle * +* any errors that are detected. 
* +*************************************************************************/ + +re = pcre2_compile( + pattern, /* the pattern */ + -1, /* indicates pattern is zero-terminated */ + 0, /* default options */ + &errornumber, /* for error number */ + &erroroffset, /* for error offset */ + NULL); /* use default compile context */ + +/* Compilation failed: print the error message and exit. */ + +if (re == NULL) + { + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); + printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, + buffer); + return 1; + } + + +/************************************************************************* +* If the compilation succeeded, we call PCRE2 again, in order to do a * +* pattern match against the subject string. This does just ONE match. If * +* further matching is needed, it will be done below. Before running the * +* match we must set up a match_data block for holding the result. * +*************************************************************************/ + +/* Using this function ensures that the block is exactly the right size for +the number of capturing parentheses in the pattern. */ + +match_data = pcre2_match_data_create_from_pattern(re, NULL); + +rc = pcre2_match( + re, /* the compiled pattern */ + subject, /* the subject string */ + subject_length, /* the length of the subject */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + match_data, /* block for storing the result */ + NULL); /* use default match context */ + +/* Matching failed: handle error cases */ + +if (rc < 0) + { + switch(rc) + { + case PCRE2_ERROR_NOMATCH: printf("No match\n"); break; + /* + Handle other special cases if you like + */ + default: printf("Matching error %d\n", rc); break; + } + pcre2_match_data_free(match_data); /* Release memory used for the match */ + pcre2_code_free(re); /* data and the compiled pattern. */ + return 1; + } + +/* Match succeeded. 
Get a pointer to the output vector, where string offsets are +stored. */ + +ovector = pcre2_get_ovector_pointer(match_data); +printf("\nMatch succeeded at offset %d\n", (int)ovector[0]); + + +/************************************************************************* +* We have found the first match within the subject string. If the output * +* vector wasn't big enough, say so. Then output any substrings that were * +* captured. * +*************************************************************************/ + +/* The output vector wasn't big enough. This should not happen, because we used +pcre2_match_data_create_from_pattern() above. */ + +if (rc == 0) + printf("ovector was not big enough for all the captured substrings\n"); + +/* Show substrings stored in the output vector by number. Obviously, in a real +application you might want to do things other than print them. */ + +for (i = 0; i < rc; i++) + { + PCRE2_SPTR substring_start = subject + ovector[2*i]; + size_t substring_length = ovector[2*i+1] - ovector[2*i]; + printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); + } + + +/************************************************************************** +* That concludes the basic part of this demonstration program. We have * +* compiled a pattern, and performed a single match. The code that follows * +* shows first how to access named substrings, and then how to code for * +* repeated matches on the same subject. * +**************************************************************************/ + +/* See if there are any named substrings, and if so, show them by name. First +we have to extract the count of named parentheses from the pattern. 
*/ + +(void)pcre2_pattern_info( + re, /* the compiled pattern */ + PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ + &namecount); /* where to put the answer */ + +if (namecount <= 0) printf("No named substrings\n"); else + { + PCRE2_SPTR tabptr; + printf("Named substrings\n"); + + /* Before we can access the substrings, we must extract the table for + translating names to numbers, and the size of each entry in the table. */ + + (void)pcre2_pattern_info( + re, /* the compiled pattern */ + PCRE2_INFO_NAMETABLE, /* address of the table */ + &name_table); /* where to put the answer */ + + (void)pcre2_pattern_info( + re, /* the compiled pattern */ + PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ + &name_entry_size); /* where to put the answer */ + + /* Now we can scan the table and, for each entry, print the number, the name, + and the substring itself. In the 8-bit library the number is held in two + bytes, most significant first. */ + + tabptr = name_table; + for (i = 0; i < namecount; i++) + { + int n = (tabptr[0] << 8) | tabptr[1]; + printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, + (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); + tabptr += name_entry_size; + } + } + + +/************************************************************************* +* If the "-g" option was given on the command line, we want to continue * +* to search for additional matches in the subject string, in a similar * +* way to the /g option in Perl. This turns out to be trickier than you * +* might think because of the possibility of matching an empty string. * +* What happens is as follows: * +* * +* If the previous match was NOT for an empty string, we can just start * +* the next match at the end of the previous one. * +* * +* If the previous match WAS for an empty string, we can't do that, as it * +* would lead to an infinite loop. 
Instead, a call of pcre2_match() is * +* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The * +* first of these tells PCRE2 that an empty string at the start of the * +* subject is not a valid match; other possibilities must be tried. The * +* second flag restricts PCRE2 to one match attempt at the initial string * +* position. If this match succeeds, an alternative to the empty string * +* match has been found, and we can print it and proceed round the loop, * +* advancing by the length of whatever was found. If this match does not * +* succeed, we still stay in the loop, advancing by just one character. * +* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be * +* more than one byte. * +* * +* However, there is a complication concerned with newlines. When the * +* newline convention is such that CRLF is a valid newline, we must * +* advance by two characters rather than one. The newline convention can * +* be set in the regex by (*CR), etc.; if not, we must find the default. * +*************************************************************************/ + +if (!find_all) /* Check for -g */ + { + pcre2_match_data_free(match_data); /* Release the memory that was used */ + pcre2_code_free(re); /* for the match data and the pattern. */ + return 0; /* Exit the program. */ + } + +/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline +sequence. First, find the options with which the regex was compiled and extract +the UTF state. */ + +(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits); +utf8 = (option_bits & PCRE2_UTF) != 0; + +/* Now find the newline convention and see whether CRLF is a valid newline +sequence. 
*/ + +(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline); +crlf_is_newline = newline == PCRE2_NEWLINE_ANY || + newline == PCRE2_NEWLINE_CRLF || + newline == PCRE2_NEWLINE_ANYCRLF; + +/* Loop for second and subsequent matches */ + +for (;;) + { + uint32_t options = 0; /* Normally no options */ + PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ + + /* If the previous match was for an empty string, we are finished if we are + at the end of the subject. Otherwise, arrange to run another match at the + same point to see if a non-empty match can be found. */ + + if (ovector[0] == ovector[1]) + { + if (ovector[0] == subject_length) break; + options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } + + /* Run the next matching operation */ + + rc = pcre2_match( + re, /* the compiled pattern */ + subject, /* the subject string */ + subject_length, /* the length of the subject */ + start_offset, /* starting offset in the subject */ + options, /* options */ + match_data, /* block for storing the result */ + NULL); /* use default match context */ + + /* This time, a result of NOMATCH isn't an error. If the value in "options" + is zero, it just means we have found all possible matches, so the loop ends. + Otherwise, it means we have failed to find a non-empty-string match at a + point where there was a previous empty-string match. In this case, we do what + Perl does: advance the matching position by one character, and continue. We + do this by setting the "end of previous match" offset, because that is picked + up at the top of the loop as the point at which to start again. + + There are two complications: (a) When CRLF is a valid newline sequence, and + the current position is just before it, advance by an extra byte. (b) + Otherwise we must ensure that we skip an entire UTF character if we are in + UTF mode. 
*/ + + if (rc == PCRE2_ERROR_NOMATCH) + { + if (options == 0) break; /* All matches found */ + ovector[1] = start_offset + 1; /* Advance one code unit */ + if (crlf_is_newline && /* If CRLF is newline & */ + start_offset < subject_length - 1 && /* we are at CRLF, */ + subject[start_offset] == '\r' && + subject[start_offset + 1] == '\n') + ovector[1] += 1; /* Advance by one more. */ + else if (utf8) /* Otherwise, ensure we */ + { /* advance a whole UTF-8 */ + while (ovector[1] < subject_length) /* character. */ + { + if ((subject[ovector[1]] & 0xc0) != 0x80) break; + ovector[1] += 1; + } + } + continue; /* Go round the loop again */ + } + + /* Other matching errors are not recoverable. */ + + if (rc < 0) + { + printf("Matching error %d\n", rc); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + + /* Match succeeded */ + + printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]); + + /* The match succeeded, but the output vector wasn't big enough. This + should not happen. */ + + if (rc == 0) + printf("ovector was not big enough for all the captured substrings\n"); + + /* As before, show substrings stored in the output vector by number, and then + also any named substrings. 
*/ + + for (i = 0; i < rc; i++) + { + PCRE2_SPTR substring_start = subject + ovector[2*i]; + size_t substring_length = ovector[2*i+1] - ovector[2*i]; + printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); + } + + if (namecount <= 0) printf("No named substrings\n"); else + { + PCRE2_SPTR tabptr = name_table; + printf("Named substrings\n"); + for (i = 0; i < namecount; i++) + { + int n = (tabptr[0] << 8) | tabptr[1]; + printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, + (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); + tabptr += name_entry_size; + } + } + } /* End of loop to find second and subsequent matches */ + +printf("\n"); +pcre2_match_data_free(match_data); +pcre2_code_free(re); +return 0; +} + +/* End of pcre2demo.c */ +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html new file mode 100644 index 0000000..30b527d --- /dev/null +++ b/doc/html/pcre2test.html @@ -0,0 +1,1199 @@ +<html> +<head> +<title>pcre2test specification</title> +</head> +<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB"> +<h1>pcre2test man page</h1> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> +<p> +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. 
+<br> +<ul> +<li><a name="TOC1" href="#SEC1">SYNOPSIS</a> +<li><a name="TOC2" href="#SEC2">PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a> +<li><a name="TOC3" href="#SEC3">INPUT ENCODING</a> +<li><a name="TOC4" href="#SEC4">COMMAND LINE OPTIONS</a> +<li><a name="TOC5" href="#SEC5">DESCRIPTION</a> +<li><a name="TOC6" href="#SEC6">COMMAND LINES</a> +<li><a name="TOC7" href="#SEC7">MODIFIER SYNTAX</a> +<li><a name="TOC8" href="#SEC8">PATTERN SYNTAX</a> +<li><a name="TOC9" href="#SEC9">SUBJECT LINE SYNTAX</a> +<li><a name="TOC10" href="#SEC10">PATTERN MODIFIERS</a> +<li><a name="TOC11" href="#SEC11">SUBJECT MODIFIERS</a> +<li><a name="TOC12" href="#SEC12">THE ALTERNATIVE MATCHING FUNCTION</a> +<li><a name="TOC13" href="#SEC13">DEFAULT OUTPUT FROM pcre2test</a> +<li><a name="TOC14" href="#SEC14">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a> +<li><a name="TOC15" href="#SEC15">RESTARTING AFTER A PARTIAL MATCH</a> +<li><a name="TOC16" href="#SEC16">CALLOUTS</a> +<li><a name="TOC17" href="#SEC17">NON-PRINTING CHARACTERS</a> +<li><a name="TOC18" href="#SEC18">SEE ALSO</a> +<li><a name="TOC19" href="#SEC19">AUTHOR</a> +<li><a name="TOC20" href="#SEC20">REVISION</a> +</ul> +<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br> +<P> +<b>pcre2test [options] [input file [output file]]</b> +<br> +<br> +<b>pcre2test</b> is a test program for the PCRE2 regular expression libraries, +but it can also be used for experimenting with regular expressions. This +document describes the features of the test program; for details of the regular +expressions themselves, see the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +documentation. For details of the PCRE2 library function calls and their +options, see the +<a href="pcre2api.html"><b>pcre2api</b></a> +documentation. +</P> +<P> +The input for <b>pcre2test</b> is a sequence of regular expression patterns and +subject strings to be matched. The output shows the result of each match +attempt. 
Modifiers on the command line, the patterns, and the subject lines +specify PCRE2 function options, control how the subject is processed, and what +output is produced. +</P> +<P> +As the original fairly simple PCRE library evolved, it acquired many different +features, and as a result, the original <b>pcretest</b> program ended up with a +lot of options in a messy, arcane syntax, for testing all the features. The +move to the new PCRE2 API provided an opportunity to re-implement the test +program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there +are still many obscure modifiers, some of which are specifically designed for +use in conjunction with the test script and data files that are distributed as +part of PCRE2. All the modifiers are documented here, some without much +justification, but many of them are unlikely to be of use except when testing +the libraries. +</P> +<br><a name="SEC2" href="#TOC1">PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a><br> +<P> +Different versions of the PCRE2 library can be built to support character +strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or +all three of these libraries may be simultaneously installed. The +<b>pcre2test</b> program can be used to test all the libraries. However, its own +input and output are always in 8-bit format. When testing the 16-bit or 32-bit +libraries, patterns and subject strings are converted to 16- or 32-bit format +before being passed to the library functions. Results are converted back to +8-bit code units for output. +</P> +<P> +In the rest of this document, the names of library functions and structures +are given in generic form, for example, <b>pcre2_compile()</b>. The actual +names used in the libraries have a suffix _8, _16, or _32, as appropriate. 
+</P> +<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br> +<P> +Input to <b>pcre2test</b> is processed line by line, either by calling the C +library's <b>fgets()</b> function, or via the <b>libreadline</b> library (see +below). In Unix-like environments, <b>fgets()</b> treats any bytes other than +newline as data characters. However, in some Windows environments character 26 +(hex 1A) causes an immediate end of file, and no further data is read. For +maximum portability, therefore, it is safest to avoid non-printing characters +in <b>pcre2test</b> input files. +</P> +<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br> +<P> +<b>-8</b> +If the 8-bit library has been built, this option causes it to be used (this is +the default). If the 8-bit library has not been built, this option causes an +error. +</P> +<P> +<b>-16</b> +If the 16-bit library has been built, this option causes it to be used. If only +the 16-bit library has been built, this is the default. If the 16-bit library +has not been built, this option causes an error. +</P> +<P> +<b>-32</b> +If the 32-bit library has been built, this option causes it to be used. If only +the 32-bit library has been built, this is the default. If the 32-bit library +has not been built, this option causes an error. +</P> +<P> +<b>-b</b> +Behave as if each pattern has the <b>/fullbincode</b> modifier; the full +internal binary form of the pattern is output after compilation. +</P> +<P> +<b>-C</b> +Output the version number of the PCRE2 library, and all available information +about the optional features that are included, and then exit with zero exit +code. All other options are ignored. +</P> +<P> +<b>-C</b> <i>option</i> +Output information about a specific build-time option, then exit. This +functionality is intended for use in scripts such as <b>RunTest</b>. 
The +following options output the value and set the exit code as indicated: +<pre> + ebcdic-nl the code for LF (= NL) in an EBCDIC environment: + 0x15 or 0x25 + 0 if used in an ASCII environment + exit code is always 0 + linksize the configured internal link size (2, 3, or 4) + exit code is set to the link size + newline the default newline setting: + CR, LF, CRLF, ANYCRLF, or ANY + exit code is always 0 + bsr the default setting for what \R matches: + ANYCRLF or ANY + exit code is always 0 +</pre> +The following options output 1 for true or 0 for false, and set the exit code +to the same value: +<pre> + ebcdic compiled for an EBCDIC environment + jit just-in-time support is available + pcre16 the 16-bit library was built + pcre32 the 32-bit library was built + pcre8 the 8-bit library was built + unicode Unicode support is available +</pre> +If an unknown option is given, an error message is output; the exit code is 0. +</P> +<P> +<b>-d</b> +Behave as if each pattern has the <b>debug</b> modifier; the internal +form and information about the compiled pattern is output after compilation; +<b>-d</b> is equivalent to <b>-b -i</b>. +</P> +<P> +<b>-dfa</b> +Behave as if each subject line has the <b>dfa</b> modifier; matching is done +using the <b>pcre2_dfa_match()</b> function instead of the default +<b>pcre2_match()</b>. +</P> +<P> +<b>-help</b> +Output a brief summary of these options and then exit. +</P> +<P> +<b>-i</b> +Behave as if each pattern has the <b>/info</b> modifier; information about the +compiled pattern is given after compilation. +</P> +<P> +<b>-jit</b> +Behave as if each pattern line has the <b>jit</b> modifier; after successful +compilation, each pattern is passed to the just-in-time compiler, if available. +</P> +<P> +<b>-pattern</b> <i>modifier-list</i> +Behave as if each pattern line contains the given modifiers. +</P> +<P> +<b>-q</b> +Do not output the version number of <b>pcre2test</b> at the start of execution. 
+</P> +<P> +<b>-S</b> <i>size</i> +On Unix-like systems, set the size of the run-time stack to <i>size</i> +megabytes. +</P> +<P> +<b>-subject</b> <i>modifier-list</i> +Behave as if each subject line contains the given modifiers. +</P> +<P> +<b>-t</b> +Run each compile and match many times with a timer, and output the resulting +times per compile or match. You can control the number of iterations that are +used for timing by following <b>-t</b> with a number (as a separate item on the +command line). For example, "-t 1000" iterates 1000 times. The default is to +iterate 500,000 times. +</P> +<P> +<b>-tm</b> +This is like <b>-t</b> except that it times only the matching phase, not the +compile phase. +</P> +<P> +<b>-T</b> <b>-TM</b> +These behave like <b>-t</b> and <b>-tm</b>, but in addition, at the end of a run, +the total times for all compiles and matches are output. +</P> +<P> +<b>-version</b> +Output the PCRE2 version number and then exit. +</P> +<br><a name="SEC5" href="#TOC1">DESCRIPTION</a><br> +<P> +If <b>pcre2test</b> is given two filename arguments, it reads from the first and +writes to the second. If it is given only one filename argument, it reads from +that file and writes to stdout. Otherwise, it reads from stdin and writes to +stdout, and prompts for each line of input, using "re>" to prompt for regular +expression patterns, and "data>" to prompt for subject lines. +</P> +<P> +When <b>pcre2test</b> is built, a configuration option can specify that it +should be linked with the <b>libreadline</b> or <b>libedit</b> library. When this +is done, if the input is from a terminal, it is read using the <b>readline()</b> +function. This provides line-editing and history facilities. The output from +the <b>-help</b> option states whether or not <b>readline()</b> will be used. +</P> +<P> +The program handles any number of tests, each of which consists of a set of +input lines. 
Each set starts with a regular expression pattern, followed by any +number of subject lines to be matched against that pattern. In between sets of +test data, command lines that begin with a hash (#) character may appear. This +file format, with some restrictions, can also be processed by the +<b>perltest.pl</b> script that is distributed with PCRE2 as a means of checking +that the behaviour of PCRE2 and Perl is the same. +</P> +<P> +Each subject line is matched separately and independently. If you want to do +multi-line matches, you have to use the \n escape sequence (or \r or \r\n, +etc., depending on the newline setting) in a single line of input to encode the +newline sequences. There is no limit on the length of subject lines; the input +buffer is automatically extended if it is too small. There is a replication +feature that makes it possible to generate long subject lines without having to +supply them explicitly. +</P> +<P> +An empty line or the end of the file signals the end of the subject lines for a +test, at which point a new pattern or command line is expected if there is +still input to be read. +</P> +<br><a name="SEC6" href="#TOC1">COMMAND LINES</a><br> +<P> +In between sets of test data, a line that begins with a hash (#) character is +interpreted as a command line. If the first character is followed by white +space or an exclamation mark, the line is treated as a comment, and ignored. +Otherwise, the following commands are recognized: +<pre> + #forbid_utf +</pre> +Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP +options set, which locks out the use of UTF and Unicode property features. This +is a trigger guard that is used in test files to ensure that UTF/Unicode tests +are not accidentally added to files that are used when UTF support is not +included in the library. 
This effect can also be obtained by the use of +<b>#pattern</b>; the difference is that <b>#forbid_utf</b> cannot be unset, and +the automatic options are not displayed in pattern information, to avoid +cluttering up test output. +<pre> + #pattern <modifier-list> +</pre> +This command sets a default modifier list that applies to all subsequent +patterns. Modifiers on a pattern can change these settings. +<pre> + #perltest +</pre> +The appearance of this line causes all subsequent modifier settings to be +checked for compatibility with the <b>perltest.pl</b> script, which is used to +confirm that Perl gives the same results as PCRE2. Also, apart from comment +lines, none of the other command lines are permitted, because they and many +of the modifiers are specific to <b>pcre2test</b>, and should not be used in +test files that are also processed by <b>perltest.pl</b>. The <b>#perltest</b> +command helps detect tests that are accidentally put in the wrong file. +<pre> + #subject <modifier-list> +</pre> +This command sets a default modifier list that applies to all subsequent +subject lines. Modifiers on a subject line can change these settings. +</P> +<br><a name="SEC7" href="#TOC1">MODIFIER SYNTAX</a><br> +<P> +Modifier lists are used with both pattern and subject lines. Items in a list +are separated by commas and optional white space. Some modifiers may be given +for both patterns and subject lines, whereas others are valid for one or the +other only. Each modifier has a long name, for example "anchored", and some of +them must be followed by an equals sign and a value, for example, "offset=12". +Modifiers that do not take values may be preceded by a minus sign to turn off a +previous default setting. +</P> +<P> +A few of the more common modifiers can also be specified as single letters, for +example "i" for "caseless". In documentation, following the Perl convention, +these are written with a slash ("the /i modifier") for clarity. 
Abbreviated +modifiers must all be concatenated in the first item of a modifier list. If the +first item is not recognized as a long modifier name, it is interpreted as a +sequence of these abbreviations. For example: +<pre> + /abc/ig,newline=cr,jit=3 +</pre> +This is a pattern line whose modifier list starts with two one-letter modifiers +(/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. +</P> +<br><a name="SEC8" href="#TOC1">PATTERN SYNTAX</a><br> +<P> +A pattern line must start with one of the following characters (common symbols, +excluding pattern meta-characters): +<pre> + / ! " ' ` - = _ : ; , % & @ ~ +</pre> +This is interpreted as the pattern's delimiter. A regular expression may be +continued over several input lines, in which case the newline characters are +included within it. It is possible to include the delimiter within the pattern +by escaping it with a backslash, for example +<pre> + /abc\/def/ +</pre> +If you do this, the escape and the delimiter form part of the pattern, but +since the delimiters are all non-alphanumeric, this does not affect its +interpretation. If the terminating delimiter is immediately followed by a +backslash, for example, +<pre> + /abc/\ +</pre> +then a backslash is added to the end of the pattern. This is done to provide a +way of testing the error condition that arises if a pattern finishes with a +backslash, because +<pre> + /abc\/ +</pre> +is interpreted as the first line of a pattern that starts with "abc/", causing +pcre2test to read the next line as a continuation of the regular expression. +</P> +<P> +A pattern can be followed by a modifier list (details below). +</P> +<br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br> +<P> +Before each subject line is passed to <b>pcre2_match()</b> or +<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the +line is scanned for backslash escapes. 
The following provide a means of +encoding non-printing characters in a visible way: +<pre> + \a alarm (BEL, \x07) + \b backspace (\x08) + \e escape (\x1b) + \f form feed (\x0c) + \n newline (\x0a) + \r carriage return (\x0d) + \t tab (\x09) + \v vertical tab (\x0b) + \nnn octal character (up to 3 octal digits); always + a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode + \o{dd...} octal character (any number of octal digits) + \xhh hexadecimal byte (up to 2 hex digits) + \x{hh...} hexadecimal character (any number of hex digits) +</pre> +The use of \x{hh...} is not dependent on the use of the utf modifier on +the pattern. It is recognized always. There may be any number of hexadecimal +digits inside the braces; invalid values provoke error messages. +</P> +<P> +Note that \xhh specifies one byte rather than one character in UTF-8 mode; +this makes it possible to construct invalid UTF-8 sequences for testing +purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in +UTF-8 mode, generating more than one byte if the value is greater than 127. +When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte +for values less than 256, and causes an error for greater values. +</P> +<P> +In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it +possible to construct invalid UTF-16 sequences for testing purposes. +</P> +<P> +In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it +possible to construct invalid UTF-32 sequences for testing purposes. +</P> +<P> +There is a special backslash sequence that specifies replication of one or more +characters: +<pre> + \[<characters>]{<count>} +</pre> +This makes it possible to test long strings without having to provide them as +part of the file. For example: +<pre> + \[abc]{4} +</pre> +is converted to "abcabcabcabc". This feature does not support nesting. To +include a closing square bracket in the characters, code it as \x5D. 
+</P> +<P> +A backslash followed by an equals sign marks the end of the subject string and +the start of a modifier list. For example: +<pre> + abc\=notbol,notempty +</pre> +A backslash followed by any other non-alphanumeric character just escapes that +character. A backslash followed by anything else causes an error. However, if +the very last character in the line is a backslash (and there is no modifier +list), it is ignored. This gives a way of passing an empty line as data, since +a real empty line terminates the data input. +</P> +<br><a name="SEC10" href="#TOC1">PATTERN MODIFIERS</a><br> +<P> +There are three types of modifier that can appear in pattern lines, two of +which may also be used in a <b>#pattern</b> command. A pattern's modifier list +can add to or override default modifiers that were set by a previous +<b>#pattern</b> command. +</P> +<br><b> +Setting compilation options +</b><br> +<P> +The following modifiers set options for <b>pcre2_compile()</b>. The most common +ones have single-letter abbreviations. See +<a href="pcre2api.html"><b>pcre2api</b></a> +for a description of their effects. 
+<pre> + allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS + alt_bsux set PCRE2_ALT_BSUX + anchored set PCRE2_ANCHORED + auto_callout set PCRE2_AUTO_CALLOUT + /i caseless set PCRE2_CASELESS + dollar_endonly set PCRE2_DOLLAR_ENDONLY + /s dotall set PCRE2_DOTALL + dupnames set PCRE2_DUPNAMES + /x extended set PCRE2_EXTENDED + firstline set PCRE2_FIRSTLINE + match_unset_backref set PCRE2_MATCH_UNSET_BACKREF + /m multiline set PCRE2_MULTILINE + never_ucp set PCRE2_NEVER_UCP + never_utf set PCRE2_NEVER_UTF + no_auto_capture set PCRE2_NO_AUTO_CAPTURE + no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_start_optimize set PCRE2_NO_START_OPTIMIZE + no_utf_check set PCRE2_NO_UTF_CHECK + ucp set PCRE2_UCP + ungreedy set PCRE2_UNGREEDY + utf set PCRE2_UTF +</pre> +As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all +non-printing characters in output strings to be printed using the \x{hh...} +notation. Otherwise, those less than 0x100 are output in hex without the curly +brackets. +</P> +<br><b> +Setting compilation controls +</b><br> +<P> +The following modifiers affect the compilation process or request information +about the pattern: +<pre> + bsr=[anycrlf|unicode] specify \R handling + /B bincode show binary code without lengths + debug same as info,fullbincode + fullbincode show binary code with lengths + /I info show info about compiled pattern + hex pattern is coded in hexadecimal + jit[=<number>] use JIT + locale=<name> use this locale + memory show memory used + newline=<type> set newline type + parens_nest_limit=<n> set maximum parentheses depth + perlcompat lock out non-Perl modifiers + posix use the POSIX API + stackguard=<number> test the stackguard feature + tables=[0|1|2] select internal tables + use_length use the pattern's length +</pre> +The effects of these modifiers are described in the following sections. +FIXME: Give more examples. 
+</P> +<br><b> +Newline and \R handling +</b><br> +<P> +The <b>bsr</b> modifier specifies what \R in a pattern should match. If it is +set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to "unicode", +\R matches any Unicode newline sequence. The default is specified when PCRE2 +is built, with the default default being Unicode. +</P> +<P> +The <b>newline</b> modifier specifies which characters are to be interpreted as +newlines, both in the pattern and (by default) in subject lines. The type must +be one of CR, LF, CRLF, ANYCRLF, or ANY. +</P> +<P> +Both the \R and newline settings can be changed at match time, but if this is +done, JIT matching is disabled. +</P> +<br><b> +Information about a pattern +</b><br> +<P> +The <b>debug</b> modifier is a shorthand for <b>info,fullbincode</b>, requesting +all available information. +</P> +<P> +The <b>bincode</b> modifier causes a representation of the compiled code to be +output after compilation. This information does not contain length and offset +values, which ensures that the same output is generated for different internal +link sizes and different code unit widths. By using <b>bincode</b>, the same +regression tests can be used in different environments. +</P> +<P> +The <b>fullbincode</b> modifier, by contrast, <i>does</i> include length and +offset values. This is used in a few special tests and is also useful for +one-off tests. +</P> +<P> +The <b>info</b> modifier requests information about the compiled pattern +(whether it is anchored, has a fixed first character, and so on). The +information is obtained from the <b>pcre2_pattern_info()</b> function. +</P> +<br><b> +Specifying a pattern in hex +</b><br> +<P> +The <b>hex</b> modifier specifies that the characters of the pattern are to be +interpreted as pairs of hexadecimal digits. White space is permitted between +pairs. 
For example: +<pre> + /ab 32 59/hex +</pre> +This feature is provided as a way of creating patterns that contain binary zero +characters. When <b>hex</b> is set, it implies <b>use_length</b>. +</P> +<br><b> +Using the pattern's length +</b><br> +<P> +By default, <b>pcre2test</b> passes patterns as zero-terminated strings to +<b>pcre2_compile()</b>, giving the length as -1. If <b>use_length</b> is set, the +length of the pattern is passed. This is implied if <b>hex</b> is set. +</P> +<br><b> +JIT compilation +</b><br> +<P> +The <b>/jit</b> modifier may optionally be followed by a number in the range 0 +to 7: +<pre> + 0 disable JIT + 1 normal match only + 2 soft partial match only + 3 normal match and soft partial match + 4 hard partial match only + 6 soft and hard partial match + 7 all three modes +</pre> +If no number is given, 7 is assumed. If JIT compilation is successful, the +compiled JIT code will automatically be used when <b>pcre2_match()</b> is run, +except when incompatible run-time options are specified. For more details, see +the +<a href="pcre2jit.html"><b>pcre2jit</b></a> +documentation. See also the <b>jitstack</b> modifier below for a way of +setting the size of the JIT stack. +</P> +<P> +If the <b>jitverify</b> modifier is specified, the text "(JIT)" is added to the +first output line after a match or non match when JIT-compiled code was +actually used. This modifier can also be set on a subject line. +</P> +<br><b> +Setting a locale +</b><br> +<P> +The <b>/locale</b> modifier must specify the name of a locale, for example: +<pre> + /pattern/locale=fr_FR +</pre> +The given locale is set, <b>pcre2_maketables()</b> is called to build a set of +character tables for the locale, and this is then passed to +<b>pcre2_compile()</b> when compiling the regular expression. The same tables +are used when matching the following subject lines. 
The <b>/locale</b> modifier +applies only to the pattern on which it appears, but can be given in a +<b>#pattern</b> command if a default is needed. Setting a locale and alternate +character tables are mutually exclusive. +</P> +<br><b> +Showing pattern memory +</b><br> +<P> +The <b>/memory</b> modifier causes the size in bytes of the memory block used to +hold the compiled pattern to be output. This does not include the size of the +<b>pcre2_code</b> block; it is just the actual compiled data. If the pattern is +subsequently passed to the JIT compiler, the size of the JIT compiled code is +also output. +</P> +<br><b> +Limiting nested parentheses +</b><br> +<P> +The <b>parens_nest_limit</b> modifier sets a limit on the depth of nested +parentheses in a pattern. Breaching the limit causes a compilation error. +</P> +<br><b> +Using the POSIX wrapper API +</b><br> +<P> +The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX +wrapper API rather than its native API. This supports only the 8-bit library. +When the POSIX API is being used, the following pattern modifiers set options +for the <b>regcomp()</b> function: +<pre> + caseless REG_ICASE + multiline REG_NEWLINE + no_auto_capture REG_NOSUB + dotall REG_DOTALL ) + ungreedy REG_UNGREEDY ) These options are not part of + ucp REG_UCP ) the POSIX standard + utf REG_UTF8 ) +</pre> +The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described +below. All other modifiers cause an error. +</P> +<br><b> +Testing the stack guard feature +</b><br> +<P> +The <b>/stackguard</b> modifier is used to test the use of +<b>pcre2_set_compile_recursion_guard()</b>, a function that is provided to +enable stack availability to be checked during compilation (see the +<a href="pcre2api.html"><b>pcre2api</b></a> +documentation for details). 
If the number specified by the modifier is greater +than zero, <b>pcre2_set_compile_recursion_guard()</b> is called to set up +callback from <b>pcre2_compile()</b> to a local function. The argument it is +passed is the current nesting parenthesis depth; if this is greater than the +value given by the modifier, non-zero is returned, causing the compilation to +be aborted. +</P> +<br><b> +Using alternative character tables +</b><br> +<P> +The <b>/tables</b> modifier must be followed by a single digit. It causes a +specific set of built-in character tables to be passed to +<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with +different character tables. The digit specifies the tables as follows: +<pre> + 0 do not pass any special character tables + 1 the default ASCII tables, as distributed in + pcre2_chartables.c.dist + 2 a set of tables defining ISO 8859 characters +</pre> +In table 2, some characters whose codes are greater than 128 are identified as +letters, digits, spaces, etc. Setting alternate character tables and a locale +are mutually exclusive. +</P> +<br><b> +Setting certain match controls +</b><br> +<P> +The following modifiers are really subject modifiers, and are described below. +However, they may be included in a pattern's modifier list, in which case they +are applied to every subject line that is processed with that pattern. They do +not affect the compilation process. +<pre> + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + /g global global matching + jitverify verify JIT usage + mark show mark values +</pre> +These modifiers may not appear in a <b>#pattern</b> command. If you want them as +defaults, set them in a <b>#subject</b> command. +</P> +<br><a name="SEC11" href="#TOC1">SUBJECT MODIFIERS</a><br> +<P> +The modifiers that can appear in subject lines and the <b>#subject</b> +command are of two types. 
+</P> +<br><b> +Setting match options +</b><br> +<P> +The following modifiers set options for <b>pcre2_match()</b> or +<b>pcre2_dfa_match()</b>. See +<a href="pcre2api.html"><b>pcre2api</b></a> +for a description of their effects. +<pre> + anchored set PCRE2_ANCHORED + dfa_restart set PCRE2_DFA_RESTART + dfa_shortest set PCRE2_DFA_SHORTEST + no_start_optimize set PCRE2_NO_START_OPTIMIZE + no_utf_check set PCRE2_NO_UTF_CHECK + notbol set PCRE2_NOTBOL + notempty set PCRE2_NOTEMPTY + notempty_atstart set PCRE2_NOTEMPTY_ATSTART + noteol set PCRE2_NOTEOL + partial_hard (or ph) set PCRE2_PARTIAL_HARD + partial_soft (or ps) set PCRE2_PARTIAL_SOFT +</pre> +The partial matching modifiers are provided with abbreviations because they +appear frequently in tests. +</P> +<P> +If the <b>/posix</b> modifier was present on the pattern, causing the POSIX +wrapper API to be used, the only option-setting modifiers that have any effect +are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL, +REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>. +Any other modifiers cause an error. +</P> +<br><b> +Setting match controls +</b><br> +<P> +The following modifiers affect the matching process or request additional +information. Some of them may also be specified on a pattern line (see above), +in which case they apply to every subject line that is matched against that +pattern. 
+<pre> + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + altglobal alternative global matching + bsr=[anycrlf|unicode] specify \R handling + callout_capture show captures at callout time + callout_data=<n> set a value to pass via callouts + callout_fail=<n>[:<m>] control callout failure + callout_none do not supply a callout function + copy=<number or name> copy captured substring + dfa use <b>pcre2_dfa_match()</b> + find_limits find match and recursion limits + get=<number or name> extract captured substring + getall extract all captured substrings + /g global global matching + jitstack=<n> set size of JIT stack + jitverify verify JIT usage + mark show mark values + match_limit=<n> set a match limit + memory show memory usage + newline=<type> set newline type + offset=<n> set starting offset + ovector=<n> set size of output vector + recursion_limit=<n> set a recursion limit +</pre> +The effects of these modifiers are described in the following sections. +FIXME: Give more examples. +</P> +<br><b> +Newline and \R handling +</b><br> +<P> +These modifiers set the newline and \R processing conventions for the subject +line, overriding any values that were set at compile time (as described above). +JIT matching is disabled if these settings are changed at match time. +</P> +<br><b> +Showing more text +</b><br> +<P> +The <b>aftertext</b> modifier requests that as well as outputting the substring +that matched the entire pattern, <b>pcre2test</b> should in addition output the +remainder of the subject string. This is useful for tests where the subject +contains multiple copies of the same substring. The <b>allaftertext</b> modifier +requests the same action for captured substrings as well as the main matched +substring. In each case the remainder is output on the following line with a +plus character following the capture number. 
+</P> +<P> +The <b>allusedtext</b> modifier requests that all the text that was consulted +during a successful pattern match be shown. This affects the output if there +is a lookbehind at the start of a match, or a lookahead at the end, or if \K +is used in the pattern. Characters that precede or follow the start and end of +the actual match are indicated in the output by '<' or '>' characters +underneath them. Here is an example: +<pre> + /(?<=pqr)abc(?=xyz)/ + 123pqrabcxyz456\=allusedtext + 0: pqrabcxyz + <<< >>> +</pre> +This shows that the matched string is "abc", with the preceding and following +strings "pqr" and "xyz" also consulted during the match. +</P> +<br><b> +Showing the value of all capture groups +</b><br> +<P> +The <b>allcaptures</b> modifier requests that the values of all potential +captured parentheses be output after a match. By default, only those up to the +highest one actually used in the match are output (corresponding to the return +code from <b>pcre2_match()</b>). Groups that did not take part in the match +are output as "<unset>". +</P> +<br><b> +Testing callouts +</b><br> +<P> +A callout function is supplied when <b>pcre2test</b> calls the library matching +functions, unless <b>callout_none</b> is specified. If <b>callout_capture</b> is +set, the current captured groups are output when a callout occurs. +</P> +<P> +The <b>callout_fail</b> modifier can be given one or two numbers. If there is +only one number, 1 is returned instead of 0 when a callout of that number is +reached. If two numbers are given, 1 is returned when callout <n> is reached +for the <m>th time. +</P> +<P> +The <b>callout_data</b> modifier can be given an unsigned or a negative number. +Any value other than zero is used as a return from <b>pcre2test</b>'s callout +function. 
+</P> +<br><b> +Testing substring extraction functions +</b><br> +<P> +The <b>copy</b> and <b>get</b> modifiers can be used to test the +<b>pcre2_substring_copy_xxx()</b> and <b>pcre2_substring_get_xxx()</b> functions. +They can be given more than once, and each can specify a group name or number, +for example: +<pre> + abcd\=copy=1,copy=3,get=G1 +</pre> +If the <b>#subject</b> command is used to set default copy and get lists, these +can be unset by specifying a negative number for numbered groups and an empty +name for named groups. +</P> +<P> +The <b>getall</b> modifier tests <b>pcre2_substring_list_get()</b>, which +extracts all captured substrings. +</P> +<P> +If the subject line is successfully matched, the substrings extracted by the +convenience functions are output with C, G, or L after the string number +instead of a colon. This is in addition to the normal full list. The string +length (that is, the return from the extraction function) is given in +parentheses after each substring. +</P> +<br><b> +Finding all matches in a string +</b><br> +<P> +Searching for all possible matches within a subject can be requested by the +<b>global</b> or <b>/altglobal</b> modifier. After finding a match, the matching +function is called again to search the remainder of the subject. The difference +between <b>global</b> and <b>altglobal</b> is that the former uses the +<i>start_offset</i> argument to <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b> +to start searching at a new point within the entire string (which is what Perl +does), whereas the latter passes over a shortened substring. This makes a +difference to the matching process if the pattern begins with a lookbehind +assertion (including \b or \B). +</P> +<P> +If an empty string is matched, the next match is done with the +PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for +another, non-empty, match at the same point in the subject. 
If this match +fails, the start offset is advanced, and the normal match is retried. This +imitates the way Perl handles such cases when using the <b>/g</b> modifier or +the <b>split()</b> function. Normally, the start offset is advanced by one +character, but if the newline convention recognizes CRLF as a newline, and the +current character is CR followed by LF, an advance of two is used. +</P> +<br><b> +Setting the JIT stack size +</b><br> +<P> +The <b>jitstack</b> modifier provides a way of setting the maximum stack size +that is used by the just-in-time optimization code. It is ignored if JIT +optimization is not being used. Providing a stack that is larger than the +default 32K is necessary only for very complicated patterns. +</P> +<br><b> +Setting match and recursion limits +</b><br> +<P> +The <b>match_limit</b> and <b>recursion_limit</b> modifiers set the appropriate +limits in the match context. These values are ignored when the +<b>find_limits</b> modifier is specified. +</P> +<br><b> +Finding minimum limits +</b><br> +<P> +If the <b>find_limits</b> modifier is present, <b>pcre2test</b> calls +<b>pcre2_match()</b> several times, setting different values in the match +context via <b>pcre2_set_match_limit()</b> and <b>pcre2_set_recursion_limit()</b> +until it finds the minimum values for each parameter that allow +<b>pcre2_match()</b> to complete without error. +</P> +<P> +The <i>match_limit</i> number is a measure of the amount of backtracking +that takes place, and learning the minimum value can be instructive. For most +simple matches, the number is quite small, but for patterns with very large +numbers of matching possibilities, it can become large very quickly with +increasing length of subject string. The <i>match_limit_recursion</i> number is +a measure of how much stack (or, if PCRE2 is compiled with NO_RECURSE, how much +heap) memory is needed to complete the match attempt. 
+</P> +<br><b> +Showing MARK names +</b><br> +<P> +The <b>mark</b> modifier causes the names from backtracking control verbs that +are returned from calls to <b>pcre2_match()</b> to be displayed. If a mark is +returned for a match, non-match, or partial match, <b>pcre2test</b> shows it. +For a match, it is on a line by itself, tagged with "MK:". Otherwise, it +is added to the non-match message. +</P> +<br><b> +Showing memory usage +</b><br> +<P> +The <b>memory</b> modifier causes <b>pcre2test</b> to log all memory allocation +and freeing calls that occur during a match operation. +</P> +<br><b> +Setting a starting offset +</b><br> +<P> +The <b>offset</b> modifier sets an offset in the subject string at which +matching starts. Its value is a number of code units, not characters. +</P> +<br><b> +Setting the size of the output vector +</b><br> +<P> +The <b>ovector</b> modifier applies only to the subject line in which it +appears, though of course it can also be used to set a default in a +<b>#subject</b> command. It specifies the number of pairs of offsets that are +available for storing matching information. The default is 15. +</P> +<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br> +<P> +By default, <b>pcre2test</b> uses the standard PCRE2 matching function, +<b>pcre2_match()</b> to match each subject line. PCRE2 also supports an +alternative matching function, <b>pcre2_dfa_match()</b>, which operates in a +different way, and has some restrictions. The differences between the two +functions are described in the +<a href="pcre2matching.html"><b>pcre2matching</b></a> +documentation. +</P> +<P> +If the <b>dfa</b> modifier is set, the alternative matching function is used. +This function finds all possible matches at a given point in the subject. If, +however, the <b>dfa_shortest</b> modifier is set, processing stops after the +first match is found. This is always the shortest possible match. 
+</P> +<br><a name="SEC13" href="#TOC1">DEFAULT OUTPUT FROM pcre2test</a><br> +<P> +This section describes the output when the normal matching function, +<b>pcre2_match()</b>, is being used. +</P> +<P> +When a match succeeds, <b>pcre2test</b> outputs the list of captured substrings, +starting with number 0 for the string that matched the whole pattern. +Otherwise, it outputs "No match" when the return is PCRE2_ERROR_NOMATCH, or +"Partial match:" followed by the partially matching substring when the +return is PCRE2_ERROR_PARTIAL. (Note that this is the +entire substring that was inspected during the partial match; it may include +characters before the actual match start if a lookbehind assertion, \K, \b, +or \B was involved.) +</P> +<P> +For any other return, <b>pcre2test</b> outputs the PCRE2 +negative error number and a short descriptive phrase. If the error is a failed +UTF string check, the offset of the start of the failing character and the +reason code are also output. Here is an example of an interactive +<b>pcre2test</b> run. +<pre> + $ pcre2test + PCRE2 version 9.00 2014-05-10 + + re> /^abc(\d+)/ + data> abc123 + 0: abc123 + 1: 123 + data> xyz + No match +</pre> +Unset capturing substrings that are not followed by one that is set are not +returned by <b>pcre2_match()</b>, and are not shown by <b>pcre2test</b>. In the +following example, there are two capturing substrings, but when the first data +line is matched, the second, unset substring is not shown. An "internal" unset +substring is shown as "<unset>", as for the second data line. +<pre> + re> /(a)|(b)/ + data> a + 0: a + 1: a + data> b + 0: b + 1: <unset> + 2: b +</pre> +If the strings contain any non-printing characters, they are output as \xhh +escapes if the value is less than 256 and UTF mode is not set. Otherwise they +are output as \x{hh...} escapes. See below for the definition of non-printing +characters. 
If the <b>/aftertext</b> modifier is set, the output for substring +0 is followed by the rest of the subject string, identified by "0+" like +this: +<pre> + re> /cat/aftertext + data> cataract + 0: cat + 0+ aract +</pre> +If global matching is requested, the results of successive matching attempts +are output in sequence, like this: +<pre> + re> /\Bi(\w\w)/g + data> Mississippi + 0: iss + 1: ss + 0: iss + 1: ss + 0: ipp + 1: pp +</pre> +"No match" is output only if the first match attempt fails. Here is an example +of a failure message (the offset 4 that is specified by the <b>offset</b> modifier is past the end of +the subject string): +<pre> + re> /xyz/ + data> xyz\=offset=4 + Error -24 (bad offset value) +</PRE> +</P> +<P> +Note that whereas patterns can be continued over several lines (a plain ">" +prompt is used for continuations), subject lines may not. However newlines can +be included in a subject by means of the \n escape (or \r, \r\n, etc., +depending on the newline sequence setting). +</P> +<br><a name="SEC14" href="#TOC1">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a><br> +<P> +When the alternative matching function, <b>pcre2_dfa_match()</b>, is used, the +output consists of a list of all the matches that start at the first point in +the subject where there is at least one match. For example: +<pre> + re> /(tang|tangerine|tan)/ + data> yellow tangerine\=dfa + 0: tangerine + 1: tang + 2: tan +</pre> +(Using the normal matching function on this data finds only "tang".) The +longest matching string is always given first (and numbered zero). After a +PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the +partially matching substring. (Note that this is the entire substring that was +inspected during the partial match; it may include characters before the actual +match start if a lookbehind assertion, \K, \b, or \B was involved.) +</P> +<P> +If global matching is requested, the search for further matches resumes +at the end of the longest match. 
For example: +<pre> + re> /(tang|tangerine|tan)/g + data> yellow tangerine and tangy sultana\=dfa + 0: tangerine + 1: tang + 2: tan + 0: tang + 1: tan + 0: tan +</pre> +The alternative matching function does not support substring capture, so the +modifiers that are concerned with captured substrings are not relevant. +</P> +<br><a name="SEC15" href="#TOC1">RESTARTING AFTER A PARTIAL MATCH</a><br> +<P> +When the alternative matching function has given the PCRE2_ERROR_PARTIAL +return, indicating that the subject partially matched the pattern, you can +restart the match with additional subject data by means of the +<b>dfa_restart</b> modifier. For example: +<pre> + re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ + data> 23ja\=P,dfa + Partial match: 23ja + data> n05\=dfa,dfa_restart + 0: n05 +</pre> +For further information about partial matching, see the +<a href="pcre2partial.html"><b>pcre2partial</b></a> +documentation. +</P> +<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br> +<P> +If the pattern contains any callout requests, <b>pcre2test</b>'s callout function +is called during matching. This works with both matching functions. By default, +the called function displays the callout number, the start and current +positions in the text at the callout time, and the next pattern item to be +tested. For example: +<pre> + --->pqrabcdef + 0 ^ ^ \d +</pre> +This output indicates that callout number 0 occurred for a match attempt +starting at the fourth character of the subject string, when the pointer was at +the seventh character, and when the next pattern item was \d. Just +one circumflex is output if the start and current positions are the same. +</P> +<P> +Callouts numbered 255 are assumed to be automatic callouts, inserted as a +result of the <b>/auto_callout</b> pattern modifier. In this case, instead of +showing the callout number, the offset in the pattern, preceded by a plus, is +output. 
For example: +<pre> + re> /\d?[A-E]\*/auto_callout + data> E* + --->E* + +0 ^ \d? + +3 ^ [A-E] + +8 ^^ \* + +10 ^ ^ + 0: E* +</pre> +If a pattern contains (*MARK) items, an additional line is output whenever +a change of latest mark is passed to the callout function. For example: +<pre> + re> /a(*MARK:X)bc/auto_callout + data> abc + --->abc + +0 ^ a + +1 ^^ (*MARK:X) + +10 ^^ b + Latest Mark: X + +11 ^ ^ c + +12 ^ ^ + 0: abc +</pre> +The mark changes between matching "a" and "b", but stays the same for the rest +of the match, so nothing more is output. If, as a result of backtracking, the +mark reverts to being unset, the text "<unset>" is output. +</P> +<P> +The callout function in <b>pcre2test</b> returns zero (carry on matching) by +default, but you can use a <b>callout_fail</b> modifier in a subject line (as +described above) to change this and other parameters of the callout. +</P> +<P> +Inserting callouts can be helpful when using <b>pcre2test</b> to check +complicated regular expressions. For further information about callouts, see +the +<a href="pcre2callout.html"><b>pcre2callout</b></a> +documentation. +</P> +<br><a name="SEC17" href="#TOC1">NON-PRINTING CHARACTERS</a><br> +<P> +When <b>pcre2test</b> is outputting text in the compiled version of a pattern, +bytes other than 32-126 are always treated as non-printing characters and are +therefore shown as hex escapes. +</P> +<P> +When <b>pcre2test</b> is outputting text that is a matched part of a subject +string, it behaves in the same way, unless a different locale has been set for +the pattern (using the <b>/locale</b> modifier). In this case, the +<b>isprint()</b> function is used to distinguish printing and non-printing +characters. 
+</P> +<br><a name="SEC18" href="#TOC1">SEE ALSO</a><br> +<P> +<b>pcre2</b>(3), <b>pcre16</b>(3), <b>pcre32</b>(3), <b>pcre2api</b>(3), +<b>pcre2callout</b>(3), +<b>pcre2jit</b>(3), <b>pcre2matching</b>(3), <b>pcre2partial</b>(3), +<b>pcre2pattern</b>(3), <b>pcre2precompile</b>(3). +</P> +<br><a name="SEC19" href="#TOC1">AUTHOR</a><br> +<P> +Philip Hazel +<br> +University Computing Service +<br> +Cambridge CB2 3QH, England. +<br> +</P> +<br><a name="SEC20" href="#TOC1">REVISION</a><br> +<P> +Last updated: 19 August 2014 +<br> +Copyright © 1997-2014 University of Cambridge. +<br> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> diff --git a/doc/html/pcre2unicode.html b/doc/html/pcre2unicode.html new file mode 100644 index 0000000..bbefd02 --- /dev/null +++ b/doc/html/pcre2unicode.html @@ -0,0 +1,270 @@ +<html> +<head> +<title>pcre2unicode specification</title> +</head> +<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB"> +<h1>pcre2unicode man page</h1> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. +</p> +<p> +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +<br> +<br><b> +UNICODE AND UTF SUPPORT +</b><br> +<P> +When PCRE2 is built with Unicode support, it acquires knowledge of Unicode +character properties and can process text strings in UTF-8, UTF-16, or UTF-32 +format (depending on the code unit width). By default, PCRE2 assumes that one +code unit is one character. To process a pattern as a UTF string, where a +character may require more than one code unit, you must call +<a href="pcre2_compile.html"><b>pcre2_compile()</b></a> +with the PCRE2_UTF option flag, or the pattern must start with the sequence +(*UTF). 
When either of these is the case, both the pattern and any subject +strings that are matched against it are treated as UTF strings instead of +strings of individual one-code-unit characters. +</P> +<P> +If you build PCRE2 with Unicode support, the library will be bigger, but the +additional run time overhead is limited to testing the PCRE2_UTF flag +occasionally, so should not be very much. +</P> +<br><b> +UNICODE PROPERTY SUPPORT +</b><br> +<P> +When PCRE2 is built with Unicode support, the escape sequences \p{..}, +\P{..}, and \X can be used. The Unicode properties that can be tested are +limited to the general category properties such as Lu for an upper case letter +or Nd for a decimal number, the Unicode script names such as Arabic or Han, and +the derived properties Any and L&. Full lists are given in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +and +<a href="pcre2syntax.html"><b>pcre2syntax</b></a> +documentation. Only the short names for properties are supported. For example, +\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported. +Furthermore, in Perl, many properties may optionally be prefixed by "Is", for +compatibility with Perl 5.6. PCRE does not support this. +</P> +<br><b> +WIDE CHARACTERS AND UTF MODES +</b><br> +<P> +Codepoints less than 256 can be specified in patterns by either braced or +unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). Larger +values have to use braced sequences. Unbraced octal code points up to \777 are +also recognized; larger ones can be coded using \o{...}. +</P> +<P> +In UTF modes, repeat quantifiers apply to complete UTF characters, not to +individual code units. +</P> +<P> +In UTF modes, the dot metacharacter matches one UTF character instead of a +single code unit. 
+</P> +<P> +The escape sequence \C can be used to match a single code unit, in a UTF mode, +but its use can lead to some strange effects because it breaks up multi-unit +characters (see the description of \C in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +documentation). The use of \C is not supported in the alternative matching +function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT +optimization. If JIT optimization is requested for a UTF pattern that contains +\C, it will not succeed, and so the matching will be carried out by the normal +interpretive function. +</P> +<P> +The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test +characters of any code value, but, by default, the characters that PCRE2 +recognizes as digits, spaces, or word characters remain the same set as in +non-UTF mode, all with code points less than 256. This remains true even when +PCRE2 is built to include Unicode support, because to do otherwise would slow +down matching in many common cases. Note that this also applies to \b +and \B, because they are defined in terms of \w and \W. If you want +to test for a wider sense of, say, "digit", you can use explicit Unicode +property tests such as \p{Nd}. Alternatively, if you set the PCRE2_UCP option, +the way that the character escapes work is changed so that Unicode properties +are used to determine which characters match. There are more details in the +section on +<a href="pcre2pattern.html#genericchartypes">generic character types</a> +in the +<a href="pcre2pattern.html"><b>pcre2pattern</b></a> +documentation. +</P> +<P> +Similarly, characters that match the POSIX named character classes are all +low-valued characters, unless the PCRE2_UCP option is set. +</P> +<P> +However, the special horizontal and vertical white space matching escapes (\h, +\H, \v, and \V) do match all the appropriate Unicode characters, whether or +not PCRE2_UCP is set. 
+</P> +<P> +Case-insensitive matching in UTF mode makes use of Unicode properties. A few +Unicode characters such as Greek sigma have more than two codepoints that are +case-equivalent, and these are treated as such. +</P> +<br><b> +VALIDITY OF UTF STRINGS +</b><br> +<P> +When the PCRE2_UTF option is set, the strings passed as patterns and subjects +are (by default) checked for validity on entry to the relevant functions. +If an invalid UTF string is passed, an error return is given. +</P> +<P> +UTF-16 and UTF-32 strings can indicate their endianness by special code known +as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting +strings to be in host byte order. +</P> +<P> +The entire string is checked before any other processing takes place. In +addition to checking the format of the string, there is a check to ensure that +all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. +The so-called "non-character" code points are not excluded because Unicode +corrigendum #9 makes it clear that they should not be. +</P> +<P> +Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, +where they are used in pairs to encode code points with values greater than +0xFFFF. The code points that are encoded by UTF-16 pairs are available +independently in the UTF-8 and UTF-32 encodings. (In other words, the whole +surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and +UTF-32.) +</P> +<P> +In some situations, you may already know that your strings are valid, and +therefore want to skip these checks in order to improve performance, for +example in the case of a long subject string that is being scanned repeatedly. +If you set the PCRE2_NO_UTF_CHECK flag at compile time or at run time, PCRE2 +assumes that the pattern or subject it is given (respectively) contains only +valid UTF code unit sequences. 
+</P> +<P> +Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the check for +the pattern; it does not also apply to subject strings. If you want to disable +the check for a subject string you must pass this option to <b>pcre2_exec()</b> +or <b>pcre2_dfa_exec()</b>. +</P> +<P> +If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result +is undefined and your program may crash or loop indefinitely. +<a name="utf8strings"></a></P> +<br><b> +Errors in UTF-8 strings +</b><br> +<P> +The following negative error codes are given for invalid UTF-8 strings: +<pre> + PCRE2_ERROR_UTF8_ERR1 + PCRE2_ERROR_UTF8_ERR2 + PCRE2_ERROR_UTF8_ERR3 + PCRE2_ERROR_UTF8_ERR4 + PCRE2_ERROR_UTF8_ERR5 +</pre> +The string ends with a truncated UTF-8 character; the code specifies how many +bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be +no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279) +allows for up to 6 bytes, and this is checked first; hence the possibility of +4 or 5 missing bytes. +<pre> + PCRE2_ERROR_UTF8_ERR6 + PCRE2_ERROR_UTF8_ERR7 + PCRE2_ERROR_UTF8_ERR8 + PCRE2_ERROR_UTF8_ERR9 + PCRE2_ERROR_UTF8_ERR10 +</pre> +The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the +character do not have the binary value 0b10 (that is, either the most +significant bit is 0, or the next bit is 1). +<pre> + PCRE2_ERROR_UTF8_ERR11 + PCRE2_ERROR_UTF8_ERR12 +</pre> +A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; +these code points are excluded by RFC 3629. +<pre> + PCRE2_ERROR_UTF8_ERR13 +</pre> +A 4-byte character has a value greater than 0x10ffff; these code points are +excluded by RFC 3629. +<pre> + PCRE2_ERROR_UTF8_ERR14 +</pre> +A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of +code points are reserved by RFC 3629 for use with UTF-16, and so are excluded +from UTF-8. 
+<pre> + PCRE2_ERROR_UTF8_ERR15 + PCRE2_ERROR_UTF8_ERR16 + PCRE2_ERROR_UTF8_ERR17 + PCRE2_ERROR_UTF8_ERR18 + PCRE2_ERROR_UTF8_ERR19 +</pre> +A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a +value that can be represented by fewer bytes, which is invalid. For example, +the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just +one byte. +<pre> + PCRE2_ERROR_UTF8_ERR20 +</pre> +The two most significant bits of the first byte of a character have the binary +value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a +byte can only validly occur as the second or subsequent byte of a multi-byte +character. +<pre> + PCRE2_ERROR_UTF8_ERR21 +</pre> +The first byte of a character has the value 0xfe or 0xff. These values can +never occur in a valid UTF-8 string. +<a name="utf16strings"></a></P> +<br><b> +Errors in UTF-16 strings +</b><br> +<P> +The following negative error codes are given for invalid UTF-16 strings: +<pre> + PCRE_UTF16_ERR1 Missing low surrogate at end of string + PCRE_UTF16_ERR2 Invalid low surrogate follows high surrogate + PCRE_UTF16_ERR3 Isolated low surrogate + +<a name="utf32strings"></a></PRE> +</P> +<br><b> +Errors in UTF-32 strings +</b><br> +<P> +The following negative error codes are given for invalid UTF-32 strings: +<pre> + PCRE_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff) + PCRE_UTF32_ERR2 Code point is greater than 0x10ffff + +</PRE> +</P> +<br><b> +AUTHOR +</b><br> +<P> +Philip Hazel +<br> +University Computing Service +<br> +Cambridge CB2 3QH, England. +<br> +</P> +<br><b> +REVISION +</b><br> +<P> +Last updated: 16 September 2014 +<br> +Copyright © 1997-2014 University of Cambridge. +<br> +<p> +Return to the <a href="index.html">PCRE2 index page</a>. 
+</p> diff --git a/doc/index.html.src b/doc/index.html.src new file mode 100644 index 0000000..4e264ec --- /dev/null +++ b/doc/index.html.src @@ -0,0 +1,177 @@ +<html> +<!-- This is a manually maintained file that is the root of the HTML version of + the PCRE2 documentation. When the HTML documents are built from the man + page versions, the entire doc/html directory is emptied, this file is then + copied into doc/html/index.html, and the remaining files therein are + created by the 132html script. +--> +<head> +<title>PCRE2 specification</title> +</head> +<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB"> +<h1>Perl-compatible Regular Expressions (revised API: PCRE2)</h1> +<p> +The HTML documentation for PCRE2 consists of a number of pages that are listed +below in alphabetical order. If you are new to PCRE2, please read the first one +first. +</p> + +<table> +<tr><td><a href="pcre2.html">pcre2</a></td> + <td>  Introductory page</td></tr> + +<tr><td><a href="pcre2-config.html">pcre2-config</a></td> + <td>  Information about the installation configuration</td></tr> + +<tr><td><a href="pcre2api.html">pcre2api</a></td> + <td>  PCRE2's native API</td></tr> + +<tr><td><a href="pcre2build.html">pcre2build</a></td> + <td>  Building PCRE2</td></tr> + +<tr><td><a href="pcre2callout.html">pcre2callout</a></td> + <td>  The <i>callout</i> facility</td></tr> + +<tr><td><a href="pcre2compat.html">pcre2compat</a></td> + <td>  Compatibility with Perl</td></tr> + +<tr><td><a href="pcre2demo.html">pcre2demo</a></td> + <td>  A demonstration C program that uses the PCRE2 library</td></tr> + +<tr><td><a href="pcre2grep.html">pcre2grep</a></td> + <td>  The <b>pcre2grep</b> command</td></tr> + +<tr><td><a href="pcre2jit.html">pcre2jit</a></td> + <td>  Discussion of the just-in-time optimization support</td></tr> + +<tr><td><a href="pcre2limits.html">pcre2limits</a></td> + <td>  Details of size and other limits</td></tr> + +<tr><td><a 
href="pcre2matching.html">pcre2matching</a></td> + <td> Discussion of the two matching algorithms</td></tr> + +<tr><td><a href="pcre2partial.html">pcre2partial</a></td> + <td> Using PCRE2 for partial matching</td></tr> + +<tr><td><a href="pcre2pattern.html">pcre2pattern</a></td> + <td> Specification of the regular expressions supported by PCRE2</td></tr> + +<tr><td><a href="pcre2perform.html">pcre2perform</a></td> + <td> Some comments on performance</td></tr> + +<tr><td><a href="pcre2posix.html">pcre2posix</a></td> + <td> The POSIX API to the PCRE2 8-bit library</td></tr> + +<tr><td><a href="pcre2precompile.html">pcre2precompile</a></td> + <td> How to save and re-use compiled patterns</td></tr> + +<tr><td><a href="pcre2sample.html">pcre2sample</a></td> + <td> Discussion of the pcre2demo program</td></tr> + +<tr><td><a href="pcre2stack.html">pcre2stack</a></td> + <td> Discussion of PCRE2's stack usage</td></tr> + +<tr><td><a href="pcre2syntax.html">pcre2syntax</a></td> + <td> Syntax quick-reference summary</td></tr> + +<tr><td><a href="pcre2test.html">pcre2test</a></td> + <td> The <b>pcre2test</b> command for testing PCRE2</td></tr> + +<tr><td><a href="pcre2unicode.html">pcre2unicode</a></td> + <td> Discussion of Unicode and UTF-8/UTF-16/UTF-32 support</td></tr> +</table> + +<p> +There are also individual pages that summarize the interface for each function +in the library. There is a single page for each triple of 8-bit/16-bit/32-bit +functions. 
+</p> + +<table> + +<tr><td><a href="pcre2_assign_jit_stack.html">pcre2_assign_jit_stack</a></td> + <td> Assign stack for JIT matching</td></tr> + +<tr><td><a href="pcre2_compile.html">pcre2_compile</a></td> + <td> Compile a regular expression</td></tr> + +<tr><td><a href="pcre2_compile2.html">pcre2_compile2</a></td> + <td> Compile a regular expression (alternate interface)</td></tr> + +<tr><td><a href="pcre2_config.html">pcre2_config</a></td> + <td> Show build-time configuration options</td></tr> + +<tr><td><a href="pcre2_copy_named_substring.html">pcre2_copy_named_substring</a></td> + <td> Extract named substring into given buffer</td></tr> + +<tr><td><a href="pcre2_copy_substring.html">pcre2_copy_substring</a></td> + <td> Extract numbered substring into given buffer</td></tr> + +<tr><td><a href="pcre2_dfa_exec.html">pcre2_dfa_exec</a></td> + <td> Match a compiled pattern to a subject string + (DFA algorithm; <i>not</i> Perl compatible)</td></tr> + +<tr><td><a href="pcre2_exec.html">pcre2_exec</a></td> + <td> Match a compiled pattern to a subject string + (Perl compatible)</td></tr> + +<tr><td><a href="pcre2_free_study.html">pcre2_free_study</a></td> + <td> Free study data</td></tr> + +<tr><td><a href="pcre2_free_substring.html">pcre2_free_substring</a></td> + <td> Free extracted substring</td></tr> + +<tr><td><a href="pcre2_free_substring_list.html">pcre2_free_substring_list</a></td> + <td> Free list of extracted substrings</td></tr> + +<tr><td><a href="pcre2_fullinfo.html">pcre2_fullinfo</a></td> + <td> Extract information about a pattern</td></tr> + +<tr><td><a href="pcre2_get_named_substring.html">pcre2_get_named_substring</a></td> + <td> Extract named substring into new memory</td></tr> + +<tr><td><a href="pcre2_get_stringnumber.html">pcre2_get_stringnumber</a></td> + <td> Convert captured string name to number</td></tr> + +<tr><td><a href="pcre2_get_stringtable_entries.html">pcre2_get_stringtable_entries</a></td> + <td> Find table entries for given string 
name</td></tr> + +<tr><td><a href="pcre2_get_substring.html">pcre2_get_substring</a></td> + <td> Extract numbered substring into new memory</td></tr> + +<tr><td><a href="pcre2_get_substring_list.html">pcre2_get_substring_list</a></td> + <td> Extract all substrings into new memory</td></tr> + +<tr><td><a href="pcre2_jit_exec.html">pcre2_jit_exec</a></td> + <td> Fast path interface to JIT matching</td></tr> + +<tr><td><a href="pcre2_jit_stack_alloc.html">pcre2_jit_stack_alloc</a></td> + <td> Create a stack for JIT matching</td></tr> + +<tr><td><a href="pcre2_jit_stack_free.html">pcre2_jit_stack_free</a></td> + <td> Free a JIT matching stack</td></tr> + +<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td> + <td> Build character tables in current locale</td></tr> + +<tr><td><a href="pcre2_pattern_to_host_byte_order.html">pcre2_pattern_to_host_byte_order</a></td> + <td> Convert compiled pattern to host byte order if necessary</td></tr> + +<tr><td><a href="pcre2_refcount.html">pcre2_refcount</a></td> + <td> Maintain reference count in compiled pattern</td></tr> + +<tr><td><a href="pcre2_study.html">pcre2_study</a></td> + <td> Study a compiled pattern</td></tr> + +<tr><td><a href="pcre2_utf16_to_host_byte_order.html">pcre2_utf16_to_host_byte_order</a></td> + <td> Convert UTF-16 string to host byte order if necessary</td></tr> + +<tr><td><a href="pcre2_utf32_to_host_byte_order.html">pcre2_utf32_to_host_byte_order</a></td> + <td> Convert UTF-32 string to host byte order if necessary</td></tr> + +<tr><td><a href="pcre2_version.html">pcre2_version</a></td> + <td> Return PCRE2 version and release date</td></tr> +</table> + +</html> + diff --git a/doc/pcre2.txt b/doc/pcre2.txt new file mode 100644 index 0000000..52b7406 --- /dev/null +++ b/doc/pcre2.txt @@ -0,0 +1,2903 @@ +----------------------------------------------------------------------------- +This file contains a concatenation of the PCRE2 man pages, converted to plain +text format for ease of searching 
with a text editor, or for use on systems +that do not have a man page processor. The small individual files that give +synopses of each function in the library have not been included. Neither has +the pcre2demo program. There are separate text files for the pcre2grep and +pcre2test commands. +----------------------------------------------------------------------------- + + +PCRE2API(3) Library Functions Manual PCRE2API(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + + #include <pcre2.h> + + PCRE2 is a new API for PCRE. This document contains a description of + all its functions. See the pcre2 document for an overview of all the + PCRE2 documentation. + + +PCRE2 NATIVE API BASIC FUNCTIONS + + pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, + pcre2_compile_context *ccontext); + + pcre2_code_free(pcre2_code *code); + + pcre2_match_data_create(uint32_t ovecsize, + pcre2_general_context *gcontext); + + pcre2_match_data_create_from_pattern(pcre2_code *code, + pcre2_general_context *gcontext); + + int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext); + + int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, + int *workspace, PCRE2_SIZE wscount); + + void pcre2_match_data_free(pcre2_match_data *match_data); + + +PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS + + PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *match_data); + + PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data); + + uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data); + + PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data); + + PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *match_data); + + PCRE2_SIZE 
pcre2_get_startchar(pcre2_match_data *match_data); + + +PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS + + pcre2_general_context *pcre2_general_context_create( + void *(*private_malloc)(PCRE2_SIZE, void *), + void (*private_free)(void *, void *), void *memory_data); + + pcre2_general_context *pcre2_general_context_copy( + pcre2_general_context *gcontext); + + void pcre2_general_context_free(pcre2_general_context *gcontext); + + +PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS + + pcre2_compile_context *pcre2_compile_context_create( + pcre2_general_context *gcontext); + + pcre2_compile_context *pcre2_compile_context_copy( + pcre2_compile_context *ccontext); + + void pcre2_compile_context_free(pcre2_compile_context *ccontext); + + int pcre2_set_bsr_compile(pcre2_compile_context *ccontext, + uint32_t value); + + int pcre2_set_character_tables(pcre2_compile_context *ccontext, + const unsigned char *tables); + + int pcre2_set_newline_compile(pcre2_compile_context *ccontext, + uint32_t value); + + int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, + uint32_t value); + + int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, + int (*guard_function)(uint32_t)); + + +PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS + + pcre2_match_context *pcre2_match_context_create( + pcre2_general_context *gcontext); + + pcre2_match_context *pcre2_match_context_copy( + pcre2_match_context *mcontext); + + void pcre2_match_context_free(pcre2_match_context *mcontext); + + int pcre2_set_bsr_match(pcre2_match_context *mcontext, + uint32_t value); + + int pcre2_set_callout(pcre2_match_context *mcontext, + int (*callout_function)(pcre2_callout_block *), + void *callout_data); + + int pcre2_set_match_limit(pcre2_match_context *mcontext, + uint32_t value); + + int pcre2_set_newline_match(pcre2_match_context *mcontext, + uint32_t value); + + int pcre2_set_recursion_limit(pcre2_match_context *mcontext, + uint32_t value); + + int pcre2_set_recursion_memory_management( + 
pcre2_match_context *mcontext, + void *(*private_malloc)(PCRE2_SIZE, void *), + void (*private_free)(void *, void *), void *memory_data); + + +PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS + + int pcre2_substring_copy_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); + + int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, + unsigned int number, PCRE2_UCHAR *buffer, + PCRE2_SIZE *bufflen); + + void pcre2_substring_free(PCRE2_UCHAR *buffer); + + int pcre2_substring_get_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); + + int pcre2_substring_get_bynumber(pcre2_match_data *match_data, + unsigned int number, PCRE2_UCHAR **bufferptr, + PCRE2_SIZE *bufflen); + + int pcre2_substring_length_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_SIZE *length); + + int pcre2_substring_length_bynumber(pcre2_match_data *match_data, + unsigned int number, PCRE2_SIZE *length); + + int pcre2_substring_nametable_scan(const pcre2_code *code, + PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); + + int pcre2_substring_number_from_name(const pcre2_code *code, + PCRE2_SPTR name); + + void pcre2_substring_list_free(PCRE2_SPTR *list); + + int pcre2_substring_list_get(pcre2_match_data *match_data, + PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); + + +PCRE2 NATIVE API JIT FUNCTIONS + + int pcre2_jit_compile(pcre2_code *code, uint32_t options); + + int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, pcre2_jit_stack *jit_stack); + + void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); + + pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *gcontext, + PCRE2_SIZE startsize, PCRE2_SIZE maxsize); + + void pcre2_jit_stack_assign(const pcre2_code *code, + pcre2_jit_callback callback_function, void 
*callback_data); + + void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack); + + +PCRE2 NATIVE API AUXILIARY FUNCTIONS + + int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, + PCRE2_SIZE bufflen); + + const unsigned char *pcre2_maketables(pcre2_general_context *gcontext); + + int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where); + + int pcre2_config(uint32_t what, void *where, PCRE2_SIZE length); + + +PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES + + There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit + code units, respectively. However, there is just one header file, + pcre2.h. This contains the function prototypes and other definitions + for all three libraries. One, two, or all three can be installed simul- + taneously. On Unix-like systems the libraries are called libpcre2-8, + libpcre2-16, and libpcre2-32, and they can also co-exist with the orig- + inal PCRE libraries. + + Character strings are passed to and from a PCRE2 library as a sequence + of unsigned integers in code units of the appropriate width. Every + PCRE2 function comes in three different forms, one for each library, + for example: + + pcre2_compile_8() + pcre2_compile_16() + pcre2_compile_32() + + There are also three different sets of data types: + + PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32 + PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32 + + The UCHAR types define unsigned code units of the appropriate widths. + For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR + types are constant pointers to the equivalent UCHAR types, that is, + they are pointers to vectors of unsigned code units. + + Many applications use only one code unit width. For their convenience, + macros are defined whose names are the generic forms such as pcre2_com- + pile() and PCRE2_SPTR. These macros use the value of the macro + PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific func- + tion and macro names. PCRE2_CODE_UNIT_WIDTH is not defined by default. 
+ An application must define it to be 8, 16, or 32 before including + pcre2.h in order to make use of the generic names. + + Applications that use more than one code unit width can be linked with + more than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to + be 0 before including pcre2.h, and then use the real function names. + Any code that is to be included in an environment where the value of + PCRE2_CODE_UNIT_WIDTH is unknown should also use the real function + names. (Unfortunately, it is not possible in C code to save and restore + the value of a macro.) + + If PCRE2_CODE_UNIT_WIDTH is not defined before including pcre2.h, a + compiler error occurs. + + When using multiple libraries in an application, you must take care + when processing any particular pattern to use only functions from a + single library. For example, if you want to run a match using a pat- + tern that was compiled with pcre2_compile_16(), you must do so with + pcre2_match_16(), not pcre2_match_8(). + + In the function summaries above, and in the rest of this document and + other PCRE2 documents, functions and data types are described using + their generic names, without the 8, 16, or 32 suffix. + + +PCRE2 API OVERVIEW + + PCRE2 has its own native API, which is described in this document. + There are also some wrapper functions for the 8-bit library that corre- + spond to the POSIX regular expression API, but they do not give access + to all the functionality. They are described in the pcre2posix documen- + tation. Both these APIs define a set of C function calls. + + The native API C data types, function prototypes, option values, and + error codes are defined in the header file pcre2.h, which contains def- + initions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release + numbers for the library. Applications can use these to include support + for different releases of PCRE2. 
+ + In a Windows environment, if you want to statically link an application + program against a non-dll PCRE2 library, you must define PCRE2_STATIC + before including pcre2.h. + + The functions pcre2_compile() and pcre2_match() are used for compiling + and matching regular expressions in a Perl-compatible manner. A sample + program that demonstrates the simplest way of using them is provided in + the file called pcre2demo.c in the PCRE2 source distribution. A listing + of this program is given in the pcre2demo documentation, and the + pcre2sample documentation describes how to compile and run it. + + Just-in-time compiler support is an optional feature of PCRE2 that can + be built in appropriate hardware environments. It greatly speeds up the + matching performance of many patterns. Programs can request that it be + used if available, by calling pcre2_jit_compile() after a pattern has + been successfully compiled by pcre2_compile(). This does nothing if JIT + support is not available. + + More complicated programs might need to make use of the specialist + functions pcre2_jit_stack_alloc(), pcre2_jit_stack_free(), and + pcre2_jit_stack_assign() in order to control the JIT code's memory + usage. + + JIT matching is automatically used by pcre2_match() if it is available. + There is also a direct interface for JIT matching, which gives improved + performance. The JIT-specific functions are discussed in the pcre2jit + documentation. + + A second matching function, pcre2_dfa_match(), which is not Perl-compat- + ible, is also provided. This uses a different algorithm for the match- + ing. The alternative algorithm finds all possible matches (at a given + point in the subject), and scans the subject just once (unless there + are lookbehind assertions). However, this algorithm does not return + captured substrings. A description of the two matching algorithms and + their advantages and disadvantages is given in the pcre2matching docu- + mentation. 
There is no JIT support for pcre2_dfa_match(). + + In addition to the main compiling and matching functions, there are + convenience functions for extracting captured substrings from a subject + string that is matched by pcre2_match(). They are: + + pcre2_substring_copy_byname() + pcre2_substring_copy_bynumber() + pcre2_substring_get_byname() + pcre2_substring_get_bynumber() + pcre2_substring_list_get() + pcre2_substring_length_byname() + pcre2_substring_length_bynumber() + pcre2_substring_nametable_scan() + pcre2_substring_number_from_name() + + pcre2_substring_free() and pcre2_substring_list_free() are also pro- + vided, to free the memory used for extracted strings. + + There are functions for finding out information about a compiled pat- + tern (pcre2_pattern_info()) and about the configuration with which + PCRE2 was built (pcre2_config()). + + +NEWLINES + + PCRE2 supports five different conventions for indicating line breaks in + strings: a single CR (carriage return) character, a single LF (line- + feed) character, the two-character sequence CRLF, any of the three pre- + ceding, or any Unicode newline sequence. The Unicode newline sequences + are the three just mentioned, plus the single characters VT (vertical + tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line + separator, U+2028), and PS (paragraph separator, U+2029). + + Each of the first three conventions is used by at least one operating + system as its standard newline sequence. When PCRE2 is built, a default + can be specified. The default default is LF, which is the Unix stan- + dard. When PCRE2 is run, the default can be overridden, either when a + pattern is compiled, or when it is matched. + + The newline convention can be changed when calling pcre2_compile(), or + it can be specified by special text at the start of the pattern itself; + this overrides any other settings. See the pcre2pattern page for + details of the special character sequences. 
+ + In the PCRE2 documentation the word "newline" is used to mean "the + character or pair of characters that indicate a line break". The choice + of newline convention affects the handling of the dot, circumflex, and + dollar metacharacters, the handling of #-comments in /x mode, and, when + CRLF is a recognized line ending sequence, the match position advance- + ment for a non-anchored pattern. There is more detail about this in the + section on pcre2_match() options below. + + The choice of newline convention does not affect the interpretation of + the \n or \r escape sequences, nor does it affect what \R matches, + which has its own separate control. + + +MULTITHREADING + + In a multithreaded application it is important to keep thread-specific + data separate from data that can be shared between threads. The PCRE2 + library code itself is thread-safe: it contains no static or global + variables. The API is designed to be fairly simple for non-threaded + applications while at the same time ensuring that multithreaded appli- + cations can use it. + + There are several different blocks of data that are used to pass infor- + mation between the application and the PCRE libraries. + + (1) A pointer to the compiled form of a pattern is returned to the user + when pcre2_compile() is successful. The data in the compiled pattern is + fixed, and does not change when the pattern is matched. Therefore, it + is thread-safe, that is, the same compiled pattern can be used by more + than one thread simultaneously. An application can compile all its pat- + terns at the start, before forking off multiple threads that use them. + However, if the just-in-time optimization feature is being used, it + needs separate memory stack areas for each thread. See the pcre2jit + documentation for more details. + + (2) The next section below introduces the idea of "contexts" in which + PCRE2 functions are called. 
A context is nothing more than a collection + of parameters that control the way PCRE2 operates. Grouping a number of + parameters together in a context is a convenient way of passing them to + a PCRE2 function without using lots of arguments. The parameters that + are stored in contexts are in some sense "advanced features" of the + API. Many straightforward applications will not need to use contexts. + + In a multithreaded application, if the parameters in a context are val- + ues that are never changed, the same context can be used by all the + threads. However, if any thread needs to change any value in a context, + it must make its own thread-specific copy. + + (3) The matching functions need a block of memory for working space and + for storing the results of a match. This includes details of what was + matched, as well as additional information such as the name of a + (*MARK) setting. Each thread must provide its own version of this mem- + ory. + + +PCRE2 CONTEXTS + + Some PCRE2 functions have a lot of parameters, many of which are used + only by specialist applications, for example, those that use custom + memory management or non-standard character tables. To keep function + argument lists at a reasonable size, and at the same time to keep the + API extensible, "uncommon" parameters are passed to certain functions + in a context instead of directly. A context is just a block of memory + that holds the parameter values. Applications that do not need to + adjust any of the context parameters can pass NULL when a context + pointer is required. + + There are three different types of context: a general context that is + relevant for several PCRE2 operations, a compile-time context, and a + match-time context. + + The general context + + At present, this context just contains pointers to (and data for) + external memory management functions that are called from several + places in the PCRE2 library. 
The context is named `general' rather than + specifically `memory' because in future other fields may be added. If + you do not want to supply your own custom memory management functions, + you do not need to bother with a general context. A general context is + created by: + + pcre2_general_context *pcre2_general_context_create( + void *(*private_malloc)(PCRE2_SIZE, void *), + void (*private_free)(void *, void *), void *memory_data); + + The two function pointers specify custom memory management functions, + whose prototypes are: + + void *private_malloc(PCRE2_SIZE, void *); + void private_free(void *, void *); + + Whenever code in PCRE2 calls these functions, the final argument is the + value of memory_data. Either of the first two arguments of the creation + function may be NULL, in which case the system memory management func- + tions malloc() and free() are used. (This is not currently useful, as + there are no other fields in a general context, but in future there + might be.) The private_malloc() function is used (if supplied) to + obtain memory for storing the context, and all three values are saved + as part of the context. + + Whenever PCRE2 creates a data block of any kind, the block contains a + pointer to the free() function that matches the malloc() function that + was used. When the time comes to free the block, this function is + called. 
+ + A general context can be copied by calling: + + pcre2_general_context *pcre2_general_context_copy( + pcre2_general_context *gcontext); + + The memory used for a general context should be freed by calling: + + void pcre2_general_context_free(pcre2_general_context *gcontext); + + + The compile context + + A compile context is required if you want to change the default values + of any of the following compile-time parameters: + + What \R matches (Unicode newlines or CR, LF, CRLF only); + PCRE2's character tables; + The newline character sequence; + The compile time nested parentheses limit; + An external function for stack checking. + + A compile context is also required if you are using custom memory man- + agement. If none of these apply, just pass NULL as the context argu- + ment of pcre2_compile(). + + A compile context is created, copied, and freed by the following func- + tions: + + pcre2_compile_context *pcre2_compile_context_create( + pcre2_general_context *gcontext); + + pcre2_compile_context *pcre2_compile_context_copy( + pcre2_compile_context *ccontext); + + void pcre2_compile_context_free(pcre2_compile_context *ccontext); + + A compile context is created with default values for its parameters. + These can be changed by calling the following functions, which return 0 + on success, or PCRE2_ERROR_BADDATA if invalid data is detected. + + int pcre2_set_bsr_compile(pcre2_compile_context *ccontext, + uint32_t value); + + The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only + CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any + Unicode line ending sequence. The value of this parameter does not + affect what is compiled; it is just saved with the compiled pattern. + The value is used by the JIT compiler and by the two interpreted match- + ing functions, pcre2_match() and pcre2_dfa_match(). You can change the + value when calling these functions, but doing so disables the use of + JIT. 
+ + int pcre2_set_character_tables(pcre2_compile_context *ccontext, + const unsigned char *tables); + + The value must be the result of a call to pcre2_maketables(), whose + only argument is a general context. This function builds a set of char- + acter tables in the current locale. + + int pcre2_set_newline_compile(pcre2_compile_context *ccontext, + uint32_t value); + + This specifies which characters or character sequences are to be recog- + nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage + return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the + two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any + of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence). + + When a pattern is compiled with the PCRE2_EXTENDED option, the value of + this parameter affects the recognition of white space and the end of + internal comments starting with #. The value is saved with the compiled + pattern for subsequent use by the JIT compiler and by the two inter- + preted matching functions, pcre2_match() and pcre2_dfa_match(). You can + change the value when calling these functions, but doing so disables + the use of JIT. + + int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, + uint32_t value); + + This parameter adjusts the limit, set when PCRE2 is built (default 250), + on the depth of parenthesis nesting in a pattern. This limit stops + rogue patterns using up too much system stack when being compiled. + + int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, + int (*guard_function)(uint32_t)); + + There is at least one application that runs PCRE2 in threads with very + limited system stack, where running out of stack is to be avoided at + all costs. The parenthesis limit above cannot take account of how much + stack is actually available. For a finer control, you can supply a + function that is called whenever pcre2_compile() starts to compile a + parenthesized part of a pattern. 
The argument to the function gives the + current depth of nesting. The function should return zero if all is + well, or non-zero to force an error. + + The match context + + A match context is required if you want to change the default values of + any of the following match-time parameters: + + What \R matches (Unicode newlines or CR, LF, CRLF only); + A callout function; + The limit for calling match(); + The limit for calling match() recursively; + The newline character sequence; + + A match context is also required if you are using custom memory manage- + ment. If none of these apply, just pass NULL as the context argument + of pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match(). Changing + the newline value or what \R matches at match time disables the use of + JIT via pcre2_match(). + + A match context is created, copied, and freed by the following func- + tions: + + pcre2_match_context *pcre2_match_context_create( + pcre2_general_context *gcontext); + + pcre2_match_context *pcre2_match_context_copy( + pcre2_match_context *mcontext); + + void pcre2_match_context_free(pcre2_match_context *mcontext); + + A match context is created with default values for its parameters. + These can be changed by calling the following functions, which return 0 + on success, or PCRE2_ERROR_BADDATA if invalid data is detected. + + int pcre2_set_bsr_match(pcre2_match_context *mcontext, + uint32_t value); + + The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only + CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any + Unicode line ending sequence. If you want to make use of JIT matching, + you should not use this function, but instead set the value in a com- + pile context. + + int pcre2_set_callout(pcre2_match_context *mcontext, + int (*callout_function)(pcre2_callout_block *), + void *callout_data); + + This sets up a "callout" function, which PCRE2 will call at specified + points during a matching operation. 
Details are given in the pcre2call- + out documentation. + + int pcre2_set_match_limit(pcre2_match_context *mcontext, + uint32_t value); + + The match_limit parameter provides a means of preventing PCRE2 from + using up too many resources when processing patterns that are not going + to match, but which have a very large number of possibilities in their + search trees. The classic example is a pattern that uses nested unlim- + ited repeats. + + Internally, pcre2_match() uses a function called match(), which it + calls repeatedly (sometimes recursively). The limit set by match_limit + is imposed on the number of times this function is called during a + match, which has the effect of limiting the amount of backtracking that + can take place. For patterns that are not anchored, the count restarts + from zero for each position in the subject string. This limit is not + relevant to pcre2_dfa_match(), which ignores it. + + When pcre2_match() is called with a pattern that was successfully stud- + ied with pcre2_jit_compile(), the way that the matching is executed is + entirely different. However, there is still the possibility of runaway + matching that goes on for a very long time, and so the match_limit + value is also used in this case (but in a different way) to limit how + long the matching can continue. + + The default value for the limit can be set when PCRE2 is built; the + default default is 10 million, which handles all but the most extreme + cases. If the limit is exceeded, pcre2_match() returns + PCRE2_ERROR_MATCHLIMIT. A value for the match limit may also be sup- + plied by an item at the start of a pattern of the form + + (*LIMIT_MATCH=ddd) + + where ddd is a decimal number. However, such a setting is ignored + unless ddd is less than the limit set by the caller of pcre2_match() + or, if no such limit is set, less than the default. 
+ + int pcre2_set_recursion_limit(pcre2_match_context *mcontext, + uint32_t value); + + The recursion_limit parameter is similar to match_limit, but instead of + limiting the total number of times that match() is called, it limits + the depth of recursion. The recursion depth is a smaller number than + the total number of calls, because not all calls to match() are recur- + sive. This limit is of use only if it is set smaller than match_limit. + + Limiting the recursion depth limits the amount of system stack that can + be used, or, when PCRE2 has been compiled to use memory on the heap + instead of the stack, the amount of heap memory that can be used. This + limit is not relevant, and is ignored, when matching is done using JIT + compiled code or by the pcre2_dfa_match() function. + + The default value for recursion_limit can be set when PCRE2 is built; + the default default is the same value as the default for match_limit. + If the limit is exceeded, pcre2_match() returns PCRE2_ERROR_RECURSION- + LIMIT. A value for the recursion limit may also be supplied by an item + at the start of a pattern of the form + + (*LIMIT_RECURSION=ddd) + + where ddd is a decimal number. However, such a setting is ignored + unless ddd is less than the limit set by the caller of pcre2_match() + or, if no such limit is set, less than the default. + + int pcre2_set_newline_match(pcre2_match_context *mcontext, + uint32_t value); + + This specifies which characters or character sequences are to be recog- + nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage + return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the + two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any + of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence). If + you want to make use of JIT matching, you should not use this function, + but instead set the value in a compile context. 
+ + int pcre2_set_recursion_memory_management( + pcre2_match_context *mcontext, + void *(*private_malloc)(PCRE2_SIZE, void *), + void (*private_free)(void *, void *), void *memory_data); + + This function sets up two additional custom memory management functions + for use by pcre2_match() when PCRE2 is compiled to use the heap for + remembering backtracking data, instead of recursive function calls that + use the system stack. There is a discussion about PCRE2's stack usage + in the pcre2stack documentation. See the pcre2build documentation for + details of how to build PCRE2. Using the heap for recursion is a non- + standard way of building PCRE2, for use in environments that have lim- + ited stacks. Because of the greater use of memory management, + pcre2_match() runs more slowly. Functions that are different to the + general custom memory functions are provided so that special-purpose + external code can be used for this case, because the memory blocks are + all the same size. The blocks are retained by pcre2_match() until it is + about to exit so that they can be re-used when possible during the + match. In the absence of these functions, the normal custom memory man- + agement functions are used, if supplied, otherwise the system func- + tions. + + +CHECKING BUILD-TIME OPTIONS + + int pcre2_config(uint32_t what, void *where, PCRE2_SIZE length); + + The function pcre2_config() makes it possible for a PCRE2 client to + discover which optional features have been compiled into the PCRE2 + library. The pcre2build documentation has more details about these + optional features. + + The first argument for pcre2_config() specifies which information is + required. The second argument is a pointer to memory into which the + information is placed, with the final argument giving the length of + this memory in bytes. For calls that return numerical values, where + should point to appropriately aligned memory, with length set to at + least the "sizeof" the data type. 
+ + The returned value from pcre2_config() is zero on success, or the nega- + tive error code PCRE2_ERROR_BADOPTION if the value in the first argu- + ment is not recognized. The following information is available: + + PCRE2_CONFIG_BSR + + The output is an integer whose value indicates what character sequences + the \R escape sequence matches by default. A value of 0 means that \R + matches any Unicode line ending sequence; a value of 1 means that \R + matches only CR, LF, or CRLF. The default can be overridden when a pat- + tern is compiled or matched. + + PCRE2_CONFIG_JIT + + The output is an integer that is set to one if support for just-in-time + compiling is available; otherwise it is set to zero. + + PCRE2_CONFIG_JITTARGET + + FIXME: this needs sorting out once JIT is implemented. If JIT support + is available, the string contains the name of the architecture for + which the JIT compiler is configured, for example "x86 32bit (little + endian + unaligned)". If JIT support is not available, FIXME. + + PCRE2_CONFIG_LINKSIZE + + The output is an integer that contains the number of bytes used for + internal linkage in compiled regular expressions. When PCRE2 is config- + ured, the value can be set to 2, 3, or 4, with the default being 2. + This is the value that is returned by pcre2_config(). However, when the + 16-bit library is compiled, a value of 3 is rounded up to 4, and when + the 32-bit library is compiled, internal linkages always use 4 bytes, + so the configured value is not relevant. + + The default value of 2 for the 8-bit and 16-bit libraries is sufficient + for all but the most massive patterns, since it allows the size of the + compiled pattern to be up to 64K code units. Larger values allow larger + regular expressions to be compiled by those two libraries, but at the + expense of slower matching. 
+ + PCRE2_CONFIG_MATCHLIMIT + + The output is an unsigned long integer that gives the default limit for + the number of internal matching function calls in a pcre2_match() exe- + cution. Further details are given with pcre2_match() below. + + PCRE2_CONFIG_NEWLINE + + The output is an integer whose value specifies the default character + sequence that is recognized as meaning "newline". The values are: + + 1 Carriage return (CR) + 2 Linefeed (LF) + 3 Carriage return, linefeed (CRLF) + 4 Any Unicode line ending + 5 Any of CR, LF, or CRLF + + The default should normally correspond to the standard sequence for + your operating system. + + PCRE2_CONFIG_PARENSLIMIT + + The output is an unsigned long integer that gives the maximum depth of + nesting of parentheses (of any kind) in a pattern. This limit is + imposed to cap the amount of system stack used when a pattern is com- + piled. It is specified when PCRE2 is built; the default is 250. This + limit does not take into account the stack that may already be used by + the calling application. For finer control over compilation stack + usage, see pcre2_set_compile_recursion_guard(). + + PCRE2_CONFIG_RECURSIONLIMIT + + The output is an unsigned long integer that gives the default limit for + the depth of recursion when calling the internal matching function in a + pcre2_match() execution. Further details are given with pcre2_match() + below. + + PCRE2_CONFIG_STACKRECURSE + + The output is an integer that is set to one if internal recursion when + running pcre2_match() is implemented by recursive function calls that + use the system stack to remember their state. This is the usual way + that PCRE2 is compiled. The output is zero if PCRE2 was compiled to use + blocks of data on the heap instead of recursive function calls. + + PCRE2_CONFIG_UNICODE_VERSION + + The where argument should point to a buffer that is at least 24 code + units long. 
If PCRE2 has been compiled without Unicode support, this is + filled with the text "Unicode not supported". Otherwise, the Unicode + version string (for example, "7.0.0") is returned. The string is zero- + terminated. + + PCRE2_CONFIG_UNICODE + + The output is an integer that is set to one if Unicode support is + available; otherwise it is set to zero. Unicode support implies UTF + support. + + PCRE2_CONFIG_VERSION + + The where argument should point to a buffer that is at least 12 code + units long. It is filled with the PCRE2 version string, zero-termi- + nated. + + +COMPILING A PATTERN + + pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, + pcre2_compile_context *ccontext); + + void pcre2_code_free(pcre2_code *code); + + This function compiles a pattern, defined by a pointer to a string of + code units and a length, into an internal form. If the pattern is zero- + terminated, the length should be specified as PCRE2_ZERO_TERMINATED. + The function returns a pointer to a block of memory that contains the + compiled pattern and related data. The caller must free the memory by + calling pcre2_code_free() when it is no longer needed. + + If the compile context argument ccontext is NULL, the memory is + obtained by calling malloc(). Otherwise, it is obtained from the same + memory function that was used for the compile context. + + The options argument contains various bit settings that affect the com- + pilation. It should be zero if no options are required. The available + options are described below. Some of them (in particular, those that + are compatible with Perl, but some others as well) can also be set and + unset from within the pattern (see the detailed description in the + pcre2pattern documentation). + + For those options that can be different in different parts of the pat- + tern, the contents of the options argument specify their settings at + the start of compilation. 
The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and + PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as + well as at compile time. + + Other, less frequently required compile-time parameters (for example, + the newline setting) can be provided in a compile context (as described + above). + + If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme- + diately. Otherwise, if compilation of a pattern fails, pcre2_compile() + returns NULL, having set these variables to an error code and an offset + (number of code units) within the pattern, respectively. The + pcre2_get_error_message() function provides a textual message for each + error code. Compilation errors are positive numbers, but UTF formatting + errors are negative numbers. For an invalid UTF-8 or UTF-16 string, the + offset is that of the first code unit of the failing character. + + Some errors are not detected until the whole pattern has been scanned; + in these cases, the offset passed back is the length of the pattern. + Note that the offset is in code units, not characters, even in a UTF + mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char- + acter. + + This code fragment shows a typical straightforward call to pcre2_com- + pile(): + + pcre2_code *re; + PCRE2_SIZE erroffset; + int errorcode; + re = pcre2_compile( + "^A.*Z", /* the pattern */ + PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */ + 0, /* default options */ + &errorcode, /* for error code */ + &erroffset, /* for error offset */ + NULL); /* no compile context */ + + The following names for option bits are defined in the pcre2.h header + file: + + PCRE2_ANCHORED + + If this bit is set, the pattern is forced to be "anchored", that is, it + is constrained to match only at the first matching point in the string + that is being searched (the "subject string"). This effect can also be + achieved by appropriate constructs in the pattern itself, which is the + only way to do it in Perl. 
+ + PCRE2_ALLOW_EMPTY_CLASS + + By default, for compatibility with Perl, a closing square bracket that + immediately follows an opening one is treated as a data character for + the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the + class, which therefore contains no characters and so can never match. + + PCRE2_ALT_BSUX + + This option requests alternative handling of three escape sequences, + which makes PCRE2's behaviour more like ECMAscript (aka JavaScript). + When it is set: + + (1) \U matches an upper case "U" character; by default \U causes a com- + pile time error (Perl uses \U to upper case subsequent characters). + + (2) \u matches a lower case "u" character unless it is followed by four + hexadecimal digits, in which case the hexadecimal number defines the + code point to match. By default, \u causes a compile time error (Perl + uses it to upper case the following character). + + (3) \x matches a lower case "x" character unless it is followed by two + hexadecimal digits, in which case the hexadecimal number defines the + code point to match. By default, as in Perl, a hexadecimal number is + always expected after \x, but it may have zero, one, or two digits (so, + for example, \xz matches a binary zero character followed by z). + + PCRE2_AUTO_CALLOUT + + If this bit is set, pcre2_compile() automatically inserts callout + items, all with number 255, before each pattern item. For discussion of + the callout facility, see the pcre2callout documentation. + + PCRE2_CASELESS + + If this bit is set, letters in the pattern match both upper and lower + case letters in the subject. It is equivalent to Perl's /i option, and + it can be changed within a pattern by a (?i) option setting. + + PCRE2_DOLLAR_ENDONLY + + If this bit is set, a dollar metacharacter in the pattern matches only + at the end of the subject string. Without this option, a dollar also + matches immediately before a newline at the end of the string (but not + before any other newlines). 
The PCRE2_DOLLAR_ENDONLY option is ignored + if PCRE2_MULTILINE is set. There is no equivalent to this option in + Perl, and no way to set it within a pattern. + + PCRE2_DOTALL + + If this bit is set, a dot metacharacter in the pattern matches any + character, including one that indicates a newline. However, it only + ever matches one character, even if newlines are coded as CRLF. Without + this option, a dot does not match when the current position in the sub- + ject is at a newline. This option is equivalent to Perl's /s option, + and it can be changed within a pattern by a (?s) option setting. A neg- + ative class such as [^a] always matches newline characters, independent + of the setting of this option. + + PCRE2_DUPNAMES + + If this bit is set, names used to identify capturing subpatterns need + not be unique. This can be helpful for certain types of pattern when it + is known that only one instance of the named subpattern can ever be + matched. There are more details of named subpatterns below; see also + the pcre2pattern documentation. + + PCRE2_EXTENDED + + If this bit is set, most white space characters in the pattern are + totally ignored except when escaped or inside a character class. How- + ever, white space is not allowed within sequences such as (?> that + introduce various parenthesized subpatterns, nor within numerical quan- + tifiers such as {1,3}. Ignorable white space is permitted between an + item and a following quantifier and between a quantifier and a follow- + ing + that indicates possessiveness. + + PCRE2_EXTENDED also causes characters between an unescaped # outside a + character class and the next newline, inclusive, to be ignored, which + makes it possible to include comments inside complicated patterns. Note + that the end of this type of comment is a literal newline sequence in + the pattern; escape sequences that happen to represent a newline do not + count. 
PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be + changed within a pattern by a (?x) option setting. + + Which characters are interpreted as newlines can be specified by a set- + ting in the compile context that is passed to pcre2_compile() or by a + special sequence at the start of the pattern, as described in the sec- + tion entitled "Newline conventions" in the pcre2pattern documentation. + A default is defined when PCRE2 is built. + + PCRE2_FIRSTLINE + + If this option is set, an unanchored pattern is required to match + before or at the first newline in the subject string, though the + matched text may continue over the newline. + + PCRE2_MATCH_UNSET_BACKREF + + If this option is set, a back reference to an unset subpattern group + matches an empty string (by default this causes the current matching + alternative to fail). A pattern such as (\1)(a) succeeds when this + option is set (assuming it can find an "a" in the subject), whereas it + fails by default, for Perl compatibility. Setting this option makes + PCRE2 behave more like ECMAscript (aka JavaScript). + + PCRE2_MULTILINE + + By default, for the purposes of matching "start of line" and "end of + line", PCRE2 treats the subject string as consisting of a single line + of characters, even if it actually contains newlines. The "start of + line" metacharacter (^) matches only at the start of the string, and + the "end of line" metacharacter ($) matches only at the end of the + string, or before a terminating newline (except when PCRE2_DOL- + LAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set, + the "any character" metacharacter (.) does not match at a newline. This + behaviour (for ^, $, and dot) is the same as Perl. + + When PCRE2_MULTILINE is set, the "start of line" and "end of line" + constructs match immediately following or immediately before internal + newlines in the subject string, respectively, as well as at the very + start and end. 
This is equivalent to Perl's /m option, and it can be + changed within a pattern by a (?m) option setting. If there are no new- + lines in a subject string, or no occurrences of ^ or $ in a pattern, + setting PCRE2_MULTILINE has no effect. + + PCRE2_NEVER_UCP + + This option locks out the use of Unicode properties for handling \B, + \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as + described for the PCRE2_UCP option below. In particular, it prevents + the creator of the pattern from enabling this facility by starting the + pattern with (*UCP). This may be useful in applications that process + patterns from external sources. The option combination PCRE2_UCP and + PCRE2_NEVER_UCP causes an error. + + PCRE2_NEVER_UTF + + This option locks out interpretation of the pattern as UTF-8, UTF-16, + or UTF-32, depending on which library is in use. In particular, it pre- + vents the creator of the pattern from switching to UTF interpretation + by starting the pattern with (*UTF). This may be useful in applications + that process patterns from external sources. The combination of + PCRE2_UTF and PCRE2_NEVER_UTF causes an error. + + PCRE2_NO_AUTO_CAPTURE + + If this option is set, it disables the use of numbered capturing paren- + theses in the pattern. Any opening parenthesis that is not followed by + ? behaves as if it were followed by ?: but named parentheses can still + be used for capturing (and they acquire numbers in the usual way). + There is no equivalent of this option in Perl. + + PCRE2_NO_AUTO_POSSESS + + If this option is set, it disables "auto-possessification", which is an + optimization that, for example, turns a+b into a++b in order to avoid + backtracks into a+ that can never be successful. However, if callouts + are in use, auto-possessification means that some callouts are never + taken. 
You can set this option if you want the matching functions to do + a full unoptimized search and run all the callouts, but it is mainly + provided for testing purposes. + + PCRE2_NO_START_OPTIMIZE + + This is an option that acts at matching time; that is, it is really an + option for pcre2_match() or pcre2_dfa_match(). If it is set at compile + time, it is remembered with the compiled pattern and assumed at match- + ing time. This is necessary if you want to use JIT execution, because + the JIT compiler needs to know whether or not this option is set. For + details, see the discussion of PCRE2_NO_START_OPTIMIZE in the section + on pcre2_match() options below. + + PCRE2_NO_UTF_CHECK + + When PCRE2_UTF is set, the validity of the pattern as a UTF string is + automatically checked. There are discussions about the validity of + UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode + document. If an invalid UTF sequence is found, pcre2_compile() returns + a negative error code. + + If you know that your pattern is valid, and you want to skip this check + for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. + When it is set, the effect of passing an invalid UTF string as a pat- + tern is undefined. It may cause your program to crash or loop. Note + that this option can also be passed to pcre2_match() and + pcre2_dfa_match(), to suppress validity checking of the subject string. + + PCRE2_UCP + + This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W, + \w, and some of the POSIX character classes. By default, only ASCII + characters are recognized, but if PCRE2_UCP is set, Unicode properties + are used instead to classify characters. More details are given in the + section on generic character types in the pcre2pattern page. If you set + PCRE2_UCP, matching one of the items it affects takes much longer. The + option is available only if PCRE2 has been compiled with UTF support. 
+ + PCRE2_UNGREEDY + + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting + within the pattern. + + PCRE2_UTF + + This option causes PCRE2 to regard both the pattern and the subject + strings that are subsequently processed as strings of UTF characters + instead of single-code-unit strings. However, it is available only when + PCRE2 is built to include UTF support. If not, the use of this option + provokes an error. Details of how this option changes the behaviour of + PCRE2 are given in the pcre2unicode page. + + +COMPILATION ERROR CODES + + There are over 80 positive error codes that pcre2_compile() may return + if it finds an error in the pattern. There are also some negative error + codes that are used for invalid UTF strings. These are the same as + given by pcre2_match() and pcre2_dfa_match(), and are described in the + pcre2unicode page. The pcre2_get_error_message() function can be called + to obtain a textual error message from any error code. 
+ + +JUST-IN-TIME (JIT) COMPILATION + + int pcre2_jit_compile(pcre2_code *code, uint32_t options); + + int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, pcre2_jit_stack *jit_stack); + + void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); + + pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *gcontext, + PCRE2_SIZE startsize, PCRE2_SIZE maxsize); + + void pcre2_jit_stack_assign(const pcre2_code *code, + pcre2_jit_callback callback_function, void *callback_data); + + void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack); + + These functions provide support for JIT compilation, which, if the + just-in-time compiler is available, further processes a compiled pat- + tern into machine code that executes much faster than the pcre2_match() + interpretive matching function. Full details are given in the pcre2jit + documentation. + + JIT compilation is a heavyweight optimization. It can take some time + for patterns to be analyzed, and for one-off matches and simple pat- + terns the benefit of faster execution might be offset by a much slower + compilation time. Most, but not all patterns can be optimized by the + JIT compiler. + + +LOCALE SUPPORT + + PCRE2 handles caseless matching, and determines whether characters are + letters, digits, or whatever, by reference to a set of tables, indexed + by character code point. When running in UTF-8 mode, or using the + 16-bit or 32-bit libraries, this applies only to characters with code + points less than 256. By default, higher-valued code points never match + escapes such as \w or \d. However, if PCRE2 is built with UTF support, + all characters can be tested with \p and \P, or, alternatively, the + PCRE2_UCP option can be set when a pattern is compiled; this causes \w + and friends to use Unicode property support instead of the built-in + tables. 
+ + The use of locales with Unicode is discouraged. If you are handling + characters with code points greater than 128, you should either use + Unicode support, or use locales, but not try to mix the two. + + PCRE2 contains an internal set of character tables that are used by + default. These are sufficient for many applications. Normally, the + internal tables recognize only ASCII characters. However, when PCRE2 is + built, it is possible to cause the internal tables to be rebuilt in the + default "C" locale of the local system, which may cause them to be dif- + ferent. + + The internal tables can be overridden by tables supplied by the appli- + cation that calls PCRE2. These may be created in a different locale + from the default. As more and more applications change to using Uni- + code, the need for this locale support is expected to die away. + + External tables are built by calling the pcre2_maketables() function, + in the relevant locale. The result can be passed to pcre2_compile() as + often as necessary, by creating a compile context and calling + pcre2_set_character_tables() to set the tables pointer therein. For + example, to build and use tables that are appropriate for the French + locale (where accented characters with values greater than 128 are + treated as letters), the following code could be used: + + setlocale(LC_CTYPE, "fr_FR"); + tables = pcre2_maketables(NULL); + ccontext = pcre2_compile_context_create(NULL); + pcre2_set_character_tables(ccontext, tables); + re = pcre2_compile(..., ccontext); + + The locale name "fr_FR" is used on Linux and other Unix-like systems; + if you are using Windows, the name for the French locale is "french". + It is the caller's responsibility to ensure that the memory containing + the tables remains available for as long as it is needed. 
+ + The pointer that is passed (via the compile context) to pcre2_compile() + is saved with the compiled pattern, and the same tables are used by + pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com- + pilation and matching both happen in the same locale, but different + patterns can be processed in different locales. + + +INFORMATION ABOUT A COMPILED PATTERN + + int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where); + + The pcre2_pattern_info() function returns information about a compiled + pattern. The first argument is a pointer to the compiled pattern. The + second argument specifies which piece of information is required, and + the third argument is a pointer to a variable to receive the data. The + yield of the function is zero for success, or one of the following neg- + ative numbers: + + PCRE2_ERROR_NULL the argument code was NULL + the argument where was NULL + PCRE2_ERROR_BADMAGIC the "magic number" was not found + PCRE2_ERROR_BADOPTION the value of what was invalid + PCRE2_ERROR_UNSET the requested field is not set + + The "magic number" is placed at the start of each compiled pattern as + a simple check against passing an arbitrary memory pointer. Here is a + typical call of pcre2_pattern_info(), to obtain the length of the com- + piled pattern: + + int rc; + size_t length; + rc = pcre2_pattern_info( + re, /* result of pcre2_compile() */ + PCRE2_INFO_SIZE, /* what is required */ + &length); /* where to put the data */ + + The possible values for the second argument are defined in pcre2.h, and + are as follows: + + PCRE2_INFO_ALLOPTIONS + PCRE2_INFO_ARGOPTIONS + + Return a copy of the pattern's options. The third argument should point + to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the + options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP- + TIONS returns the compile options as modified by any top-level option + settings at the start of the pattern itself.
In other words, they are + the options that will be in force when matching starts. For example, if + the pattern /(?im)abc(?-i)d/ is compiled with the PCRE2_EXTENDED + option, the result is PCRE2_CASELESS, PCRE2_MULTILINE, and + PCRE2_EXTENDED. + + A pattern is automatically anchored by PCRE2 if all of its top-level + alternatives begin with one of the following: + + ^ unless PCRE2_MULTILINE is set + \A always + \G always + .* if PCRE2_DOTALL is set and there are no back + references to the subpattern in which .* appears + + For such patterns, the PCRE2_ANCHORED bit is set in the options + returned for PCRE2_INFO_ALLOPTIONS. + + PCRE2_INFO_BACKREFMAX + + Return the number of the highest back reference in the pattern. The + third argument should point to an uint32_t variable. Zero is returned + if there are no back references. + + PCRE2_INFO_BSR + + The output is a uint32_t whose value indicates what character sequences + the \R escape sequence matches by default. A value of 0 means that \R + matches any Unicode line ending sequence; a value of 1 means that \R + matches only CR, LF, or CRLF. The default can be overridden when a pat- + tern is matched. + + PCRE2_INFO_CAPTURECOUNT + + Return the number of capturing subpatterns in the pattern. The third + argument should point to an uint32_t variable. + + PCRE2_INFO_FIRSTCODETYPE + + Return information about the first code unit of any matched string, for + a non-anchored pattern. The third argument should point to an uint32_t + variable. + + If there is a fixed first value, for example, the letter "c" from a + pattern such as (cat|cow|coyote), 1 is returned, and the character + value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. 
If there is no + fixed first value, and if either + + (a) the pattern was compiled with the PCRE2_MULTILINE option, and every + branch starts with "^", or + + (b) every branch of the pattern starts with ".*" and PCRE2_DOTALL is + not set (if it were set, the pattern would be anchored), + + 2 is returned, indicating that the pattern matches only at the start of + a subject string or after any newline within the string. Otherwise 0 is + returned. For anchored patterns, 0 is returned. + + PCRE2_INFO_FIRSTCODEUNIT + + Return the value of the first code unit of any matched string in the + situation where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. + The third argument should point to a uint32_t variable. In the 8-bit + library, the value is always less than 256. In the 16-bit library the + value can be up to 0xffff. In the 32-bit library in UTF-32 mode the + value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 + mode. + + PCRE2_INFO_FIRSTBITMAP + + In the absence of a single first code unit for a non-anchored pattern, + pcre2_compile() may construct a 256-bit table that defines a fixed set + of values for the first code unit in any match. For example, a pattern + that starts with [abc] results in a table with three bits set. When + code unit values greater than 255 are supported, the flag bit for 255 + means "any code unit of value 255 or above". If such a table was con- + structed, a pointer to it is returned. Otherwise NULL is returned. The + third argument should point to a const uint8_t * variable. + + PCRE2_INFO_HASCRORLF + + Return 1 if the pattern contains any explicit matches for CR or LF + characters, otherwise 0. The third argument should point to a uint32_t + variable. An explicit match is either a literal CR or LF character, or + \r or \n. + + PCRE2_INFO_JCHANGED + + Return 1 if the (?J) or (?-J) option setting is used in the pattern, + otherwise 0. The third argument should point to a uint32_t variable.
+ (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respec- + tively. + + PCRE2_INFO_JITSIZE + + If the compiled pattern was successfully processed by pcre2_jit_com- + pile(), return the size of the JIT compiled code, otherwise return + zero. The third argument should point to a size_t variable. + + PCRE2_INFO_LASTCODETYPE + + Returns 1 if there is a rightmost literal code unit that must exist in + any matched string, other than at its start. The third argument should + point to an uint32_t variable. If there is no such value, 0 is + returned. When 1 is returned, the code unit value itself can be + retrieved using PCRE2_INFO_LASTCODEUNIT. + + For anchored patterns, a last literal value is recorded only if it fol- + lows something of variable length. For example, for the pattern + /^a\d+z\d+/ the returned value is 1 (with "z" returned from + PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0. + + PCRE2_INFO_LASTCODEUNIT + + Return the value of the rightmost literal data unit that must exist in + any matched string, other than at its start, if such a value has been + recorded. The third argument should point to an uint32_t variable. If + there is no such value, 0 is returned. + + PCRE2_INFO_MATCHEMPTY + + Return 1 if the pattern can match an empty string, otherwise 0. The + third argument should point to an uint32_t variable. + + PCRE2_INFO_MATCHLIMIT + + If the pattern set a match limit by including an item of the form + (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third + argument should point to an unsigned 32-bit integer. If no such value + has been set, the call to pcre2_pattern_info() returns the error + PCRE2_ERROR_UNSET. + + PCRE2_INFO_MAXLOOKBEHIND + + Return the number of characters (not code units) in the longest lookbe- + hind assertion in the pattern. The third argument should point to an + unsigned 32-bit integer. 
This information is useful when doing multi- + segment matching using the partial matching facilities. Note that the + simple assertions \b and \B require a one-character lookbehind. \A also + registers a one-character lookbehind, though it does not actually + inspect the previous character. This is to ensure that at least one + character from the old segment is retained when a new segment is pro- + cessed. Otherwise, if there are no lookbehinds in the pattern, \A might + match incorrectly at the start of a new segment. + + PCRE2_INFO_MINLENGTH + + If a minimum length for matching subject strings was computed, its + value is returned. Otherwise the returned value is 0. The value is a + number of characters, which in UTF mode may be different from the num- + ber of code units. The third argument should point to an uint32_t + variable. The value is a lower bound to the length of any matching + string. There may not be any strings of that length that do actually + match, but every string that does match is at least that long. + + PCRE2_INFO_NAMECOUNT + PCRE2_INFO_NAMEENTRYSIZE + PCRE2_INFO_NAMETABLE + + PCRE2 supports the use of named as well as numbered capturing parenthe- + ses. The names are just an additional way of identifying the parenthe- + ses, which still acquire numbers. Several convenience functions such as + pcre2_substring_get_byname() are provided for extracting captured sub- + strings by name. It is also possible to extract the data directly, by + first converting the name to a number in order to access the correct + pointers in the output vector (described with pcre2_match() below). To + do the conversion, you need to use the name-to-number map, which is + described by these three values. + + The map consists of a number of fixed-size entries. PCRE2_INFO_NAME- + COUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives + the size of each entry; both of these return a uint32_t value. 
The + entry size depends on the length of the longest name. + PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table. + This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit + library, the first two bytes of each entry are the number of the cap- + turing parenthesis, most significant byte first. In the 16-bit library, + the pointer points to 16-bit data units, the first of which contains + the parenthesis number. In the 32-bit library, the pointer points to + 32-bit data units, the first of which contains the parenthesis number. + The rest of the entry is the corresponding name, zero terminated. + + The names are in alphabetical order. If (?| is used to create multiple + groups with the same number, as described in the section on duplicate + subpattern numbers in the pcre2pattern page, the groups may be given + the same name, but there is only one entry in the table. Different + names for groups of the same number are not permitted. + + Duplicate names for subpatterns with different numbers are permitted, + but only if PCRE2_DUPNAMES is set. They appear in the table in the + order in which they were found in the pattern. In the absence of (?| + this is the order of increasing number; when (?| is used this is not + necessarily the case because later subpatterns may have lower numbers. + + As a simple example of the name/number table, consider the following + pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED + is set, so white space - including newlines - is ignored): + + (?<date> (?<year>(\d\d)?\d\d) - + (?<month>\d\d) - (?<day>\d\d) ) + + There are four named subpatterns, so the table has four entries, and + each entry in the table is eight bytes long. The table is as follows, + with non-printing bytes shown in hexadecimal, and undefined bytes shown + as ??: + + 00 01 d a t e 00 ?? + 00 05 d a y 00 ?? ?? + 00 04 m o n t h 00 + 00 02 y e a r 00 ??
+ + When writing code to extract data from named subpatterns using the + name-to-number map, remember that the length of the entries is likely + to be different for each compiled pattern. + + PCRE2_INFO_NEWLINE + + The output is a uint32_t whose value specifies the default character + sequence that will be recognized as meaning "newline" while matching. + The values are: + + 1 Carriage return (CR) + 2 Linefeed (LF) + 3 Carriage return, linefeed (CRLF) + 4 Any Unicode line ending + 5 Any of CR, LF, or CRLF + + The default can be overridden when a pattern is matched. + + PCRE2_INFO_RECURSIONLIMIT + + If the pattern set a recursion limit by including an item of the form + (*LIMIT_RECURSION=nnnn) at the start, the value is returned. The third + argument should point to an unsigned 32-bit integer. If no such value + has been set, the call to pcre2_pattern_info() returns the error + PCRE2_ERROR_UNSET. + + PCRE2_INFO_SIZE + + Return the size of the compiled pattern in bytes (for all three + libraries). The third argument should point to a size_t variable. This + value does not include the size of the pcre2_code structure that is + returned by pcre2_compile(). The value that is used when pcre2_compile() + is getting memory in which to place the compiled data is the value + returned by this option plus the size of the pcre2_code structure. Pro- + cessing a pattern with the JIT compiler does not alter the value + returned by this option. + + +THE MATCH DATA BLOCK + + pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, + pcre2_general_context *gcontext); + + pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *code, + pcre2_general_context *gcontext); + + void pcre2_match_data_free(pcre2_match_data *match_data); + + Information about successful and unsuccessful matches is placed in a + match data block, which is an opaque structure that is accessed by + function calls.
In particular, the match data block contains a vector + of offsets into the subject string that define the matched part of the + subject and any substrings that were captured. This is known as the ovec- + tor. + + Before calling pcre2_match() or pcre2_dfa_match() you must create a + match data block by calling one of the creation functions above. For + pcre2_match_data_create(), the first argument is the number of pairs of + offsets in the ovector. One pair of offsets is required to identify the + string that matched the whole pattern, with another pair for each cap- + tured substring. For example, a value of 4 creates enough space to + record the matched portion of the subject plus three captured sub- + strings. + + For pcre2_match_data_create_from_pattern(), the first argument is a + pointer to a compiled pattern. In this case the ovector is created to + be exactly the right size to hold all the substrings a pattern might + capture. + + The second argument of both these functions is a pointer to a general + context, which can specify custom memory management for obtaining the + memory for the match data block. If you are not using custom memory + management, pass NULL. + + A match data block can be used many times, with the same or different + compiled patterns. When it is no longer needed, it should be freed by + calling pcre2_match_data_free(). How to extract information from a + match data block after a match operation is described in the sections + on matched strings and other match data below. + + +MATCHING A PATTERN: THE TRADITIONAL FUNCTION + + int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext); + + The function pcre2_match() is called to match a subject string against + a compiled pattern, which is passed in the code argument.
You can call + pcre2_match() with the same code argument as many times as you like, in + order to find multiple matches in the subject string or to match dif- + ferent subject strings with the same pattern. + + This function is the main matching facility of the library, and it + operates in a Perl-like manner. For specialist use there is also an + alternative matching function, which is described below in the section + about the pcre2_dfa_match() function. + + Here is an example of a simple call to pcre2_match(): + + pcre2_match_data *md = pcre2_match_data_create(4, NULL); + int rc = pcre2_match( + re, /* result of pcre2_compile() */ + "some string", /* the subject string */ + 11, /* the length of the subject string */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, /* the match data block */ + NULL); /* a match context; NULL means use defaults */ + + If the subject string is zero-terminated, the length can be given as + PCRE2_ZERO_TERMINATED. A match context must be provided if certain less + common matching parameters are to be changed. For details, see the sec- + tion on the match context above. + + The string to be matched by pcre2_match() + + The subject string is passed to pcre2_match() as a pointer in subject, + a length in length, and a starting offset in startoffset. The length + and offset are in code units, not characters. That is, they are in + bytes for the 8-bit library, 16-bit code units for the 16-bit library, + and 32-bit code units for the 32-bit library, whether or not UTF pro- + cessing is enabled. + + If startoffset is greater than the length of the subject, pcre2_match() + returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the + search for a match starts at the beginning of the subject, and this is + by far the most common case.
In UTF-8 or UTF-16 mode, the starting off- + set must point to the start of a character, or to the end of the sub- + ject (in UTF-32 mode, one code unit equals one character, so all off- + sets are valid). Like the pattern string, the subject may contain + binary zeroes. + + A non-zero starting offset is useful when searching for another match + in the same subject by calling pcre2_match() again after a previous + success. Setting startoffset differs from passing over a shortened + string and setting PCRE2_NOTBOL in the case of a pattern that begins + with any kind of lookbehind. For example, consider the pattern + + \Biss\B + + which finds occurrences of "iss" in the middle of words. (\B matches + only if the current position in the subject is not a word boundary.) + When applied to the string "Mississipi" the first call to pcre2_match() + finds the first occurrence. If pcre2_match() is called again with just + the remainder of the subject, namely "issipi", it does not match, + because \B is always false at the start of the subject, which is deemed + to be a word boundary. However, if pcre2_match() is passed the entire + string again, but with startoffset set to 4, it finds the second occur- + rence of "iss" because it is able to look behind the starting point to + discover that it is preceded by a letter. + + Finding all the matches in a subject is tricky when the pattern can + match an empty string. It is possible to emulate Perl's /g behaviour by + first trying the match again at the same offset, with the + PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED options, and then if that + fails, advancing the starting offset and trying an ordinary match + again. There is some code that demonstrates how to do this in the + pcre2demo sample program. 
In the most general case, you have to check + to see if the newline convention recognizes CRLF as a newline, and if + so, and the current character is CR followed by LF, advance the start- + ing offset by two characters instead of one. + + If a non-zero starting offset is passed when the pattern is anchored, + one attempt to match at the given offset is made. This can only succeed + if the pattern does not require the match to be at the start of the + subject. + + Option bits for pcre2_match() + + The unused bits of the options argument for pcre2_match() must be zero. + The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL, + PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, + PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and + PCRE2_PARTIAL_SOFT. Their action is described below. + + If the pattern was successfully processed by the just-in-time (JIT) + compiler, the only supported options for matching using the JIT code + are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, + PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. If an + unsupported option is used, JIT matching is disabled and the normal + interpretive code in pcre2_match() is run. + + PCRE2_ANCHORED + + The PCRE2_ANCHORED option limits pcre2_match() to matching at the first + matching position. If a pattern was compiled with PCRE2_ANCHORED, or + turned out to be anchored by virtue of its contents, it cannot be made + unanchored at matching time. Note that setting the option at match time + disables JIT matching. + + PCRE2_NOTBOL + + This option specifies that the first character of the subject string is not + the beginning of a line, so the circumflex metacharacter should not + match before it. Setting this without PCRE2_MULTILINE (at compile time) + causes circumflex never to match. This option affects only the behav- + iour of the circumflex metacharacter. It does not affect \A.
+ + PCRE2_NOTEOL + + This option specifies that the end of the subject string is not the end + of a line, so the dollar metacharacter should not match it nor (except + in multiline mode) a newline immediately before it. Setting this with- + out PCRE2_MULTILINE (at compile time) causes dollar never to match. + This option affects only the behaviour of the dollar metacharacter. It + does not affect \Z or \z. + + PCRE2_NOTEMPTY + + An empty string is not considered to be a valid match if this option is + set. If there are alternatives in the pattern, they are tried. If all + the alternatives match the empty string, the entire match fails. For + example, if the pattern + + a?b? + + is applied to a string not beginning with "a" or "b", it matches an + empty string at the start of the subject. With PCRE2_NOTEMPTY set, this + match is not valid, so PCRE2 searches further into the string for + occurrences of "a" or "b". + + PCRE2_NOTEMPTY_ATSTART + + This is like PCRE2_NOTEMPTY, except that an empty string match that is + not at the start of the subject is permitted. If the pattern is + anchored, such a match can occur only if the pattern contains \K. + + PCRE2_NO_START_OPTIMIZE + + There are a number of optimizations that pcre2_match() uses at the + start of a match, in order to speed up the process. For example, if it + is known that an unanchored match must start with a specific character, + it searches the subject for that character, and fails immediately if it + cannot find it, without actually running the main matching function. + This means that a special item such as (*COMMIT) at the start of a pat- + tern is not considered until after a suitable starting point for the + match has been found. Also, when callouts or (*MARK) items are in use, + these "start-up" optimizations can cause them to be skipped if the pat- + tern is never actually used. The start-up optimizations are in effect a + pre-scan of the subject that takes place before the pattern is run. 
+ + The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, + possibly causing performance to suffer, but ensuring that in cases + where the result is "no match", the callouts do occur, and that items + such as (*COMMIT) and (*MARK) are considered at every possible starting + position in the subject string. If PCRE2_NO_START_OPTIMIZE is set at + compile time, it cannot be unset at matching time. The use of + PCRE2_NO_START_OPTIMIZE at matching time (that is, passing it to + pcre2_match()) disables JIT execution; in this situation, matching is + always done interpretively. + + Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching + operation. Consider the pattern + + (*COMMIT)ABC + + When this is compiled, PCRE2 records the fact that a match must start + with the character "A". Suppose the subject string is "DEFABC". The + start-up optimization scans along the subject, finds "A" and runs the + first match attempt from there. The (*COMMIT) item means that the pat- + tern must match the current starting position, which in this case, it + does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE + set, the initial scan along the subject string does not happen. The + first match attempt is run starting from "D" and when this fails, + (*COMMIT) prevents any further matches being tried, so the overall + result is "no match". There are also other start-up optimizations. For + example, a minimum length for the subject may be recorded. Consider the + pattern + + (*MARK:A)(X|Y) + + The minimum length for a match is one character. If the subject is + "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt + to match an empty string at the end of the subject does not take place, + because PCRE2 knows that the subject is now too short, and so the + (*MARK) is never encountered.
In this case, the optimization does not + affect the overall match result, which is still "no match", but it does + affect the auxiliary information that is returned. + + PCRE2_NO_UTF_CHECK + + When PCRE2_UTF is set at compile time, the validity of the subject as a + UTF string is checked by default when pcre2_match() is subsequently + called. The entire string is checked before any other processing takes + place, and a negative error code is returned if the check fails. There + are several UTF error codes for each code unit width, corresponding to + different problems with the code unit sequence. The value of startoff- + set is also checked, to ensure that it points to the start of a charac- + ter or to the end of the subject. There are discussions about the + validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the + pcre2unicode page. + + If you know that your subject is valid, and you want to skip these + checks for performance reasons, you can set the PCRE2_NO_UTF_CHECK + option when calling pcre2_match(). You might want to do this for the + second and subsequent calls to pcre2_match() if you are making repeated + calls to find all the matches in a single subject string. + + NOTE: When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid + string as a subject, or an invalid value of startoffset, is undefined. + Your program may crash or loop indefinitely. + + PCRE2_PARTIAL_HARD + PCRE2_PARTIAL_SOFT + + These options turn on the partial matching feature. A partial match + occurs if the end of the subject string is reached successfully, but + there are not enough subject characters to complete the match. If this + happens when PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set, + matching continues by testing any remaining alternatives. Only if no + complete match can be found is PCRE2_ERROR_PARTIAL returned instead of + PCRE2_ERROR_NOMATCH. 
In other words, PCRE2_PARTIAL_SOFT says that the + caller is prepared to handle a partial match, but only if no complete + match can be found. + + If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this + case, if a partial match is found, pcre2_match() immediately returns + PCRE2_ERROR_PARTIAL, without considering any other alternatives. In + other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid- + ered to be more important than an alternative complete match. + + There is a more detailed discussion of partial and multi-segment match- + ing, with examples, in the pcre2partial documentation. + + +NEWLINE HANDLING WHEN MATCHING + + When PCRE2 is built, a default newline convention is set; this is usu- + ally the standard convention for the operating system. The default can + be overridden in either a compile context or a match context. However, + changing the newline convention at match time disables JIT matching. + During matching, the newline choice affects the behaviour of the dot, + circumflex, and dollar metacharacters. It may also alter the way the + match position is advanced after a match failure for an unanchored pat- + tern. + + When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is + set, and a match attempt for an unanchored pattern fails when the cur- + rent position is at a CRLF sequence, and the pattern contains no + explicit matches for CR or LF characters, the match position is + advanced by two characters instead of one, in other words, to after the + CRLF. + + The above rule is a compromise that makes the most common cases work as + expected. For example, if the pattern is .+A (and the PCRE2_DOTALL + option is not set), it does not match the string "\r\nA" because, after + failing at the start, it skips both the CR and the LF before retrying.
+ However, the pattern [\r\n]A does match that string, because it con- + tains an explicit CR or LF reference, and so advances only by one char- + acter after the first failure. + + An explicit match for CR or LF is either a literal appearance of one of + those characters in the pattern, or one of the \r or \n escape + sequences. Implicit matches such as [^X] do not count, nor does \s + (which includes CR and LF in the characters that it matches). + + Notwithstanding the above, anomalous effects may still occur when CRLF + is a valid newline sequence and explicit \r or \n escapes appear in the + pattern. + + +HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS + + uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data); + + PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data); + + In general, a pattern matches a certain portion of the subject, and in + addition, further substrings from the subject may be picked out by + parenthesized parts of the pattern. Following the usage in Jeffrey + Friedl's book, this is called "capturing" in what follows, and the + phrase "capturing subpattern" is used for a fragment of a pattern that + picks out a substring. PCRE2 supports several other kinds of parenthe- + sized subpattern that do not cause substrings to be captured. The + pcre2_pattern_info() function can be used to find out how many captur- + ing subpatterns there are in a compiled pattern. + + The overall matched string and any captured substrings are returned to + the caller via a vector of PCRE2_SIZE values, called the ovector. This + is contained within the match data block. You can obtain direct access + to the ovector by calling pcre2_get_ovector_pointer() to find its + address, and pcre2_get_ovector_count() to find the number of pairs of + values it contains. Alternatively, you can use the auxiliary functions + for accessing captured substrings by number or by name (see below).
+ + Within the ovector, the first in each pair of values is set to the off- + set of the first code unit of a substring, and the second is set to the + offset of the first code unit after the end of a substring. These val- + ues are always code unit offsets, not character offsets. That is, they + are byte offsets in the 8-bit library, 16-bit offsets in the 16-bit + library, and 32-bit offsets in the 32-bit library. + + The first pair of offsets (that is, ovector[0] and ovector[1]) identi- + fies the portion of the subject string that was matched by the entire + pattern. The next pair is used for the first capturing subpattern, and + so on. The value returned by pcre2_match() is one more than the high- + est numbered pair that has been set. For example, if two substrings + have been captured, the returned value is 3. If there are no capturing + subpatterns, the return value from a successful match is 1, indicating + that just the first pair of offsets has been set. + + If a capturing subpattern is matched repeatedly within a single match + operation, it is the last portion of the string that it matched that is + returned. + + If the ovector is too small to hold all the captured substring offsets, + as much as possible is filled in, and the function returns a value of + zero. If neither the actual string matched nor any captured substrings + are of interest, pcre2_match() may be called with a match data block + whose ovector is of zero length. However, if the pattern contains back + references and the ovector is not big enough to remember the related + substrings, PCRE2 has to get additional memory for use during matching. + Thus it is usually advisable to set up a match data block containing an + ovector of reasonable size. + + It is possible for capturing subpattern number n+1 to match some part + of the subject when subpattern n has not been used at all. 
For example, + if the string "abc" is matched against the pattern (a|(z))(bc) the + return from the function is 4, and subpatterns 1 and 3 are matched, but + 2 is not. When this happens, both values in the offset pairs corre- + sponding to unused subpatterns are set to PCRE2_UNSET. + + Offset values that correspond to unused subpatterns at the end of the + expression are also set to PCRE2_UNSET. For example, if the string + "abc" is matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 + are not matched. The return from the function is 2, because the high- + est used capturing subpattern number is 1. The offsets for the sec- + ond and third capturing subpatterns (assuming the vector is large + enough, of course) are set to PCRE2_UNSET. + + Elements in the ovector that do not correspond to capturing parentheses + in the pattern are never changed. That is, if a pattern contains n cap- + turing parentheses, no more than ovector[0] to ovector[2n+1] are set by + pcre2_match(). The other elements retain whatever values they previ- + ously had. + + Other information about the match + + PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data); + + PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *match_data); + + PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *match_data); + + PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data); + + In addition to the offsets in the ovector, other information about a + match is retained in the match data block and can be retrieved by the + above functions. + + When a (*MARK) name is to be passed back, pcre2_get_mark() returns a + pointer to the zero-terminated name, which is within the compiled pat- + tern. Otherwise NULL is returned. A (*MARK) name may be available + after a failed match or a partial match, as well as after a successful + one. + + The other three functions yield values that give information about the + part of the subject string that was inspected during a successful match + or a partial match. 
Their results are undefined after a failed match. + They return the following values, respectively: + + (1) The offset of the leftmost character that was inspected during the + match. This can be earlier than the point at which the match started + if the pattern contains lookbehind assertions or \b or \B at the start. + + (2) The offset of the character that follows the rightmost character + that was inspected during the match. This can be after the end of the + match if the pattern contains lookahead assertions. + + (3) The offset of the character at which the successful or partial + match started. This can be different to the value of ovector[0] if the + pattern contains the \K escape sequence. + + For example, if the pattern (?<=abc)xx\Kyy(?=def) is matched against + the string "123abcxxyydef123", the resulting offsets are: + + ovector[0] 8 + ovector[1] 10 + leftchar 3 + rightchar 13 + startchar 6 + + The allusedtext modifier in pcre2test can be used to display a longer + string that shows the leftmost and rightmost characters in a match + instead of just the matched string. + + Error return values from pcre2_match() + + If pcre2_match() fails, it returns a negative number. This can be con- + verted to a text string by calling pcre2_get_error_message(). Negative + error codes are also returned by other functions, and are documented + with them. The codes are given names in the header file. If UTF check- + ing is in force and an invalid UTF subject string is detected, one of a + number of UTF-specific negative error codes is returned. Details are + given in the pcre2unicode page. The following are the other errors that + may be returned by pcre2_match(): + + PCRE2_ERROR_NOMATCH + + The subject string did not match the pattern. + + PCRE2_ERROR_PARTIAL + + The subject string did not match, but it did match partially. See the + pcre2partial documentation for details of partial matching. 
+ + PCRE2_ERROR_BADMAGIC + + PCRE2 stores a 4-byte "magic number" at the start of the compiled code, + to catch the case when it is passed a junk pointer. This is the error + that is returned when the magic number is not present. + + PCRE2_ERROR_BADMODE + + This error is given when a pattern that was compiled by the 8-bit + library is passed to a 16-bit or 32-bit library function, or vice + versa. + + PCRE2_ERROR_BADOFFSET + + The value of startoffset was greater than the length of the subject. + + PCRE2_ERROR_BADOPTION + + An unrecognized bit was set in the options argument. + + PCRE2_ERROR_BADUTFOFFSET + + The UTF code unit sequence that was passed as a subject was checked and + found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the + value of startoffset did not point to the beginning of a UTF character + or the end of the subject. + + PCRE2_ERROR_CALLOUT + + This error is never generated by pcre2_match() itself. It is provided + for use by callout functions that want to cause pcre2_match() to return + a distinctive error code. See the pcre2callout documentation for + details. + + PCRE2_ERROR_INTERNAL + + An unexpected internal error has occurred. This error could be caused + by a bug in PCRE2 or by overwriting of the compiled pattern. + + PCRE2_ERROR_JIT_BADOPTION + + This error is returned when a pattern that was successfully studied + using JIT is being matched, but the matching mode (partial or complete + match) does not correspond to any JIT compilation mode. When the JIT + fast path function is used, this error may be also given for invalid + options. See the pcre2jit documentation for more details. + + PCRE2_ERROR_JIT_STACKLIMIT + + This error is returned when a pattern that was successfully studied + using JIT is being matched, but the memory available for the just-in- + time processing stack is not large enough. See the pcre2jit documenta- + tion for more details. + + PCRE2_ERROR_MATCHLIMIT + + The backtracking limit was reached. 
+ + PCRE2_ERROR_NOMEMORY + + If a pattern contains back references, but the ovector is not big + enough to remember the referenced substrings, PCRE2 gets a block of + memory at the start of matching to use for this purpose. There are some + other special cases where extra memory is needed during matching. This + error is given when memory cannot be obtained. + + PCRE2_ERROR_NULL + + Either the code, subject, or match_data argument was passed as NULL. + + PCRE2_ERROR_RECURSELOOP + + This error is returned when pcre2_match() detects a recursion loop + within the pattern. Specifically, it means that either the whole pat- + tern or a subpattern has been called recursively for the second time at + the same position in the subject string. Some simple patterns that + might do this are detected and faulted at compile time, but more com- + plicated cases, in particular mutual recursions between two different + subpatterns, cannot be detected until run time. + + PCRE2_ERROR_RECURSIONLIMIT + + The internal recursion limit was reached. + + +EXTRACTING CAPTURED SUBSTRINGS BY NUMBER + + int pcre2_substring_length_bynumber(pcre2_match_data *match_data, + unsigned int number, PCRE2_SIZE *length); + + int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, + unsigned int number, PCRE2_UCHAR *buffer, + PCRE2_SIZE *bufflen); + + int pcre2_substring_get_bynumber(pcre2_match_data *match_data, + unsigned int number, PCRE2_UCHAR **bufferptr, + PCRE2_SIZE *bufflen); + + void pcre2_substring_free(PCRE2_UCHAR *buffer); + + Captured substrings can be accessed directly by using the ovector as + described above. For convenience, auxiliary functions are provided for + extracting captured substrings as new, separate, zero-terminated + strings. The functions in this section identify substrings by number. + The next section describes similar functions for extracting substrings + by name. 
A substring that contains a binary zero is correctly extracted + and has a further zero added on the end, but the result is not, of + course, a C string. + + You can find the length in code units of a captured substring without + extracting it by calling pcre2_substring_length_bynumber(). The first + argument is a pointer to the match data block, the second is the group + number, and the third is a pointer to a variable into which the length + is placed. + + The pcre2_substring_copy_bynumber() function copies one string into a + supplied buffer, whereas pcre2_substring_get_bynumber() copies it into + new memory, obtained using the same memory allocation function that was + used for the match data block. The first two arguments of these func- + tions are a pointer to the match data block and a capturing group num- + ber. A group number of zero extracts the substring that matched the + entire pattern, and higher values extract the captured substrings. + + The final arguments of pcre2_substring_copy_bynumber() are a pointer to + the buffer and a pointer to a variable that contains its length in code + units. This is updated to contain the actual number of code units + used, excluding the terminating zero. + + For pcre2_substring_get_bynumber() the third and fourth arguments point + to variables that are updated with a pointer to the new memory and the + number of code units that comprise the substring, again excluding the + terminating zero. When the substring is no longer needed, the memory + should be freed by calling pcre2_substring_free(). + + The return value from these functions is zero for success, or one of + these error codes: + + PCRE2_ERROR_NOMEMORY + + The buffer was too small for pcre2_substring_copy_bynumber(), or the + attempt to get memory failed for pcre2_substring_get_bynumber(). + + PCRE2_ERROR_NOSUBSTRING + + No substring with the given number was captured. 
This could be because + there is no capturing group of that number in the pattern, or because + the group with that number did not participate in the match, or because + the ovector was too small to capture that group. + + +EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS + + int pcre2_substring_list_get(pcre2_match_data *match_data, + PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); + + void pcre2_substring_list_free(PCRE2_SPTR *list); + + The pcre2_substring_list_get() function extracts all available sub- + strings and builds a list of pointers to them, and a second list that + contains their lengths (in code units), excluding a terminating zero + that is added to each of them. All this is done in a single block of + memory that is obtained using the same memory allocation function that + was used to get the match data block. + + The address of the memory block is returned via listptr, which is also + the start of the list of string pointers. The end of the list is marked + by a NULL pointer. The address of the list of lengths is returned via + lengthsptr. If your strings do not contain binary zeros and you do not + therefore need the lengths, you may supply NULL as the lengthsptr argu- + ment to disable the creation of a list of lengths. The yield of the + function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the mem- + ory block could not be obtained. When the list is no longer needed, it + should be freed by calling pcre2_substring_list_free(). + + If this function encounters a substring that is unset, which can happen + when capturing subpattern number n+1 matches some part of the subject, + but subpattern n has not been used at all, it returns an empty string. + This can be distinguished from a genuine zero-length substring by + inspecting the appropriate offset in the ovector, which contains + PCRE2_UNSET for unset substrings. 
+ + +EXTRACTING CAPTURED SUBSTRINGS BY NAME + + int pcre2_substring_number_from_name(const pcre2_code *code, + PCRE2_SPTR name); + + int pcre2_substring_length_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_SIZE *length); + + int pcre2_substring_copy_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); + + int pcre2_substring_get_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); + + void pcre2_substring_free(PCRE2_UCHAR *buffer); + + To extract a substring by name, you first have to find the associated + number. For example, for this pattern: + + (a+)b(?<xxx>\d+)... + + the number of the subpattern called "xxx" is 2. If the name is known to + be unique (PCRE2_DUPNAMES was not set), you can find the number from + the name by calling pcre2_substring_number_from_name(). The first argu- + ment is the compiled pattern, and the second is the name. The yield of + the function is the subpattern number, or PCRE2_ERROR_NOSUBSTRING if + there is no subpattern of that name. + + Given the number, you can extract the substring directly, or use one of + the functions described in the previous section. For convenience, there + are also "byname" functions that correspond to the "bynumber" func- + tions, the only difference being that the second argument is a name + instead of a number. However, if PCRE2_DUPNAMES is set and there are + duplicate names, the behaviour may not be what you want (see the next + section). + + Warning: If the pattern uses the (?| feature to set up multiple subpat- + terns with the same number, as described in the section on duplicate + subpattern numbers in the pcre2pattern page, you cannot use names to + distinguish the different subpatterns, because names are not included + in the compiled code. The matching process uses only numbers. 
For this + reason, the use of different names for subpatterns of the same number + causes an error at compile time. + + +DUPLICATE SUBPATTERN NAMES + + int pcre2_substring_nametable_scan(const pcre2_code *code, + PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); + + When a pattern is compiled with the PCRE2_DUPNAMES option, names for + subpatterns are not required to be unique. Duplicate names are always + allowed for subpatterns with the same number, created by using the (?| + feature. Indeed, if such subpatterns are named, they are required to + use the same names. + + Normally, patterns with duplicate names are such that in any one match, + only one of the named subpatterns participates. An example is shown in + the pcre2pattern documentation. + + When duplicates are present, pcre2_substring_copy_byname() and + pcre2_substring_get_byname() return the first substring corresponding + to the given name that is set. If none are set, PCRE2_ERROR_NOSUBSTRING + is returned. The pcre2_substring_number_from_name() function returns + one of the numbers that are associated with the name, but it is not + defined which it is. + + If you want to get full details of all captured substrings for a given + name, you must use the pcre2_substring_nametable_scan() function. The + first argument is the compiled pattern, and the second is the name. If + the third and fourth arguments are NULL, the function returns a group + number (it is not defined which). Otherwise, the third and fourth argu- + ments must be pointers to variables that are updated by the function. + After it has run, they point to the first and last entries in the name- + to-number table for the given name, and the function returns the length + of each entry. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if + there are no entries for the given name. + + The format of the name table is described above in the section entitled + Information about a pattern. 
Given all the relevant entries for + the name, you can extract each of their numbers, and hence the captured + data. + + +FINDING ALL POSSIBLE MATCHES + + The traditional matching function uses a similar algorithm to Perl, + which stops when it finds the first match, starting at a given point in + the subject. If you want to find all possible matches, or the longest + possible match at a given position, consider using the alternative + matching function (see below) instead. If you cannot use the alterna- + tive function, you can kludge it up by making use of the callout facil- + ity, which is described in the pcre2callout documentation. + + What you have to do is to insert a callout right at the end of the pat- + tern. When your callout function is called, extract and save the cur- + rent matched substring. Then return 1, which forces pcre2_match() to + backtrack and try other alternatives. Ultimately, when it runs out of + matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH. + + +MATCHING A PATTERN: THE ALTERNATIVE FUNCTION + + int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, + int *workspace, PCRE2_SIZE wscount); + + The function pcre2_dfa_match() is called to match a subject string + against a compiled pattern, using a matching algorithm that scans the + subject string just once, and does not backtrack. This has different + characteristics to the normal algorithm, and is not compatible with + Perl. Some of the features of PCRE2 patterns are not supported. Never- + theless, there are times when this kind of matching can be useful. For + a discussion of the two matching algorithms, and a list of features + that pcre2_dfa_match() does not support, see the pcre2matching documen- + tation. + + The arguments for the pcre2_dfa_match() function are the same as for + pcre2_match(), plus two extras. 
The ovector within the match data block + is used in a different way, and this is described below. The other com- + mon arguments are used in the same way as for pcre2_match(), so their + description is not repeated here. + + The two additional arguments provide workspace for the function. The + workspace vector should contain at least 20 elements. It is used for + keeping track of multiple paths through the pattern tree. More + workspace is needed for patterns and subjects where there are a lot of + potential matches. + + Here is an example of a simple call to pcre2_dfa_match(): + + int wspace[20]; + pcre2_match_data *md = pcre2_match_data_create(4, NULL); + int rc = pcre2_dfa_match( + re, /* result of pcre2_compile() */ + "some string", /* the subject string */ + 11, /* the length of the subject string */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, /* the match data block */ + NULL, /* a match context; NULL means use defaults */ + wspace, /* working space vector */ + 20); /* number of elements (NOT size in bytes) */ + + Option bits for pcre2_dfa_match() + + The unused bits of the options argument for pcre2_dfa_match() must be + zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL, + PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, + PCRE2_NO_UTF_CHECK, PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD, + PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but + the last four of these are exactly the same as for pcre2_match(), so + their description is not repeated here. + + PCRE2_PARTIAL_HARD + PCRE2_PARTIAL_SOFT + + These have the same general effect as they do for pcre2_match(), but + the details are slightly different. When PCRE2_PARTIAL_HARD is set for + pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the + subject is reached and there is still at least one matching possibility + that requires additional characters. 
This happens even if some complete + matches have already been found. When PCRE2_PARTIAL_SOFT is set, the + return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL + if the end of the subject is reached, there have been no complete + matches, but there is still at least one matching possibility. The por- + tion of the string that was inspected when the longest partial match + was found is set as the first matching string in both cases. There is a + more detailed discussion of partial and multi-segment matching, with + examples, in the pcre2partial documentation. + + PCRE2_DFA_SHORTEST + + Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to + stop as soon as it has found one match. Because of the way the alterna- + tive algorithm works, this is necessarily the shortest possible match + at the first possible matching point in the subject string. + + PCRE2_DFA_RESTART + + When pcre2_dfa_match() returns a partial match, it is possible to call + it again, with additional subject characters, and have it continue with + the same match. The PCRE2_DFA_RESTART option requests this action; when + it is set, the workspace and wscount options must reference the same + vector as before because data about the match so far is left in them + after a partial match. There is more discussion of this facility in the + pcre2partial documentation. + + Successful returns from pcre2_dfa_match() + + When pcre2_dfa_match() succeeds, it may have matched more than one sub- + string in the subject. Note, however, that all the matches from one run + of the function start at the same point in the subject. The shorter + matches are all initial substrings of the longer matches. 
For example, + if the pattern + + <.*> + + is matched against the string + + This is <something> <something else> <something further> no more + + the three matched strings are + + <something> + <something> <something else> + <something> <something else> <something further> + + On success, the yield of the function is a number greater than zero, + which is the number of matched substrings. The offsets of the sub- + strings are returned in the ovector, and can be extracted in the same + way as for pcre2_match(). They are returned in reverse order of + length; that is, the longest matching string is given first. If there + were too many matches to fit into the ovector, the yield of the func- + tion is zero, and the vector is filled with the longest matches. + + NOTE: PCRE2's "auto-possessification" optimization usually applies to + character repeats at the end of a pattern (as well as internally). For + example, the pattern "a\d+" is compiled as if it were "a\d++" because + there is no point in backtracking into the repeated digits. For DFA + matching, this means that only one possible match is found. If you + really do want multiple matches in such cases, either use an ungreedy + repeat ("a\d+?") or set the PCRE2_NO_AUTO_POSSESS option when compil- + ing. + + Error returns from pcre2_dfa_match() + + The pcre2_dfa_match() function returns a negative number when it fails. + Many of the errors are the same as for pcre2_match(), as described + above. There are in addition the following errors that are specific to + pcre2_dfa_match(): + + PCRE2_ERROR_DFA_UITEM + + This return is given if pcre2_dfa_match() encounters an item in the + pattern that it does not support, for instance, the use of \C or a back + reference. + + PCRE2_ERROR_DFA_UCOND + + This return is given if pcre2_dfa_match() encounters a condition item + that uses a back reference for the condition, or a test for recursion + in a specific group. These are not supported. 
+ + PCRE2_ERROR_DFA_WSSIZE + + This return is given if pcre2_dfa_match() runs out of space in the + workspace vector. + + PCRE2_ERROR_DFA_RECURSE + + When a recursive subpattern is processed, the matching function calls + itself recursively, using private memory for the ovector and workspace. + This error is given if the internal ovector is not large enough. This + should be extremely rare, as a vector of size 1000 is used. + + PCRE2_ERROR_DFA_BADRESTART + + When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, + some plausibility checks are made on the contents of the workspace, + which should contain data about the previous partial match. If any of + these checks fail, this error is given. + + +SEE ALSO + + pcre2build(3), pcre2libs(3), pcre2callout(3), pcre2matching(3), + pcre2partial(3), pcre2posix(3), pcre2demo(3), pcre2sample(3), + pcre2stack(3). + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge CB2 3QH, England. + + +REVISION + + Last updated: 16 September 2014 + Copyright (c) 1997-2014 University of Cambridge. +------------------------------------------------------------------------------ + + +PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +SYNOPSIS + + #include <pcre2.h> + + int (*pcre2_callout)(pcre2_callout_block *); + + +DESCRIPTION + + PCRE2 provides a feature called "callout", which is a means of tempo- + rarily passing control to the caller of PCRE2 in the middle of pattern + matching. The caller of PCRE2 provides an external function by putting + its entry point in a match context (see pcre2_set_callout() in the + pcre2api documentation). + + Within a regular expression, (?C) indicates the points at which the + external function is to be called. Different callout points can be + identified by putting a number less than 256 after the letter C. The + default value is zero. 
For example, this pattern has two callout + points: + + (?C1)abc(?C2)def + + If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, + PCRE2 automatically inserts callouts, all with number 255, before each + item in the pattern. For example, if PCRE2_AUTO_CALLOUT is used with + the pattern + + A(\d{2}|--) + + it is processed as if it were + + (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) + + Notice that there is a callout before and after each parenthesis and + alternation bar. If the pattern contains a conditional group whose con- + dition is an assertion, an automatic callout is inserted immediately + before the condition. Such a callout may also be inserted explicitly, + for example: + + (?(?C9)(?=a)ab|de) + + This applies only to assertion conditions (because they are themselves + independent groups). + + Automatic callouts can be used for tracking the progress of pattern + matching. The pcre2test program has a pattern qualifier (/auto_call- + out) that sets automatic callouts; when it is used, the output indi- + cates how the pattern is being matched. This is useful information when + you are trying to optimize the performance of a particular pattern. + + +MISSING CALLOUTS + + You should be aware that, because of optimizations in the way PCRE2 + compiles and matches patterns, callouts sometimes do not happen exactly + as you might expect. + + At compile time, PCRE2 "auto-possessifies" repeated items when it knows + that what follows cannot be part of the repeat. For example, a+[bc] is + compiled as if it were a++[bc]. The pcre2test output when this pattern + is anchored and then applied with automatic callouts to the string + "aaaa" is: + + --->aaaa + +0 ^ ^ + +1 ^ a+ + +3 ^ ^ [bc] + No match + + This indicates that when matching [bc] fails, there is no backtracking + into a+ and therefore the callouts that would be taken for the back- + tracks do not occur. 
You can disable the auto-possessify feature by + passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat- + tern with (*NO_AUTO_POSSESS). If this is done in pcre2test (using the + /no_auto_possess qualifier), the output changes to this: + + --->aaaa + +0 ^ ^ + +1 ^ a+ + +3 ^ ^ [bc] + +3 ^ ^ [bc] + +3 ^ ^ [bc] + +3 ^^ [bc] + No match + + This time, when matching [bc] fails, the matcher backtracks into a+ and + tries again, repeatedly, until a+ itself fails. + + Other optimizations that provide fast "no match" results also affect + callouts. For example, if the pattern is + + ab(?C4)cd + + PCRE2 knows that any matching string must contain the letter "d". If + the subject string is "abyz", the lack of "d" means that matching + doesn't ever start, and the callout is never reached. However, with + "abyd", though the result is still no match, the callout is obeyed. + + PCRE2 also knows the minimum length of a matching string, and will + immediately give a "no match" return without actually running a match + if the subject is not long enough, or, for unanchored patterns, if it + has been scanned far enough. + + You can disable these optimizations by passing the PCRE2_NO_START_OPTI- + MIZE option to the matching function, or by starting the pattern with + (*NO_START_OPT). This slows down the matching process, but does ensure + that callouts such as the example above are obeyed. + + +THE CALLOUT INTERFACE + + During matching, when PCRE2 reaches a callout point, the external func- + tion that is set in the match context is called (if it is set). This + applies to both normal and DFA matching. The only argument to the call- + out function is a pointer to a pcre2_callout_block. 
This structure con- + tains the following fields: + + uint32_t version; + uint32_t callout_number; + uint32_t capture_top; + uint32_t capture_last; + void *callout_data; + PCRE2_SIZE *offset_vector; + PCRE2_SPTR mark; + PCRE2_SPTR subject; + PCRE2_SIZE subject_length; + PCRE2_SIZE start_match; + PCRE2_SIZE current_position; + PCRE2_SIZE pattern_position; + PCRE2_SIZE next_item_length; + + The version field contains the version number of the block format. The + current version is 0. The version number will change in future if addi- + tional fields are added, but the intention is never to remove any of + the existing fields. + + The callout_number field contains the number of the callout, as com- + piled into the pattern (that is, the number after ?C for manual call- + outs, and 255 for automatically generated callouts). + + The offset_vector field is a pointer to the vector of capturing offsets + (the "ovector") that was passed to the matching function in the match + data block. When pcre2_match() is used, the contents can be inspected, + in order to extract substrings that have been matched so far, in the + same way as for extracting substrings after a match has completed. For + the DFA matching function, this field is not useful. + + The subject and subject_length fields contain copies of the values that + were passed to the matching function. + + The start_match field normally contains the offset within the subject + at which the current match attempt started. However, if the escape + sequence \K has been encountered, this value is changed to reflect the + modified starting point. If the pattern is not anchored, the callout + function may be called several times from the same point in the pattern + for different starting points in the subject. + + The current_position field contains the offset within the subject of + the current match pointer. 
+ + When pcre2_match() is used, the capture_top field contains one more + than the number of the highest numbered captured substring so far. If + no substrings have been captured, the value of capture_top is one. This + is always the case when the DFA functions are used, because they do not + support captured substrings. + + The capture_last field contains the number of the most recently cap- + tured substring. However, when a recursion exits, the value reverts to + what it was outside the recursion, as do the values of all captured + substrings. If no substrings have been captured, the value of cap- + ture_last is 0. This is always the case for the DFA matching functions. + + The callout_data field contains a value that is passed to a matching + function specifically so that it can be passed back in callouts. It is + set in the match context when the callout is set up by calling + pcre2_set_callout() (see the pcre2api documentation). + + The pattern_position field contains the offset to the next item to be + matched in the pattern string. + + The next_item_length field contains the length of the next item to be + matched in the pattern string. When the callout immediately precedes an + alternation bar, a closing parenthesis, or the end of the pattern, the + length is zero. When the callout precedes an opening parenthesis, the + length is that of the entire subpattern. + + The pattern_position and next_item_length fields are intended to help + in distinguishing between different automatic callouts, which all have + the same callout number. However, they are set for all callouts. + + In callouts from pcre2_match() the mark field contains a pointer to the + zero-terminated name of the most recently passed (*MARK), (*PRUNE), or + (*THEN) item in the match, or NULL if no such items have been passed. + Instances of (*PRUNE) or (*THEN) without a name do not obliterate a + previous (*MARK). In callouts from the DFA matching function this field + always contains NULL. 
+ + +RETURN VALUES + + The external callout function returns an integer to PCRE2. If the value + is zero, matching proceeds as normal. If the value is greater than + zero, matching fails at the current point, but the testing of other + matching possibilities goes ahead, just as if a lookahead assertion had + failed. If the value is less than zero, the match is abandoned, and the + matching function returns the negative value. + + Negative values should normally be chosen from the set of + PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a + standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is + reserved for use by callout functions; it will never be used by PCRE2 + itself. + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge CB2 3QH, England. + + +REVISION + + Last updated: 19 October 2014 + Copyright (c) 1997-2014 University of Cambridge. +------------------------------------------------------------------------------ + + +PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) + + + +NAME + PCRE - Perl-compatible regular expressions (revised API) + +UNICODE AND UTF SUPPORT + + When PCRE2 is built with Unicode support, it acquires knowledge of Uni- + code character properties and can process text strings in UTF-8, + UTF-16, or UTF-32 format (depending on the code unit width). By + default, PCRE2 assumes that one code unit is one character. To process + a pattern as a UTF string, where a character may require more than one + code unit, you must call pcre2_compile() with the PCRE2_UTF option + flag, or the pattern must start with the sequence (*UTF). When either + of these is the case, both the pattern and any subject strings that are + matched against it are treated as UTF strings instead of strings of + individual one-code-unit characters. 
+ + If you build PCRE2 with Unicode support, the library will be bigger, + but the additional run time overhead is limited to testing the + PCRE2_UTF flag occasionally, so should not be very much. + + +UNICODE PROPERTY SUPPORT + + When PCRE2 is built with Unicode support, the escape sequences \p{..}, + \P{..}, and \X can be used. The Unicode properties that can be tested + are limited to the general category properties such as Lu for an upper + case letter or Nd for a decimal number, the Unicode script names such + as Arabic or Han, and the derived properties Any and L&. Full lists are + given in the pcre2pattern and pcre2syntax documentation. Only the short + names for properties are supported. For example, \p{L} matches a let- + ter. Its Perl synonym, \p{Letter}, is not supported. Furthermore, in + Perl, many properties may optionally be prefixed by "Is", for compati- + bility with Perl 5.6. PCRE2 does not support this. + + +WIDE CHARACTERS AND UTF MODES + + Codepoints less than 256 can be specified in patterns by either braced + or unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). + Larger values have to use braced sequences. Unbraced octal code points + up to \777 are also recognized; larger ones can be coded using \o{...}. + + In UTF modes, repeat quantifiers apply to complete UTF characters, not + to individual code units. + + In UTF modes, the dot metacharacter matches one UTF character instead + of a single code unit. + + The escape sequence \C can be used to match a single code unit, in a + UTF mode, but its use can lead to some strange effects because it + breaks up multi-unit characters (see the description of \C in the + pcre2pattern documentation). The use of \C is not supported in the + alternative matching function pcre2_dfa_match(), nor is it supported in + UTF mode by the JIT optimization. 
If JIT optimization is requested for + a UTF pattern that contains \C, it will not succeed, and so the match- + ing will be carried out by the normal interpretive function. + + The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test + characters of any code value, but, by default, the characters that + PCRE2 recognizes as digits, spaces, or word characters remain the same + set as in non-UTF mode, all with code points less than 256. This + remains true even when PCRE2 is built to include Unicode support, + because to do otherwise would slow down matching in many common cases. + Note that this also applies to \b and \B, because they are defined in + terms of \w and \W. If you want to test for a wider sense of, say, + "digit", you can use explicit Unicode property tests such as \p{Nd}. + Alternatively, if you set the PCRE2_UCP option, the way that the char- + acter escapes work is changed so that Unicode properties are used to + determine which characters match. There are more details in the section + on generic character types in the pcre2pattern documentation. + + Similarly, characters that match the POSIX named character classes are + all low-valued characters, unless the PCRE2_UCP option is set. + + However, the special horizontal and vertical white space matching + escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char- + acters, whether or not PCRE2_UCP is set. + + Case-insensitive matching in UTF mode makes use of Unicode properties. + A few Unicode characters such as Greek sigma have more than two code- + points that are case-equivalent, and these are treated as such. + + +VALIDITY OF UTF STRINGS + + When the PCRE2_UTF option is set, the strings passed as patterns and + subjects are (by default) checked for validity on entry to the relevant + functions. If an invalid UTF string is passed, an error return is + given. + + UTF-16 and UTF-32 strings can indicate their endianness by special code + known as a byte-order mark (BOM). 
The PCRE2 functions do not handle + this, expecting strings to be in host byte order. + + The entire string is checked before any other processing takes place. + In addition to checking the format of the string, there is a check to + ensure that all code points lie in the range U+0 to U+10FFFF, excluding + the surrogate area. The so-called "non-character" code points are not + excluded because Unicode corrigendum #9 makes it clear that they should + not be. + + Characters in the "Surrogate Area" of Unicode are reserved for use by + UTF-16, where they are used in pairs to encode code points with values + greater than 0xFFFF. The code points that are encoded by UTF-16 pairs + are available independently in the UTF-8 and UTF-32 encodings. (In + other words, the whole surrogate thing is a fudge for UTF-16 which + unfortunately messes up UTF-8 and UTF-32.) + + In some situations, you may already know that your strings are valid, + and therefore want to skip these checks in order to improve perfor- + mance, for example in the case of a long subject string that is being + scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK flag at compile + time or at run time, PCRE2 assumes that the pattern or subject it is + given (respectively) contains only valid UTF code unit sequences. + + Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check + for the pattern; it does not also apply to subject strings. If you want + to disable the check for a subject string you must pass this option to + pcre2_match() or pcre2_dfa_match(). + + If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the + result is undefined and your program may crash or loop indefinitely. 
+ + Errors in UTF-8 strings + + The following negative error codes are given for invalid UTF-8 strings: + + PCRE2_ERROR_UTF8_ERR1 + PCRE2_ERROR_UTF8_ERR2 + PCRE2_ERROR_UTF8_ERR3 + PCRE2_ERROR_UTF8_ERR4 + PCRE2_ERROR_UTF8_ERR5 + + The string ends with a truncated UTF-8 character; the code specifies + how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 + characters to be no longer than 4 bytes, the encoding scheme (origi- + nally defined by RFC 2279) allows for up to 6 bytes, and this is + checked first; hence the possibility of 4 or 5 missing bytes. + + PCRE2_ERROR_UTF8_ERR6 + PCRE2_ERROR_UTF8_ERR7 + PCRE2_ERROR_UTF8_ERR8 + PCRE2_ERROR_UTF8_ERR9 + PCRE2_ERROR_UTF8_ERR10 + + The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of + the character do not have the binary value 0b10 (that is, either the + most significant bit is 0, or the next bit is 1). + + PCRE2_ERROR_UTF8_ERR11 + PCRE2_ERROR_UTF8_ERR12 + + A character that is valid by the RFC 2279 rules is either 5 or 6 bytes + long; these code points are excluded by RFC 3629. + + PCRE2_ERROR_UTF8_ERR13 + + A 4-byte character has a value greater than 0x10ffff; these code points + are excluded by RFC 3629. + + PCRE2_ERROR_UTF8_ERR14 + + A 3-byte character has a value in the range 0xd800 to 0xdfff; this + range of code points are reserved by RFC 3629 for use with UTF-16, and + so are excluded from UTF-8. + + PCRE2_ERROR_UTF8_ERR15 + PCRE2_ERROR_UTF8_ERR16 + PCRE2_ERROR_UTF8_ERR17 + PCRE2_ERROR_UTF8_ERR18 + PCRE2_ERROR_UTF8_ERR19 + + A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes + for a value that can be represented by fewer bytes, which is invalid. + For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- + rect coding uses just one byte. + + PCRE2_ERROR_UTF8_ERR20 + + The two most significant bits of the first byte of a character have the + binary value 0b10 (that is, the most significant bit is 1 and the sec- + ond is 0). 
Such a byte can only validly occur as the second or subse- + quent byte of a multi-byte character. + + PCRE2_ERROR_UTF8_ERR21 + + The first byte of a character has the value 0xfe or 0xff. These values + can never occur in a valid UTF-8 string. + + Errors in UTF-16 strings + + The following negative error codes are given for invalid UTF-16 + strings: + + PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string + PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate + PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate + + + Errors in UTF-32 strings + + The following negative error codes are given for invalid UTF-32 + strings: + + PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff) + PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge CB2 3QH, England. + + +REVISION + + Last updated: 16 September 2014 + Copyright (c) 1997-2014 University of Cambridge. +------------------------------------------------------------------------------ + + diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index b7b350e..f5528eb 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -214,7 +214,7 @@ document for an overview of all the PCRE2 documentation. .B int pcre2_pattern_info(const pcre2 *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP); .sp .B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP, PCRE2_SIZE \fIlength\fP); -.sp +.fi . . .SH "PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES" diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3 new file mode 100644 index 0000000..13535b2 --- /dev/null +++ b/doc/pcre2demo.3 @@ -0,0 +1,441 @@ +.\" Start example. +.de EX +. nr mE \\n(.f +. nf +. nh +. ft CW +.. +. +. +.\" End example. +.de EE +. ft \\n(mE +. fi +. hy \\n(HY +.. +. 
+.EX +/************************************************* +* PCRE2 DEMONSTRATION PROGRAM * +*************************************************/ + +/* This is a demonstration program to illustrate a straightforward way of +calling the PCRE2 regular expression library from a C program. See the +pcre2sample documentation for a short discussion ("man pcre2sample" if you have +the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is +incompatible with the original PCRE API. + +There are actually three libraries, each supporting a different code unit +width. This demonstration program uses the 8-bit library. + +In Unix-like environments, if PCRE2 is installed in your standard system +libraries, you should be able to compile this program using this command: + +gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo + +If PCRE2 is not installed in a standard place, it is likely to be installed +with support for the pkg-config mechanism. If you have pkg-config, you can +compile this program using this command: + +gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo + +If you do not have pkg-config, you may have to use this: + +gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e + -R/usr/local/lib -lpcre2-8 -o pcre2demo + +Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and +library files for PCRE2 are installed on your system. Only some operating +systems (Solaris is one) use the -R option. + +Building under Windows: + +If you want to statically link this program against a non-dll .a file, you must +define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment +the following line. */ + +/* #define PCRE2_STATIC */ + +/* This macro must be defined before including pcre2.h. For a program that uses +only one code unit width, it makes it possible to use generic function names +such as pcre2_compile(). 
*/ + +#define PCRE2_CODE_UNIT_WIDTH 8 + +#include <stdio.h> +#include <string.h> +#include <pcre2.h> + + +/************************************************************************** +* Here is the program. The API includes the concept of "contexts" for * +* setting up unusual interface requirements for compiling and matching, * +* such as custom memory managers and non-standard newline definitions. * +* This program does not do any of this, so it makes no use of contexts, * +* always passing NULL where a context could be given. * +**************************************************************************/ + +int main(int argc, char **argv) +{ +pcre2_code *re; +PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ +PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ +PCRE2_SPTR name_table; + +int crlf_is_newline; +int errornumber; +int find_all; +int i; +int namecount; +int name_entry_size; +int rc; +int utf8; + +uint32_t option_bits; +uint32_t newline; + +PCRE2_SIZE erroroffset; +PCRE2_SIZE *ovector; + +size_t subject_length; +pcre2_match_data *match_data; + + + +/************************************************************************** +* First, sort out the command line. There is only one possible option at * +* the moment, "-g" to request repeated matching to find all occurrences, * +* like Perl's /g option. We set the variable find_all to a non-zero value * +* if the -g option is present. Apart from that, there must be exactly two * +* arguments. * +**************************************************************************/ + +find_all = 0; +for (i = 1; i < argc; i++) + { + if (strcmp(argv[i], "-g") == 0) find_all = 1; + else break; + } + +/* After the options, we require exactly two arguments, which are the pattern, +and the subject string. 
*/ + +if (argc - i != 2) + { + printf("Two arguments required: a regex and a subject string\en"); + return 1; + } + +/* As pattern and subject are char arguments, they can be straightforwardly +cast to PCRE2_SPTR as we are working in 8-bit code units. */ + +pattern = (PCRE2_SPTR)argv[i]; +subject = (PCRE2_SPTR)argv[i+1]; +subject_length = strlen((char *)subject); + + +/************************************************************************* +* Now we are going to compile the regular expression pattern, and handle * +* any errors that are detected. * +*************************************************************************/ + +re = pcre2_compile( + pattern, /* the pattern */ + -1, /* indicates pattern is zero-terminated */ + 0, /* default options */ + &errornumber, /* for error number */ + &erroroffset, /* for error offset */ + NULL); /* use default compile context */ + +/* Compilation failed: print the error message and exit. */ + +if (re == NULL) + { + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); + printf("PCRE2 compilation failed at offset %d: %s\en", (int)erroroffset, + buffer); + return 1; + } + + +/************************************************************************* +* If the compilation succeeded, we call PCRE again, in order to do a * +* pattern match against the subject string. This does just ONE match. If * +* further matching is needed, it will be done below. Before running the * +* match we must set up a match_data block for holding the result. * +*************************************************************************/ + +/* Using this function ensures that the block is exactly the right size for +the number of capturing parentheses in the pattern. 
*/ + +match_data = pcre2_match_data_create_from_pattern(re, NULL); + +rc = pcre2_match( + re, /* the compiled pattern */ + subject, /* the subject string */ + subject_length, /* the length of the subject */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + match_data, /* block for storing the result */ + NULL); /* use default match context */ + +/* Matching failed: handle error cases */ + +if (rc < 0) + { + switch(rc) + { + case PCRE2_ERROR_NOMATCH: printf("No match\en"); break; + /* + Handle other special cases if you like + */ + default: printf("Matching error %d\en", rc); break; + } + pcre2_match_data_free(match_data); /* Release memory used for the match */ + pcre2_code_free(re); /* data and the compiled pattern. */ + return 1; + } + +/* Match succeeded. Get a pointer to the output vector, where string offsets are +stored. */ + +ovector = pcre2_get_ovector_pointer(match_data); +printf("\enMatch succeeded at offset %d\en", (int)ovector[0]); + + +/************************************************************************* +* We have found the first match within the subject string. If the output * +* vector wasn't big enough, say so. Then output any substrings that were * +* captured. * +*************************************************************************/ + +/* The output vector wasn't big enough. This should not happen, because we used +pcre2_match_data_create_from_pattern() above. */ + +if (rc == 0) + printf("ovector was not big enough for all the captured substrings\en"); + +/* Show substrings stored in the output vector by number. Obviously, in a real +application you might want to do things other than print them. 
*/ + +for (i = 0; i < rc; i++) + { + PCRE2_SPTR substring_start = subject + ovector[2*i]; + size_t substring_length = ovector[2*i+1] - ovector[2*i]; + printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); + } + + +/************************************************************************** +* That concludes the basic part of this demonstration program. We have * +* compiled a pattern, and performed a single match. The code that follows * +* shows first how to access named substrings, and then how to code for * +* repeated matches on the same subject. * +**************************************************************************/ + +/* See if there are any named substrings, and if so, show them by name. First +we have to extract the count of named parentheses from the pattern. */ + +(void)pcre2_pattern_info( + re, /* the compiled pattern */ + PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ + &namecount); /* where to put the answer */ + +if (namecount <= 0) printf("No named substrings\en"); else + { + PCRE2_SPTR tabptr; + printf("Named substrings\en"); + + /* Before we can access the substrings, we must extract the table for + translating names to numbers, and the size of each entry in the table. */ + + (void)pcre2_pattern_info( + re, /* the compiled pattern */ + PCRE2_INFO_NAMETABLE, /* address of the table */ + &name_table); /* where to put the answer */ + + (void)pcre2_pattern_info( + re, /* the compiled pattern */ + PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ + &name_entry_size); /* where to put the answer */ + + /* Now we can scan the table and, for each entry, print the number, the name, + and the substring itself. In the 8-bit library the number is held in two + bytes, most significant first. 
*/ + + tabptr = name_table; + for (i = 0; i < namecount; i++) + { + int n = (tabptr[0] << 8) | tabptr[1]; + printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2, + (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); + tabptr += name_entry_size; + } + } + + +/************************************************************************* +* If the "-g" option was given on the command line, we want to continue * +* to search for additional matches in the subject string, in a similar * +* way to the /g option in Perl. This turns out to be trickier than you * +* might think because of the possibility of matching an empty string. * +* What happens is as follows: * +* * +* If the previous match was NOT for an empty string, we can just start * +* the next match at the end of the previous one. * +* * +* If the previous match WAS for an empty string, we can't do that, as it * +* would lead to an infinite loop. Instead, a call of pcre2_match() is * +* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The * +* first of these tells PCRE2 that an empty string at the start of the * +* subject is not a valid match; other possibilities must be tried. The * +* second flag restricts PCRE2 to one match attempt at the initial string * +* position. If this match succeeds, an alternative to the empty string * +* match has been found, and we can print it and proceed round the loop, * +* advancing by the length of whatever was found. If this match does not * +* succeed, we still stay in the loop, advancing by just one character. * +* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be * +* more than one byte. * +* * +* However, there is a complication concerned with newlines. When the * +* newline convention is such that CRLF is a valid newline, we must * +* advance by two characters rather than one. The newline convention can * +* be set in the regex by (*CR), etc.; if not, we must find the default. 
* +*************************************************************************/ + +if (!find_all) /* Check for -g */ + { + pcre2_match_data_free(match_data); /* Release the memory that was used */ + pcre2_code_free(re); /* for the match data and the pattern. */ + return 0; /* Exit the program. */ + } + +/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline +sequence. First, find the options with which the regex was compiled and extract +the UTF state. */ + +(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits); +utf8 = (option_bits & PCRE2_UTF) != 0; + +/* Now find the newline convention and see whether CRLF is a valid newline +sequence. */ + +(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline); +crlf_is_newline = newline == PCRE2_NEWLINE_ANY || + newline == PCRE2_NEWLINE_CRLF || + newline == PCRE2_NEWLINE_ANYCRLF; + +/* Loop for second and subsequent matches */ + +for (;;) + { + uint32_t options = 0; /* Normally no options */ + PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ + + /* If the previous match was for an empty string, we are finished if we are + at the end of the subject. Otherwise, arrange to run another match at the + same point to see if a non-empty match can be found. */ + + if (ovector[0] == ovector[1]) + { + if (ovector[0] == subject_length) break; + options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } + + /* Run the next matching operation */ + + rc = pcre2_match( + re, /* the compiled pattern */ + subject, /* the subject string */ + subject_length, /* the length of the subject */ + start_offset, /* starting offset in the subject */ + options, /* options */ + match_data, /* block for storing the result */ + NULL); /* use default match context */ + + /* This time, a result of NOMATCH isn't an error. If the value in "options" + is zero, it just means we have found all possible matches, so the loop ends. 
+ Otherwise, it means we have failed to find a non-empty-string match at a + point where there was a previous empty-string match. In this case, we do what + Perl does: advance the matching position by one character, and continue. We + do this by setting the "end of previous match" offset, because that is picked + up at the top of the loop as the point at which to start again. + + There are two complications: (a) When CRLF is a valid newline sequence, and + the current position is just before it, advance by an extra byte. (b) + Otherwise we must ensure that we skip an entire UTF character if we are in + UTF mode. */ + + if (rc == PCRE2_ERROR_NOMATCH) + { + if (options == 0) break; /* All matches found */ + ovector[1] = start_offset + 1; /* Advance one code unit */ + if (crlf_is_newline && /* If CRLF is newline & */ + start_offset < subject_length - 1 && /* we are at CRLF, */ + subject[start_offset] == '\er' && + subject[start_offset + 1] == '\en') + ovector[1] += 1; /* Advance by one more. */ + else if (utf8) /* Otherwise, ensure we */ + { /* advance a whole UTF-8 */ + while (ovector[1] < subject_length) /* character. */ + { + if ((subject[ovector[1]] & 0xc0) != 0x80) break; + ovector[1] += 1; + } + } + continue; /* Go round the loop again */ + } + + /* Other matching errors are not recoverable. */ + + if (rc < 0) + { + printf("Matching error %d\en", rc); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + + /* Match succeeded */ + + printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]); + + /* The match succeeded, but the output vector wasn't big enough. This + should not happen. */ + + if (rc == 0) + printf("ovector was not big enough for all the captured substrings\en"); + + /* As before, show substrings stored in the output vector by number, and then + also any named substrings. 
*/ + + for (i = 0; i < rc; i++) + { + PCRE2_SPTR substring_start = subject + ovector[2*i]; + size_t substring_length = ovector[2*i+1] - ovector[2*i]; + printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); + } + + if (namecount <= 0) printf("No named substrings\en"); else + { + PCRE2_SPTR tabptr = name_table; + printf("Named substrings\en"); + for (i = 0; i < namecount; i++) + { + int n = (tabptr[0] << 8) | tabptr[1]; + printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2, + (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); + tabptr += name_entry_size; + } + } + } /* End of loop to find second and subsequent matches */ + +printf("\en"); +pcre2_match_data_free(match_data); +pcre2_code_free(re); +return 0; +} + +/* End of pcre2demo.c */ +.EE diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 1da6dfa..71be47d 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -154,7 +154,7 @@ Do not output the version number of \fBpcre2test\fP at the start of execution. \fB-S\fP \fIsize\fP On Unix-like systems, set the size of the run-time stack to \fIsize\fP megabytes. -.TP10 +.TP 10 \fB-subject\fP \fImodifier-list\fP Behave as if each subject line contains the given modifiers. .TP 10 @@ -366,7 +366,7 @@ include a closing square bracket in the characters, code it as \ex5D. A backslash followed by an equals sign marks the end of the subject string and the start of a modifier list. For example: .sp - abc\=notbol,notempty + abc\e=notbol,notempty .sp A backslash followed by any other non-alphanumeric character just escapes that character. A backslash followed by anything else causes an error. However, if @@ -746,7 +746,7 @@ the actual match are indicated in the output by '<' or '>' characters underneath them. 
Here is an example: .sp /(?<=pqr)abc(?=xyz)/ - 123pqrabcxyz456\=allusedtext + 123pqrabcxyz456\e=allusedtext 0: pqrabcxyz <<< >>> .sp @@ -789,7 +789,7 @@ The \fBcopy\fP and \fBget\fP modifiers can be used to test the They can be given more than once, and each can specify a group name or number, for example: .sp - abcd\=copy=1,copy=3,get=G1 + abcd\e=copy=1,copy=3,get=G1 .sp If the \fB#subject\fP command is used to set default copy and get lists, these can be unset by specifying a negative number for numbered groups and an empty diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt new file mode 100644 index 0000000..34c1a14 --- /dev/null +++ b/doc/pcre2test.txt @@ -0,0 +1,1073 @@ +PCRE2TEST(1) General Commands Manual PCRE2TEST(1) + + + +NAME + pcre2test - a program for testing Perl-compatible regular expressions. + +SYNOPSIS + + pcre2test [options] [input file [output file]] + + pcre2test is a test program for the PCRE2 regular expression libraries, + but it can also be used for experimenting with regular expressions. + This document describes the features of the test program; for details + of the regular expressions themselves, see the pcre2pattern documenta- + tion. For details of the PCRE2 library function calls and their + options, see the pcre2api documentation. + + The input for pcre2test is a sequence of regular expression patterns + and subject strings to be matched. The output shows the result of each + match attempt. Modifiers on the command line, the patterns, and the + subject lines specify PCRE2 function options, control how the subject + is processed, and what output is produced. + + As the original fairly simple PCRE library evolved, it acquired many + different features, and as a result, the original pcretest program + ended up with a lot of options in a messy, arcane syntax, for testing + all the features. The move to the new PCRE2 API provided an opportunity + to re-implement the test program as pcre2test, with a cleaner modifier + syntax. 
Nevertheless, there are still many obscure modifiers, some of + which are specifically designed for use in conjunction with the test + script and data files that are distributed as part of PCRE2. All the + modifiers are documented here, some without much justification, but + many of them are unlikely to be of use except when testing the + libraries. + + +PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES + + Different versions of the PCRE2 library can be built to support charac- + ter strings that are encoded in 8-bit, 16-bit, or 32-bit code units. + One, two, or all three of these libraries may be simultaneously + installed. The pcre2test program can be used to test all the libraries. + However, its own input and output are always in 8-bit format. When + testing the 16-bit or 32-bit libraries, patterns and subject strings + are converted to 16- or 32-bit format before being passed to the + library functions. Results are converted back to 8-bit code units for + output. + + In the rest of this document, the names of library functions and struc- + tures are given in generic form, for example, pcre2_compile(). The + actual names used in the libraries have a suffix _8, _16, or _32, as + appropriate. + + +INPUT ENCODING + + Input to pcre2test is processed line by line, either by calling the C + library's fgets() function, or via the libreadline library (see below). + In Unix-like environments, fgets() treats any bytes other than newline + as data characters. However, in some Windows environments character 26 + (hex 1A) causes an immediate end of file, and no further data is read. + For maximum portability, therefore, it is safest to avoid non-printing + characters in pcre2test input files. + + +COMMAND LINE OPTIONS + + -8 If the 8-bit library has been built, this option causes it to + be used (this is the default). If the 8-bit library has not + been built, this option causes an error. + + -16 If the 16-bit library has been built, this option causes it + to be used. 
If only the 16-bit library has been built, this + is the default. If the 16-bit library has not been built, + this option causes an error. + + -32 If the 32-bit library has been built, this option causes it + to be used. If only the 32-bit library has been built, this + is the default. If the 32-bit library has not been built, + this option causes an error. + + -b Behave as if each pattern has the /fullbincode modifier; the + full internal binary form of the pattern is output after com- + pilation. + + -C Output the version number of the PCRE2 library, and all + available information about the optional features that are + included, and then exit with zero exit code. All other + options are ignored. + + -C option Output information about a specific build-time option, then + exit. This functionality is intended for use in scripts such + as RunTest. The following options output the value and set + the exit code as indicated: + + ebcdic-nl the code for LF (= NL) in an EBCDIC environment: + 0x15 or 0x25 + 0 if used in an ASCII environment + exit code is always 0 + linksize the configured internal link size (2, 3, or 4) + exit code is set to the link size + newline the default newline setting: + CR, LF, CRLF, ANYCRLF, or ANY + exit code is always 0 + bsr the default setting for what \R matches: + ANYCRLF or ANY + exit code is always 0 + + The following options output 1 for true or 0 for false, and + set the exit code to the same value: + + ebcdic compiled for an EBCDIC environment + jit just-in-time support is available + pcre16 the 16-bit library was built + pcre32 the 32-bit library was built + pcre8 the 8-bit library was built + unicode Unicode support is available + + If an unknown option is given, an error message is output; + the exit code is 0. + + -d Behave as if each pattern has the debug modifier; the inter- + nal form and information about the compiled pattern is output + after compilation; -d is equivalent to -b -i. 
+ + -dfa Behave as if each subject line has the dfa modifier; matching + is done using the pcre2_dfa_match() function instead of the + default pcre2_match(). + + -help Output a brief summary of these options and then exit. + + -i Behave as if each pattern has the /info modifier; information + about the compiled pattern is given after compilation. + + -jit Behave as if each pattern line has the jit modifier; after + successful compilation, each pattern is passed to the just- + in-time compiler, if available. + + -pattern modifier-list + Behave as if each pattern line contains the given modifiers. + + -q Do not output the version number of pcre2test at the start of + execution. + + -S size On Unix-like systems, set the size of the run-time stack to + size megabytes. + + -subject modifier-list + Behave as if each subject line contains the given modifiers. + + -t Run each compile and match many times with a timer, and out- + put the resulting times per compile or match. You can control + the number of iterations that are used for timing by follow- + ing -t with a number (as a separate item on the command + line). For example, "-t 1000" iterates 1000 times. The + default is to iterate 500,000 times. + + -tm This is like -t except that it times only the matching phase, + not the compile phase. + + -T -TM These behave like -t and -tm, but in addition, at the end of + a run, the total times for all compiles and matches are out- + put. + + -version Output the PCRE2 version number and then exit. + + +DESCRIPTION + + If pcre2test is given two filename arguments, it reads from the first + and writes to the second. If it is given only one filename argument, it + reads from that file and writes to stdout. Otherwise, it reads from + stdin and writes to stdout, and prompts for each line of input, using + "re>" to prompt for regular expression patterns, and "data>" to prompt + for subject lines. 
+ + When pcre2test is built, a configuration option can specify that it + should be linked with the libreadline or libedit library. When this is + done, if the input is from a terminal, it is read using the readline() + function. This provides line-editing and history facilities. The output + from the -help option states whether or not readline() will be used. + + The program handles any number of tests, each of which consists of a + set of input lines. Each set starts with a regular expression pattern, + followed by any number of subject lines to be matched against that pat- + tern. In between sets of test data, command lines that begin with a + hash (#) character may appear. This file format, with some restric- + tions, can also be processed by the perltest.pl script that is distrib- + uted with PCRE2 as a means of checking that the behaviour of PCRE2 and + Perl is the same. + + Each subject line is matched separately and independently. If you want + to do multi-line matches, you have to use the \n escape sequence (or \r + or \r\n, etc., depending on the newline setting) in a single line of + input to encode the newline sequences. There is no limit on the length + of subject lines; the input buffer is automatically extended if it is + too small. There is a replication feature that makes it possible to + generate long subject lines without having to supply them explicitly. + + An empty line or the end of the file signals the end of the subject + lines for a test, at which point a new pattern or command line is + expected if there is still input to be read. + + +COMMAND LINES + + In between sets of test data, a line that begins with a hash (#) char- + acter is interpreted as a command line. If the first character is fol- + lowed by white space or an exclamation mark, the line is treated as a + comment, and ignored. 
Otherwise, the following commands are recog- + nized: + + #forbid_utf + + Subsequent patterns automatically have the PCRE2_NEVER_UTF and + PCRE2_NEVER_UCP options set, which locks out the use of UTF and Unicode + property features. This is a trigger guard that is used in test files + to ensure that UTF/Unicode tests are not accidentally added to files + that are used when UTF support is not included in the library. This + effect can also be obtained by the use of #pattern; the difference is + that #forbid_utf cannot be unset, and the automatic options are not + displayed in pattern information, to avoid cluttering up test output. + + #pattern <modifier-list> + + This command sets a default modifier list that applies to all subse- + quent patterns. Modifiers on a pattern can change these settings. + + #perltest + + The appearance of this line causes all subsequent modifier settings to + be checked for compatibility with the perltest.pl script, which is used + to confirm that Perl gives the same results as PCRE2. Also, apart from + comment lines, none of the other command lines are permitted, because + they and many of the modifiers are specific to pcre2test, and should + not be used in test files that are also processed by perltest.pl. The + #perltest command helps detect tests that are accidentally put in the + wrong file. + + #subject <modifier-list> + + This command sets a default modifier list that applies to all subse- + quent subject lines. Modifiers on a subject line can change these set- + tings. + + +MODIFIER SYNTAX + + Modifier lists are used with both pattern and subject lines. Items in a + list are separated by commas and optional white space. Some modifiers + may be given for both patterns and subject lines, whereas others are + valid for one or the other only. Each modifier has a long name, for + example "anchored", and some of them must be followed by an equals sign + and a value, for example, "offset=12". 
Modifiers that do not take val- + ues may be preceded by a minus sign to turn off a previous default set- + ting. + + A few of the more common modifiers can also be specified as single let- + ters, for example "i" for "caseless". In documentation, following the + Perl convention, these are written with a slash ("the /i modifier") for + clarity. Abbreviated modifiers must all be concatenated in the first + item of a modifier list. If the first item is not recognized as a long + modifier name, it is interpreted as a sequence of these abbreviations. + For example: + + /abc/ig,newline=cr,jit=3 + + This is a pattern line whose modifier list starts with two one-letter + modifiers (/i and /g). The lower-case abbreviated modifiers are the + same as used in Perl. + + +PATTERN SYNTAX + + A pattern line must start with one of the following characters (common + symbols, excluding pattern meta-characters): + + / ! " ' ` - = _ : ; , % & @ ~ + + This is interpreted as the pattern's delimiter. A regular expression + may be continued over several input lines, in which case the newline + characters are included within it. It is possible to include the delim- + iter within the pattern by escaping it with a backslash, for example + + /abc\/def/ + + If you do this, the escape and the delimiter form part of the pattern, + but since the delimiters are all non-alphanumeric, this does not affect + its interpretation. If the terminating delimiter is immediately fol- + lowed by a backslash, for example, + + /abc/\ + + then a backslash is added to the end of the pattern. This is done to + provide a way of testing the error condition that arises if a pattern + finishes with a backslash, because + + /abc\/ + + is interpreted as the first line of a pattern that starts with "abc/", + causing pcre2test to read the next line as a continuation of the regu- + lar expression. + + A pattern can be followed by a modifier list (details below). 
+ + +SUBJECT LINE SYNTAX + + Before each subject line is passed to pcre2_match() or + pcre2_dfa_match(), leading and trailing white space is removed, and the + line is scanned for backslash escapes. The following provide a means of + encoding non-printing characters in a visible way: + + \a alarm (BEL, \x07) + \b backspace (\x08) + \e escape (\x1b) + \f form feed (\x0c) + \n newline (\x0a) + \r carriage return (\x0d) + \t tab (\x09) + \v vertical tab (\x0b) + \nnn octal character (up to 3 octal digits); always + a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode + \o{dd...} octal character (any number of octal digits) + \xhh hexadecimal byte (up to 2 hex digits) + \x{hh...} hexadecimal character (any number of hex digits) + + The use of \x{hh...} is not dependent on the use of the utf modifier on + the pattern. It is recognized always. There may be any number of hexa- + decimal digits inside the braces; invalid values provoke error mes- + sages. + + Note that \xhh specifies one byte rather than one character in UTF-8 + mode; this makes it possible to construct invalid UTF-8 sequences for + testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8 + character in UTF-8 mode, generating more than one byte if the value is + greater than 127. When testing the 8-bit library not in UTF-8 mode, + \x{hh} generates one byte for values less than 256, and causes an error + for greater values. + + In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it + possible to construct invalid UTF-16 sequences for testing purposes. + + In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This + makes it possible to construct invalid UTF-32 sequences for testing + purposes. + + There is a special backslash sequence that specifies replication of one + or more characters: + + \[<characters>]{<count>} + + This makes it possible to test long strings without having to provide + them as part of the file. 
For example: + + \[abc]{4} + + is converted to "abcabcabcabc". This feature does not support nesting. + To include a closing square bracket in the characters, code it as \x5D. + + A backslash followed by an equals sign marks the end of the subject + string and the start of a modifier list. For example: + + abc\=notbol,notempty + + A backslash followed by any other non-alphanumeric character just + escapes that character. A backslash followed by anything else causes an + error. However, if the very last character in the line is a backslash + (and there is no modifier list), it is ignored. This gives a way of + passing an empty line as data, since a real empty line terminates the + data input. + + +PATTERN MODIFIERS + + There are three types of modifier that can appear in pattern lines, two + of which may also be used in a #pattern command. A pattern's modifier + list can add to or override default modifiers that were set by a previ- + ous #pattern command. + + Setting compilation options + + The following modifiers set options for pcre2_compile(). The most com- + mon ones have single-letter abbreviations. See pcre2api for a descrip- + tion of their effects. 
+ + allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS + alt_bsux set PCRE2_ALT_BSUX + anchored set PCRE2_ANCHORED + auto_callout set PCRE2_AUTO_CALLOUT + /i caseless set PCRE2_CASELESS + dollar_endonly set PCRE2_DOLLAR_ENDONLY + /s dotall set PCRE2_DOTALL + dupnames set PCRE2_DUPNAMES + /x extended set PCRE2_EXTENDED + firstline set PCRE2_FIRSTLINE + match_unset_backref set PCRE2_MATCH_UNSET_BACKREF + /m multiline set PCRE2_MULTILINE + never_ucp set PCRE2_NEVER_UCP + never_utf set PCRE2_NEVER_UTF + no_auto_capture set PCRE2_NO_AUTO_CAPTURE + no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_start_optimize set PCRE2_NO_START_OPTIMIZE + no_utf_check set PCRE2_NO_UTF_CHECK + ucp set PCRE2_UCP + ungreedy set PCRE2_UNGREEDY + utf set PCRE2_UTF + + As well as turning on the PCRE2_UTF option, the utf modifier causes all + non-printing characters in output strings to be printed using the + \x{hh...} notation. Otherwise, those less than 0x100 are output in hex + without the curly brackets. + + Setting compilation controls + + The following modifiers affect the compilation process or request + information about the pattern: + + bsr=[anycrlf|unicode] specify \R handling + /B bincode show binary code without lengths + debug same as info,fullbincode + fullbincode show binary code with lengths + /I info show info about compiled pattern + hex pattern is coded in hexadecimal + jit[=<number>] use JIT + locale=<name> use this locale + memory show memory used + newline=<type> set newline type + parens_nest_limit=<n> set maximum parentheses depth + perlcompat lock out non-Perl modifiers + posix use the POSIX API + stackguard=<number> test the stackguard feature + tables=[0|1|2] select internal tables + use_length use the pattern's length + + The effects of these modifiers are described in the following sections. + FIXME: Give more examples. + + Newline and \R handling + + The bsr modifier specifies what \R in a pattern should match. 
If it is + set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to + "unicode", \R matches any Unicode newline sequence. The default is + specified when PCRE2 is built, with the default default being Unicode. + + The newline modifier specifies which characters are to be interpreted + as newlines, both in the pattern and (by default) in subject lines. The + type must be one of CR, LF, CRLF, ANYCRLF, or ANY. + + Both the \R and newline settings can be changed at match time, but if + this is done, JIT matching is disabled. + + Information about a pattern + + The debug modifier is a shorthand for info,fullbincode, requesting all + available information. + + The bincode modifier causes a representation of the compiled code to be + output after compilation. This information does not contain length and + offset values, which ensures that the same output is generated for dif- + ferent internal link sizes and different code unit widths. By using + bincode, the same regression tests can be used in different environ- + ments. + + The fullbincode modifier, by contrast, does include length and offset + values. This is used in a few special tests and is also useful for one- + off tests. + + The info modifier requests information about the compiled pattern + (whether it is anchored, has a fixed first character, and so on). The + information is obtained from the pcre2_pattern_info() function. + + Specifying a pattern in hex + + The hex modifier specifies that the characters of the pattern are to be + interpreted as pairs of hexadecimal digits. White space is permitted + between pairs. For example: + + /ab 32 59/hex + + This feature is provided as a way of creating patterns that contain + binary zero characters. When hex is set, it implies use_length. + + Using the pattern's length + + By default, pcre2test passes patterns as zero-terminated strings to + pcre2_compile(), giving the length as -1. If use_length is set, the + length of the pattern is passed. 
This is implied if hex is set. + + JIT compilation + + The /jit modifier may optionally be followed by a number in the range 0 + to 7: + + 0 disable JIT + 1 normal match only + 2 soft partial match only + 3 normal match and soft partial match + 4 hard partial match only + 6 soft and hard partial match + 7 all three modes + + If no number is given, 7 is assumed. If JIT compilation is successful, + the compiled JIT code will automatically be used when pcre2_match() is + run, except when incompatible run-time options are specified. For more + details, see the pcre2jit documentation. See also the jitstack modifier + below for a way of setting the size of the JIT stack. + + If the jitverify modifier is specified, the text "(JIT)" is added to + the first output line after a match or non match when JIT-compiled code + was actually used. This modifier can also be set on a subject line. + + Setting a locale + + The /locale modifier must specify the name of a locale, for example: + + /pattern/locale=fr_FR + + The given locale is set, pcre2_maketables() is called to build a set of + character tables for the locale, and this is then passed to pcre2_com- + pile() when compiling the regular expression. The same tables are used + when matching the following subject lines. The /locale modifier applies + only to the pattern on which it appears, but can be given in a #pattern + command if a default is needed. Setting a locale and alternate charac- + ter tables are mutually exclusive. + + Showing pattern memory + + The /memory modifier causes the size in bytes of the memory block used + to hold the compiled pattern to be output. This does not include the + size of the pcre2_code block; it is just the actual compiled data. If + the pattern is subsequently passed to the JIT compiler, the size of the + JIT compiled code is also output. + + Limiting nested parentheses + + The parens_nest_limit modifier sets a limit on the depth of nested + parentheses in a pattern. 
Breaching the limit causes a compilation + error. + + Using the POSIX wrapper API + + The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap- + per API rather than its native API. This supports only the 8-bit + library. When the POSIX API is being used, the following pattern modi- + fiers set options for the regcomp() function: + + caseless REG_ICASE + multiline REG_NEWLINE + no_auto_capture REG_NOSUB + dotall REG_DOTALL ) + ungreedy REG_UNGREEDY ) These options are not part of + ucp REG_UCP ) the POSIX standard + utf REG_UTF8 ) + + The aftertext and allaftertext subject modifiers work as described + below. All other modifiers cause an error. + + Testing the stack guard feature + + The /stackguard modifier is used to test the use of pcre2_set_com- + pile_recursion_guard(), a function that is provided to enable stack + availability to be checked during compilation (see the pcre2api docu- + mentation for details). If the number specified by the modifier is + greater than zero, pcre2_set_compile_recursion_guard() is called to set + up callback from pcre2_compile() to a local function. The argument it + is passed is the current nesting parenthesis depth; if this is greater + than the value given by the modifier, non-zero is returned, causing the + compilation to be aborted. + + Using alternative character tables + + The /tables modifier must be followed by a single digit. It causes a + specific set of built-in character tables to be passed to pcre2_com- + pile(). This is used in the PCRE2 tests to check behaviour with differ- + ent character tables. The digit specifies the tables as follows: + + 0 do not pass any special character tables + 1 the default ASCII tables, as distributed in + pcre2_chartables.c.dist + 2 a set of tables defining ISO 8859 characters + + In table 2, some characters whose codes are greater than 128 are iden- + tified as letters, digits, spaces, etc. Setting alternate character + tables and a locale are mutually exclusive. 
+ + Setting certain match controls + + The following modifiers are really subject modifiers, and are described + below. However, they may be included in a pattern's modifier list, in + which case they are applied to every subject line that is processed + with that pattern. They do not affect the compilation process. + + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + /g global global matching + jitverify verify JIT usage + mark show mark values + + These modifiers may not appear in a #pattern command. If you want them + as defaults, set them in a #subject command. + + +SUBJECT MODIFIERS + + The modifiers that can appear in subject lines and the #subject command + are of two types. + + Setting match options + + The following modifiers set options for pcre2_match() or + pcre2_dfa_match(). See pcre2api for a description of their effects. + + anchored set PCRE2_ANCHORED + dfa_restart set PCRE2_DFA_RESTART + dfa_shortest set PCRE2_DFA_SHORTEST + no_start_optimize set PCRE2_NO_START_OPTIMIZE + no_utf_check set PCRE2_NO_UTF_CHECK + notbol set PCRE2_NOTBOL + notempty set PCRE2_NOTEMPTY + notempty_atstart set PCRE2_NOTEMPTY_ATSTART + noteol set PCRE2_NOTEOL + partial_hard (or ph) set PCRE2_PARTIAL_HARD + partial_soft (or ps) set PCRE2_PARTIAL_SOFT + + The partial matching modifiers are provided with abbreviations because + they appear frequently in tests. + + If the /posix modifier was present on the pattern, causing the POSIX + wrapper API to be used, the only option-setting modifiers that have any + effect are notbol, notempty, and noteol, causing REG_NOTBOL, + REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). + Any other modifiers cause an error. + + Setting match controls + + The following modifiers affect the matching process or request addi- + tional information. 
Some of them may also be specified on a pattern + line (see above), in which case they apply to every subject line that + is matched against that pattern. + + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + altglobal alternative global matching + bsr=[anycrlf|unicode] specify \R handling + callout_capture show captures at callout time + callout_data=<n> set a value to pass via callouts + callout_fail=<n>[:<m>] control callout failure + callout_none do not supply a callout function + copy=<number or name> copy captured substring + dfa use pcre2_dfa_match() + find_limits find match and recursion limits + get=<number or name> extract captured substring + getall extract all captured substrings + /g global global matching + jitstack=<n> set size of JIT stack + jitverify verify JIT usage + mark show mark values + match_limit=<n> set a match limit + memory show memory usage + newline=<type> set newline type + offset=<n> set starting offset + ovector=<n> set size of output vector + recursion_limit=<n> set a recursion limit + + The effects of these modifiers are described in the following sections. + FIXME: Give more examples. + + Newline and \R handling + + These modifiers set the newline and \R processing conventions for the + subject line, overriding any values that were set at compile time (as + described above). JIT matching is disabled if these settings are + changed at match time. + + Showing more text + + The aftertext modifier requests that as well as outputting the sub- + string that matched the entire pattern, pcre2test should in addition + output the remainder of the subject string. This is useful for tests + where the subject contains multiple copies of the same substring. The + allaftertext modifier requests the same action for captured substrings + as well as the main matched substring. 
In each case the remainder is + output on the following line with a plus character following the cap- + ture number. + + The allusedtext modifier requests that all the text that was consulted + during a successful pattern match be shown. This affects the output if + there is a lookbehind at the start of a match, or a lookahead at the + end, or if \K is used in the pattern. Characters that precede or follow + the start and end of the actual match are indicated in the output by + '<' or '>' characters underneath them. Here is an example: + + /(?<=pqr)abc(?=xyz)/ + 123pqrabcxyz456\=allusedtext + 0: pqrabcxyz + <<< >>> + + This shows that the matched string is "abc", with the preceding and + following strings "pqr" and "xyz" also consulted during the match. + + Showing the value of all capture groups + + The allcaptures modifier requests that the values of all potential cap- + tured parentheses be output after a match. By default, only those up to + the highest one actually used in the match are output (corresponding to + the return code from pcre2_match()). Groups that did not take part in + the match are output as "<unset>". + + Testing callouts + + A callout function is supplied when pcre2test calls the library match- + ing functions, unless callout_none is specified. If callout_capture is + set, the current captured groups are output when a callout occurs. + + The callout_fail modifier can be given one or two numbers. If there is + only one number, 1 is returned instead of 0 when a callout of that num- + ber is reached. If two numbers are given, 1 is returned when callout + <n> is reached for the <m>th time. + + The callout_data modifier can be given an unsigned or a negative num- + ber. Any value other than zero is used as a return from pcre2test's + callout function. + + Testing substring extraction functions + + The copy and get modifiers can be used to test the pcre2_sub- + string_copy_xxx() and pcre2_substring_get_xxx() functions. 
They can be + given more than once, and each can specify a group name or number, for + example: + + abcd\=copy=1,copy=3,get=G1 + + If the #subject command is used to set default copy and get lists, + these can be unset by specifying a negative number for numbered groups + and an empty name for named groups. + + The getall modifier tests pcre2_substring_list_get(), which extracts + all captured substrings. + + If the subject line is successfully matched, the substrings extracted + by the convenience functions are output with C, G, or L after the + string number instead of a colon. This is in addition to the normal + full list. The string length (that is, the return from the extraction + function) is given in parentheses after each substring. + + Finding all matches in a string + + Searching for all possible matches within a subject can be requested by + the global or /altglobal modifier. After finding a match, the matching + function is called again to search the remainder of the subject. The + difference between global and altglobal is that the former uses the + start_offset argument to pcre2_match() or pcre2_dfa_match() to start + searching at a new point within the entire string (which is what Perl + does), whereas the latter passes over a shortened substring. This makes + a difference to the matching process if the pattern begins with a look- + behind assertion (including \b or \B). + + If an empty string is matched, the next match is done with the + PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search + for another, non-empty, match at the same point in the subject. If this + match fails, the start offset is advanced, and the normal match is + retried. This imitates the way Perl handles such cases when using the + /g modifier or the split() function. 
Normally, the start offset is + advanced by one character, but if the newline convention recognizes + CRLF as a newline, and the current character is CR followed by LF, an + advance of two is used. + + Setting the JIT stack size + + The jitstack modifier provides a way of setting the maximum stack size + that is used by the just-in-time optimization code. It is ignored if + JIT optimization is not being used. Providing a stack that is larger + than the default 32K is necessary only for very complicated patterns. + + Setting match and recursion limits + + The match_limit and recursion_limit modifiers set the appropriate lim- + its in the match context. These values are ignored when the find_limits + modifier is specified. + + Finding minimum limits + + If the find_limits modifier is present, pcre2test calls pcre2_match() + several times, setting different values in the match context via + pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds + the minimum values for each parameter that allow pcre2_match() to com- + plete without error. + + The match_limit number is a measure of the amount of backtracking that + takes place, and learning the minimum value can be instructive. For + most simple matches, the number is quite small, but for patterns with + very large numbers of matching possibilities, it can become large very + quickly with increasing length of subject string. The + recursion_limit number is a measure of how much stack (or, if + PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to + complete the match attempt. + + Showing MARK names + + + The mark modifier causes the names from backtracking control verbs that + are returned from calls to pcre2_match() to be displayed. If a mark is + returned for a match, non-match, or partial match, pcre2test shows it. + For a match, it is on a line by itself, tagged with "MK:". Otherwise, + it is added to the non-match message. 
+ + Showing memory usage + + The memory modifier causes pcre2test to log all memory allocation and + freeing calls that occur during a match operation. + + Setting a starting offset + + The offset modifier sets an offset in the subject string at which + matching starts. Its value is a number of code units, not characters. + + Setting the size of the output vector + + The ovector modifier applies only to the subject line in which it + appears, though of course it can also be used to set a default in a + #subject command. It specifies the number of pairs of offsets that are + available for storing matching information. The default is 15. + + +THE ALTERNATIVE MATCHING FUNCTION + + By default, pcre2test uses the standard PCRE2 matching function, + pcre2_match() to match each subject line. PCRE2 also supports an alter- + native matching function, pcre2_dfa_match(), which operates in a dif- + ferent way, and has some restrictions. The differences between the two + functions are described in the pcre2matching documentation. + + If the dfa modifier is set, the alternative matching function is used. + This function finds all possible matches at a given point in the sub- + ject. If, however, the dfa_shortest modifier is set, processing stops + after the first match is found. This is always the shortest possible + match. + + +DEFAULT OUTPUT FROM pcre2test + + This section describes the output when the normal matching function, + pcre2_match(), is being used. + + When a match succeeds, pcre2test outputs the list of captured sub- + strings, starting with number 0 for the string that matched the whole + pattern. Otherwise, it outputs "No match" when the return is + PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially + matching substring when the return is PCRE2_ERROR_PARTIAL. 
(Note that + this is the entire substring that was inspected during the partial + match; it may include characters before the actual match start if a + lookbehind assertion, \K, \b, or \B was involved.) + + For any other return, pcre2test outputs the PCRE2 negative error number + and a short descriptive phrase. If the error is a failed UTF string + check, the offset of the start of the failing character and the reason + code are also output. Here is an example of an interactive pcre2test + run. + + $ pcre2test + PCRE2 version 9.00 2014-05-10 + + re> /^abc(\d+)/ + data> abc123 + 0: abc123 + 1: 123 + data> xyz + No match + + Unset capturing substrings that are not followed by one that is set are + not returned by pcre2_match(), and are not shown by pcre2test. In the + following example, there are two capturing substrings, but when the + first data line is matched, the second, unset substring is not shown. + An "internal" unset substring is shown as "<unset>", as for the second + data line. + + re> /(a)|(b)/ + data> a + 0: a + 1: a + data> b + 0: b + 1: <unset> + 2: b + + If the strings contain any non-printing characters, they are output as + \xhh escapes if the value is less than 256 and UTF mode is not set. + Otherwise they are output as \x{hh...} escapes. See below for the defi- + nition of non-printing characters. If the /aftertext modifier is set, + the output for substring 0 is followed by the rest of the subject + string, identified by "0+" like this: + + re> /cat/aftertext + data> cataract + 0: cat + 0+ aract + + If global matching is requested, the results of successive matching + attempts are output in sequence, like this: + + re> /\Bi(\w\w)/g + data> Mississippi + 0: iss + 1: ss + 0: iss + 1: ss + 0: ipp + 1: pp + + "No match" is output only if the first match attempt fails. 
Here is an + example of a failure message (the offset 4 that is specified by the offset modifier is + past the end of the subject string): + + re> /xyz/ + data> xyz\=offset=4 + Error -24 (bad offset value) + + Note that whereas patterns can be continued over several lines (a plain + ">" prompt is used for continuations), subject lines may not. However + newlines can be included in a subject by means of the \n escape (or \r, + \r\n, etc., depending on the newline sequence setting). + + +OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION + + When the alternative matching function, pcre2_dfa_match(), is used, the + output consists of a list of all the matches that start at the first + point in the subject where there is at least one match. For example: + + re> /(tang|tangerine|tan)/ + data> yellow tangerine\=dfa + 0: tangerine + 1: tang + 2: tan + + (Using the normal matching function on this data finds only "tang".) + The longest matching string is always given first (and numbered zero). + After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", + followed by the partially matching substring. (Note that this is the + entire substring that was inspected during the partial match; it may + include characters before the actual match start if a lookbehind asser- + tion, \K, \b, or \B was involved.) + + If global matching is requested, the search for further matches resumes + at the end of the longest match. For example: + + re> /(tang|tangerine|tan)/g + data> yellow tangerine and tangy sultana\=dfa + 0: tangerine + 1: tang + 2: tan + 0: tang + 1: tan + 0: tan + + The alternative matching function does not support substring capture, + so the modifiers that are concerned with captured substrings are not + relevant. 
+ + +RESTARTING AFTER A PARTIAL MATCH + + When the alternative matching function has given the PCRE2_ERROR_PAR- + TIAL return, indicating that the subject partially matched the pattern, + you can restart the match with additional subject data by means of the + dfa_restart modifier. For example: + + re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ + data> 23ja\=P,dfa + Partial match: 23ja + data> n05\=dfa,dfa_restart + 0: n05 + + For further information about partial matching, see the pcre2partial + documentation. + + +CALLOUTS + + If the pattern contains any callout requests, pcre2test's callout func- + tion is called during matching. This works with both matching func- + tions. By default, the called function displays the callout number, the + start and current positions in the text at the callout time, and the + next pattern item to be tested. For example: + + --->pqrabcdef + 0 ^ ^ \d + + This output indicates that callout number 0 occurred for a match + attempt starting at the fourth character of the subject string, when + the pointer was at the seventh character, and when the next pattern + item was \d. Just one circumflex is output if the start and current + positions are the same. + + Callouts numbered 255 are assumed to be automatic callouts, inserted as + a result of the /auto_callout pattern modifier. In this case, instead + of showing the callout number, the offset in the pattern, preceded by a + plus, is output. For example: + + re> /\d?[A-E]\*/auto_callout + data> E* + --->E* + +0 ^ \d? + +3 ^ [A-E] + +8 ^^ \* + +10 ^ ^ + 0: E* + + If a pattern contains (*MARK) items, an additional line is output when- + ever a change of latest mark is passed to the callout function. 
For + example: + + re> /a(*MARK:X)bc/auto_callout + data> abc + --->abc + +0 ^ a + +1 ^^ (*MARK:X) + +10 ^^ b + Latest Mark: X + +11 ^ ^ c + +12 ^ ^ + 0: abc + + The mark changes between matching "a" and "b", but stays the same for + the rest of the match, so nothing more is output. If, as a result of + backtracking, the mark reverts to being unset, the text "<unset>" is + output. + + The callout function in pcre2test returns zero (carry on matching) by + default, but you can use a callout_fail modifier in a subject line (as + described above) to change this and other parameters of the callout. + + Inserting callouts can be helpful when using pcre2test to check compli- + cated regular expressions. For further information about callouts, see + the pcre2callout documentation. + + +NON-PRINTING CHARACTERS + + When pcre2test is outputting text in the compiled version of a pattern, + bytes other than 32-126 are always treated as non-printing characters + and are therefore shown as hex escapes. + + When pcre2test is outputting text that is a matched part of a subject + string, it behaves in the same way, unless a different locale has been + set for the pattern (using the /locale modifier). In this case, the + isprint() function is used to distinguish printing and non-printing + characters. + + +SEE ALSO + + pcre2(3), pcre16(3), pcre32(3), pcre2api(3), pcre2callout(3), pcre2jit(3), + pcre2matching(3), pcre2partial(3), pcre2pattern(3), pcre2precompile(3). + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge CB2 3QH, England. + + +REVISION + + Last updated: 19 August 2014 + Copyright (c) 1997-2014 University of Cambridge. diff --git a/src/pcre2demo.c b/src/pcre2demo.c index 6153ffa..8e37832 100644 --- a/src/pcre2demo.c +++ b/src/pcre2demo.c @@ -420,4 +420,4 @@ pcre2_code_free(re); return 0; } -/* End of pcredemo.c */ +/* End of pcre2demo.c */ |