summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2014-09-23 11:35:51 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2014-09-23 11:35:51 +0000
commitfd8438eb9b6bec69a456b69a7dece77aadc06a36 (patch)
treeb0f09f3d92934ea3ad0570599c861891cf360362
parentcf3d2f48e3a1281a47cd544cfd2457b8342037f9 (diff)
downloadpcre2-fd8438eb9b6bec69a456b69a7dece77aadc06a36.tar.gz
Documentation scripts
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@79 6239d852-aaf2-0410-a92c-79f79f948069
-rwxr-xr-x132html313
-rwxr-xr-xCheckMan67
-rwxr-xr-xCleanTxt113
-rwxr-xr-xDetrail35
-rwxr-xr-xPrepareRelease265
-rw-r--r--doc/html/README.txt1
-rw-r--r--doc/html/index.html177
-rw-r--r--doc/html/pcre2api.html2659
-rw-r--r--doc/html/pcre2callout.html270
-rw-r--r--doc/html/pcre2demo.html443
-rw-r--r--doc/html/pcre2test.html1199
-rw-r--r--doc/html/pcre2unicode.html270
-rw-r--r--doc/index.html.src177
-rw-r--r--doc/pcre2.txt2903
-rw-r--r--doc/pcre2api.32
-rw-r--r--doc/pcre2demo.3441
-rw-r--r--doc/pcre2test.18
-rw-r--r--doc/pcre2test.txt1073
-rw-r--r--src/pcre2demo.c2
19 files changed, 10412 insertions, 6 deletions
diff --git a/132html b/132html
new file mode 100755
index 0000000..85baab9
--- /dev/null
+++ b/132html
@@ -0,0 +1,313 @@
+#! /usr/bin/perl -w
+
+# Script to turn PCRE2 man pages into HTML
+
+
+# Subroutine to turn the troff font changes and escapes in a line into HTML
+sub do_line {
+my($text) = @_;
+for ($text) {
+  s/</&#60;/g;   # Escape < and > first, before adding any tags of our own
+  s/>/&#62;/g;
+  s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;    # \fI...\fR or \fI...\fP => italic
+  s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;    # \fB...\fR or \fB...\fP => bold
+  s"\\e"\\"g;                        # \e => a literal backslash
+  s/(?<=Copyright )\(c\)/&copy;/g;
+  }
+return $text;
+}
+
+# Subroutine to ensure not in a paragraph: close any literal section and
+# paragraph that are open, then reset all of the paragraph state.
+
+sub end_para {
+my($closing) = $inpre? "</PRE>\n</P>\n" : "</P>\n";
+
+# Nothing is written unless a paragraph is actually open
+
+print TEMP $closing if ($inpara);
+($inpara, $inpre, $wrotetext) = (0, 0, 0);
+}
+
+# Subroutine to start a new paragraph, after closing any that is still open
+
+sub new_para {
+end_para();
+$inpara = 1;
+print TEMP "<P>\n";
+}
+
+
+# Main program
+
+$innf = 0;
+$inpara = 0;
+$inpre = 0;
+$wrotetext = 0;
+$toc = 0;
+$ref = 1;
+
+while ($#ARGV >= 0 && $ARGV[0] =~ /^-/)
+ {
+ $toc = 1 if $ARGV[0] eq "-toc";
+ shift;
+ }
+
+# Initial output to STDOUT
+
+print <<End ;
+<html>
+<head>
+<title>$ARGV[0] specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>$ARGV[0] man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+End
+
+print "<ul>\n" if ($toc);
+
+open(TEMP, ">/tmp/$$") || die "Can't open /tmp/$$ for output\n";
+
+while (<STDIN>)
+ {
+ # Handle lines beginning with a dot
+
+ if (/^\./)
+ {
+ # Some of the PCRE2 man pages used to contain instances of .br. However,
+ # they should have all been removed because they cause trouble in some
+ # (other) automated systems that translate man pages to HTML. Complain if
+ # we find .br or .in (another macro that is deprecated).
+
+ if (/^\.br/ || /^\.in/)
+ {
+ print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
+ print STDERR "*** $_\n";
+ die "*** Processing abandoned\n";
+ }
+
+ # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
+
+ elsif (/^\.nf/)
+ {
+ $innf = 1;
+ }
+
+ elsif (/^\.fi/)
+ {
+ $innf = 0;
+ }
+
+ # Handling .sp is subtle. If it is inside a literal section, do nothing if
+ # the next line is a non literal text line; similarly, if not inside a
+ # literal section, do nothing if a literal follows, unless we are inside
+ # a .nf/.fi section. The point being that the <pre> and </pre> that delimit
+ # literal sections will do the spacing. Always skip if no previous output.
+
+ elsif (/^\.sp/)
+ {
+ if ($wrotetext)
+ {
+ $_ = <STDIN>;
+ if ($inpre)
+ {
+ print TEMP "\n" if (/^[\s.]/);
+ }
+ else
+ {
+ print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
+ }
+ redo; # Now process the lookahead line we just read
+ }
+ }
+ elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
+ {
+ &new_para();
+ }
+ elsif (/^\.SH\s*("?)(.*)\1/)
+ {
+ # Ignore the NAME section, together with the line that follows it
+ if ($2 =~ /^NAME\b/)
+ {
+ <STDIN>;
+ next;
+ }
+ # The title is passed via %s, never in the format: it may contain a "%"
+ &end_para();
+ my($title) = &do_line($2);
+ if ($toc)
+ {
+ printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
+ $ref, $ref, $title);
+ printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
+ $ref, $title);
+ $ref++;
+ }
+ else
+ {
+ print TEMP "<br><b>\n$title\n</b><br>\n";
+ }
+ }
+ elsif (/^\.SS\s*("?)(.*)\1/)
+ {
+ &end_para();
+ my($title) = &do_line($2);
+ print TEMP "<br><b>\n$title\n</b><br>\n";
+ }
+ elsif (/^\.B\s*(.*)/)
+ {
+ &new_para() if (!$inpara);
+ $_ = &do_line($1);
+ s/"(.*?)"/$1/g;
+ print TEMP "<b>$_</b>\n";
+ $wrotetext = 1;
+ }
+ elsif (/^\.I\s*(.*)/)
+ {
+ &new_para() if (!$inpara);
+ $_ = &do_line($1);
+ s/"(.*?)"/$1/g;
+ print TEMP "<i>$_</i>\n";
+ $wrotetext = 1;
+ }
+
+ # A comment that starts "HREF" takes the next line as a name that
+ # is turned into a hyperlink, using the text given, which might be
+ # in a special font. If it ends in () or (digits) or punctuation, they
+ # aren't part of the link.
+
+ elsif (/^\.\\"\s*HREF/)
+ {
+ $_=<STDIN>;
+ chomp;
+ $_ = &do_line($_);
+ $_ =~ s/\s+$//;
+ $_ =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/;
+ print TEMP "<a href=\"$1.html\">$_</a>\n";
+ }
+
+ # A comment that starts "HTML" inserts literal HTML
+
+ elsif (/^\.\\"\s*HTML\s*(.*)/)
+ {
+ print TEMP $1;
+ }
+
+ # A comment that starts < inserts that HTML at the end of the
+ # *next* input line - so as not to get a newline between them.
+
+ elsif (/^\.\\"\s*(<.*>)/)
+ {
+ my($markup) = $1;
+ $_=<STDIN>;
+ chomp;
+ $_ = &do_line($_);
+ $_ =~ s/\s+$//;
+ print TEMP "$_$markup\n";
+ }
+
+ # A comment that starts JOIN joins the next two lines together, with one
+ # space between them. Then that line is processed. This is used in some
+ # displays where two lines are needed for the "man" version. JOINSH works
+ # the same, except that it assumes this is a shell command, so removes
+ # continuation backslashes.
+
+ elsif (/^\.\\"\s*JOIN(SH)?/)
+ {
+ my($one,$two);
+ $one = <STDIN>;
+ $two = <STDIN>;
+ $one =~ s/\s*\\e\s*$// if (defined($1));
+ chomp($one);
+ $two =~ s/^\s+//;
+ $_ = "$one $two";
+ redo; # Process the joined lines
+ }
+
+ # .EX/.EE bracket the whole program in the pcre2demo page. Copy it inside
+ # <PRE>...</PRE>, turning "\e" back into backslash and escaping & < and >.
+ elsif (/^\.EX\s*$/)
+ {
+ print TEMP "<PRE>\n";
+ while (<STDIN>)
+ {
+ last if /^\.EE\s*$/;
+ s/\\e/\\/g;
+ s/&/&amp;/g;
+ s/</&lt;/g;
+ s/>/&gt;/g;
+ print TEMP;
+ }
+ print TEMP "</PRE>\n";
+ }
+
+ # Ignore anything not recognized
+
+ next;
+ }
+
+ # Line does not begin with a dot. Replace blank lines with new paragraphs
+
+ if (/^\s*$/)
+ {
+ &end_para() if ($wrotetext);
+ next;
+ }
+
+ # Convert fonts changes and output an ordinary line. Ensure that indented
+ # lines are marked as literal.
+
+ $_ = &do_line($_);
+ &new_para() if (!$inpara);
+
+ if (/^\s/)
+ {
+ if (!$inpre)
+ {
+ print TEMP "<pre>\n";
+ $inpre = 1;
+ }
+ }
+ elsif ($inpre)
+ {
+ print TEMP "</pre>\n";
+ $inpre = 0;
+ }
+
+ # Add <br> to the end of a non-literal line if we are within .nf/.fi
+
+ $_ .= "<br>\n" if (!$inpre && $innf);
+
+ print TEMP;
+ $wrotetext = 1;
+ }
+
+# The TOC, if present, will have been written - terminate it
+
+print "</ul>\n" if ($toc);
+
+# Copy the remainder to the standard output
+
+close(TEMP);
+open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
+
+print while (<TEMP>);
+
+print <<End ;
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+End
+
+close(TEMP);
+unlink("/tmp/$$");
+
+# End
diff --git a/CheckMan b/CheckMan
new file mode 100755
index 0000000..5686746
--- /dev/null
+++ b/CheckMan
@@ -0,0 +1,67 @@
+#! /usr/bin/perl
+
+# A script to scan PCRE2's man pages to check for typos in the control
+# sequences. I use only a small set of the available repertoire, so it is
+# straightforward to check that nothing else has slipped in by mistake. This
+# script should be called in the doc directory.
+
+$yield = 0;      # Becomes the exit code: 1 once any problem is reported
+
+while (scalar(@ARGV) > 0)
+ {
+ $line = 0;
+ $file = shift @ARGV;
+ # Three-argument open, so that an odd file name cannot select another mode
+ open (my $in, "<", $file) || die "Failed to open $file: $!\n";
+ # The messages use print, not printf, because a file name may contain "%"
+ while (<$in>)
+ {
+ $line++;
+ if (/^\s*$/)
+ {
+ print "Empty line $line of $file\n";
+ $yield = 1;
+ }
+ elsif (/^\./)       # A control line: it must have one of these forms
+ {
+ if (!/^\.\s*$|
+ ^\.B\s+\S|
+ ^\.TH\s\S|
+ ^\.SH\s\S|
+ ^\.SS\s\S|
+ ^\.TP(?:\s?\d+)?\s*$|
+ ^\.SM\s*$|
+ ^\.br\s*$|
+ ^\.rs\s*$|
+ ^\.sp\s*$|
+ ^\.nf\s*$|
+ ^\.fi\s*$|
+ ^\.P\s*$|
+ ^\.PP\s*$|
+ ^\.\\"(?:\ HREF)?\s*$|
+ ^\.\\"\sHTML\s<a\shref="[^"]+?">\s*$|
+ ^\.\\"\sHTML\s<a\sname="[^"]+?"><\/a>\s*$|
+ ^\.\\"\s<\/a>\s*$|
+ ^\.\\"\sJOINSH\s*$|
+ ^\.\\"\sJOIN\s*$/x
+ )
+ {
+ print "Bad control line $line of $file\n";
+ $yield = 1;
+ }
+ }
+ else                # A text line: only \e, \fI, \fB, and \fP are accepted
+ {
+ if (/\\[^ef]|\\f[^IBP]/)
+ {
+ print "Bad backslash in line $line of $file\n";
+ $yield = 1;
+ }
+ }
+ }
+
+ close($in);
+ }
+
+exit $yield;
+# End
diff --git a/CleanTxt b/CleanTxt
new file mode 100755
index 0000000..1f42519
--- /dev/null
+++ b/CleanTxt
@@ -0,0 +1,113 @@
+#! /usr/bin/perl -w
+
+# Script to take the output of nroff -man and remove all the backspacing and
+# the page footers and the screen commands etc so that it is more usefully
+# readable online. In fact, in the latest nroff, intermediate footers don't
+# seem to be generated any more.
+
+$blankcount = 0;
+$lastwascut = 0;
+$firstheader = 1;
+
+# Input on STDIN; output to STDOUT.
+
+while (<STDIN>)
+ {
+ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
+ s/.\x8//g; # Remove "char, backspace"
+
+ # Handle header lines. Retain only the first one we encounter, but remove
+ # the blank line that follows. Any others (e.g. at end of document) and the
+ # following blank line are dropped.
+
+ if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
+ {
+ if ($firstheader)
+ {
+ $firstheader = 0;
+ print;
+ $lastprinted = $_;
+ $lastwascut = 0;
+ }
+ $_=<STDIN>; # Remove a blank that follows
+ next;
+ }
+
+ # Count runs of empty lines
+
+ if (/^\s*$/)
+ {
+ $blankcount++;
+ $lastwascut = 0;
+ next;
+ }
+
+ # If a chunk of lines has been cut out (page footer) and the next line
+ # has a different indentation, put back one blank line. The two indents are
+ # held in lexicals, leaving the globals $a and $b that sort() uses alone.
+ if ($lastwascut && $blankcount < 1 && defined($lastprinted))
+ {
+ my($lastindent) = $lastprinted =~ /^(\s*)/;
+ my($thisindent) = $_ =~ /^(\s*)/;
+ $blankcount++ if ($lastindent ne $thisindent);
+ }
+
+ # We get here only when we have a non-blank line in hand. If it was preceded
+ # by 3 or more blank lines, read the next 3 lines and see if they are blank.
+ # If so, remove all 7 lines, and remember that we have just done a cut.
+
+ if ($blankcount >= 3)
+ {
+ for ($i = 0; $i < 3; $i++)
+ {
+ $next[$i] = <STDIN>;
+ $next[$i] = "" if !defined $next[$i];
+ $next[$i] =~ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
+ $next[$i] =~ s/.\x8//g; # Remove "char, backspace"
+ }
+
+ # Cut out chunks of the form <3 blanks><non-blank><3 blanks>
+
+ if ($next[0] =~ /^\s*$/ &&
+ $next[1] =~ /^\s*$/ &&
+ $next[2] =~ /^\s*$/)
+ {
+ $blankcount -= 3;
+ $lastwascut = 1;
+ }
+
+ # Otherwise (no page footer was found) output the saved blanks, the current
+ # line, and the three lookahead lines, then reset the blank-line state.
+
+ else
+ {
+ for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
+ print;
+ for ($i = 0; $i < 3; $i++)
+ {
+ $next[$i] =~ s/.\x8//g; # Same substitution was already applied when the line was read
+ print $next[$i];
+ $lastprinted = $_; # NOTE(review): records the current line, not $next[$i] - confirm intended
+ }
+ $lastwascut = 0;
+ $blankcount = 0;
+ }
+ }
+
+ # This non-blank line is not preceded by 3 or more blank lines. Output
+ # any blanks there are, and the line. Remember it. Force two blank lines
+ # before headings.
+
+ else
+ {
+ $blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
+ defined($lastprinted);
+ for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
+ print;
+ $lastprinted = $_;
+ $lastwascut = 0;
+ $blankcount = 0;
+ }
+ }
+
+# End
diff --git a/Detrail b/Detrail
new file mode 100755
index 0000000..1c5c7e9
--- /dev/null
+++ b/Detrail
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+
+# This is a script for removing trailing whitespace from lines in files that
+# are listed on the command line.
+
+# This subroutine does the work for one file: it strips trailing white space
+# from each line and rewrites the file only if something actually changed.
+sub detrail {
+my($file) = $_[0];
+my($changed) = 0;
+open(my $in, "<", $file) || die "Can't open $file for input: $!";
+my(@lines) = <$in>;
+close($in);
+foreach (@lines)
+  {
+  # s/// returns true only when it changed the line, so no separate test
+  # of the pattern is needed.
+  $changed = 1 if s/\s+\n$/\n/;
+  }
+if ($changed)
+  {
+  # Three-argument open: the file name is never parsed for mode characters.
+  open(my $out, ">", $file) || die "Can't open $file for output: $!";
+  print $out @lines;
+  # Buffered write errors (e.g. disk full) only surface when closing.
+  close($out) || die "Can't write $file: $!";
+  }
+}
+
+# This is the main program
+
+$, = ""; # Output field separator
+for ($i = 0; $i < @ARGV; $i++) { &detrail($ARGV[$i]); }
+
+# End
diff --git a/PrepareRelease b/PrepareRelease
new file mode 100755
index 0000000..c92d7f9
--- /dev/null
+++ b/PrepareRelease
@@ -0,0 +1,265 @@
+#!/bin/sh
+
+# Script to prepare the files for building a PCRE2 release. It does some
+# processing of the documentation, detrails files, and creates pcre2.h.generic
+# and config.h.generic (for use by builders who can't run ./configure).
+
+# You must run this script before running "make dist". If its first argument
+# is "doc", it stops after preparing the documentation. There are no other
+# arguments. The script makes use of the following files:
+
+# 132html A Perl script that converts a .1 or .3 man page into HTML. It
+# "knows" the relevant troff constructs that are used in the PCRE2
+# man pages.
+
+# CheckMan A Perl script that checks man pages for typos in the mark up.
+
+# CleanTxt A Perl script that cleans up the output of "nroff -man" by
+# removing backspaces and other redundant text so as to produce
+# a readable .txt file.
+
+# Detrail A Perl script that removes trailing spaces from files.
+
+# doc/index.html.src
+# A file that is copied as index.html into the doc/html directory
+# when the HTML documentation is built. It works like this so that
+# doc/html can be deleted and re-created from scratch.
+
+# README & NON-AUTOTOOLS-BUILD
+# These files are copied into the doc/html directory, with .txt
+# extensions so that they can be hyperlinked from the HTML
+# documentation, because some people just go to the HTML without
+# looking for text files.
+
+
+# First, sort out the documentation. Remove pcre2demo.3 first because it won't
+# pass the markup check (it is created below, using markup that none of the
+# other pages use).
+
+cd doc
+echo Processing documentation
+
+/bin/rm -f pcre2demo.3
+
+# Check the remaining man pages
+
+perl ../CheckMan *.1 *.3
+if [ $? != 0 ] ; then exit 1; fi
+
+# Make Text form of the documentation. It needs some mangling to make it
+# tidy for online reading. Concatenate all the .3 stuff, but omit the
+# individual function pages.
+
+cat <<End >pcre2.txt
+-----------------------------------------------------------------------------
+This file contains a concatenation of the PCRE2 man pages, converted to plain
+text format for ease of searching with a text editor, or for use on systems
+that do not have a man page processor. The small individual files that give
+synopses of each function in the library have not been included. Neither has
+the pcre2demo program. There are separate text files for the pcre2grep and
+pcre2test commands.
+-----------------------------------------------------------------------------
+
+
+End
+
+echo "Making pcre2.txt"
+for file in pcre2api pcre2callout pcre2unicode ; do
+
+#for file in pcre pcre16 pcre32 pcrebuild pcrematching \
+# pcrecompat pcrepattern pcresyntax pcrejit pcrepartial \
+# pcreprecompile pcreperform pcreposix pcrecpp pcresample \
+# pcrelimits pcrestack ; do
+
+ echo " Processing $file.3"
+ nroff -c -man $file.3 >$file.rawtxt
+ perl ../CleanTxt <$file.rawtxt >>pcre2.txt
+ /bin/rm $file.rawtxt
+ echo "------------------------------------------------------------------------------" >>pcre2.txt
+ if [ "$file" != "pcre2sample" ] ; then
+ echo " " >>pcre2.txt
+ echo " " >>pcre2.txt
+ fi
+done
+
+# The three commands
+for file in pcre2test ; do
+# for file in pcre2test pcre2grep pcre-config ; do
+ echo Making $file.txt
+ nroff -c -man $file.1 >$file.rawtxt
+ perl ../CleanTxt <$file.rawtxt >$file.txt
+ /bin/rm $file.rawtxt
+done
+
+
+# Make pcre2demo.3 from the pcre2demo.c source file
+
+echo "Making pcre2demo.3"
+perl <<"END" >pcre2demo.3
+ open(IN, "../src/pcre2demo.c") || die "Failed to open src/pcre2demo.c\n";
+ open(OUT, ">pcre2demo.3") || die "Failed to open pcre2demo.3\n";
+ print OUT ".\\\" Start example.\n" .
+ ".de EX\n" .
+ ". nr mE \\\\n(.f\n" .
+ ". nf\n" .
+ ". nh\n" .
+ ". ft CW\n" .
+ "..\n" .
+ ".\n" .
+ ".\n" .
+ ".\\\" End example.\n" .
+ ".de EE\n" .
+ ". ft \\\\n(mE\n" .
+ ". fi\n" .
+ ". hy \\\\n(HY\n" .
+ "..\n" .
+ ".\n" .
+ ".EX\n" ;
+ while (<IN>)
+ {
+ s/\\/\\e/g;
+ print OUT;
+ }
+ print OUT ".EE\n";
+ close(IN);
+ close(OUT);
+END
+if [ $? != 0 ] ; then exit 1; fi
+
+
+# Make HTML form of the documentation.
+
+echo "Making HTML documentation"
+/bin/rm html/*
+cp index.html.src html/index.html
+cp ../README html/README.txt
+# cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt
+
+for file in *.1 ; do
+ base=`basename $file .1`
+ echo " Making $base.html"
+ perl ../132html -toc $base <$file >html/$base.html
+done
+
+# Exclude table of contents for function summaries. It seems that expr
+# forces an anchored regex. Also exclude them for small pages that have
+# only one section.
+
+for file in *.3 ; do
+ base=`basename $file .3`
+ toc=-toc
+ if [ `expr $base : '.*_'` -ne 0 ] ; then toc="" ; fi
+ if [ "$base" = "pcre2sample" ] || \
+ [ "$base" = "pcre2stack" ] || \
+ [ "$base" = "pcre2compat" ] || \
+ [ "$base" = "pcre2limits" ] || \
+ [ "$base" = "pcre2perform" ] || \
+ [ "$base" = "pcre2unicode" ] ; then
+ toc=""
+ fi
+ echo " Making $base.html"
+ perl ../132html $toc $base <$file >html/$base.html
+ if [ $? != 0 ] ; then exit 1; fi
+done
+
+# End of documentation processing; stop if only documentation required.
+
+cd ..
+echo Documentation done
+if [ "$1" = "doc" ] ; then exit; fi
+
+# FIXME pro tem only do docs
+exit
+
+# These files are detrailed; do not detrail the test data because there may be
+# significant trailing spaces. Do not detrail RunTest.bat, because it has CRLF
+# line endings and the detrail script removes all trailing white space. The
+# configure files are also omitted from the detrailing. We don't bother with
+# those pcre[16|32]_xx files that just define COMPILE_PCRE16 and then #include the
+# common file, because they aren't going to change.
+
+files="\
+ Makefile.am \
+ Makefile.in \
+ configure.ac \
+ README \
+ LICENCE \
+ COPYING \
+ AUTHORS \
+ NEWS \
+ NON-UNIX-USE \
+ NON-AUTOTOOLS-BUILD \
+ INSTALL \
+ 132html \
+ CleanTxt \
+ Detrail \
+ ChangeLog \
+ CMakeLists.txt \
+ RunGrepTest \
+ RunTest \
+ pcre-config.in \
+ libpcre.pc.in \
+ libpcre16.pc.in \
+ libpcre32.pc.in \
+ libpcreposix.pc.in \
+ libpcrecpp.pc.in \
+ config.h.in \
+ pcre_chartables.c.dist \
+ pcredemo.c \
+ pcregrep.c \
+ pcretest.c \
+ dftables.c \
+ pcreposix.c \
+ pcreposix.h \
+ pcre.h.in \
+ pcre_internal.h \
+ pcre_byte_order.c \
+ pcre_compile.c \
+ pcre_config.c \
+ pcre_dfa_exec.c \
+ pcre_exec.c \
+ pcre_fullinfo.c \
+ pcre_get.c \
+ pcre_globals.c \
+ pcre_jit_compile.c \
+ pcre_jit_test.c \
+ pcre_maketables.c \
+ pcre_newline.c \
+ pcre_ord2utf8.c \
+ pcre16_ord2utf16.c \
+ pcre32_ord2utf32.c \
+ pcre_printint.c \
+ pcre_refcount.c \
+ pcre_string_utils.c \
+ pcre_study.c \
+ pcre_tables.c \
+ pcre_valid_utf8.c \
+ pcre_version.c \
+ pcre_xclass.c \
+ pcre16_utf16_utils.c \
+ pcre32_utf32_utils.c \
+ pcre16_valid_utf16.c \
+ pcre32_valid_utf32.c \
+ pcre_scanner.cc \
+ pcre_scanner.h \
+ pcre_scanner_unittest.cc \
+ pcrecpp.cc \
+ pcrecpp.h \
+ pcrecpparg.h.in \
+ pcrecpp_unittest.cc \
+ pcre_stringpiece.cc \
+ pcre_stringpiece.h.in \
+ pcre_stringpiece_unittest.cc \
+ perltest.pl \
+ ucp.h \
+ makevp.bat \
+ pcre.def \
+ libpcre.def \
+ libpcreposix.def"
+
+echo Detrailing
+perl ./Detrail $files doc/p* doc/html/*
+
+echo Done
+
+#End
diff --git a/doc/html/README.txt b/doc/html/README.txt
new file mode 100644
index 0000000..7ad597a
--- /dev/null
+++ b/doc/html/README.txt
@@ -0,0 +1 @@
+This is a placeholder README file for a work in progress.
diff --git a/doc/html/index.html b/doc/html/index.html
new file mode 100644
index 0000000..4e264ec
--- /dev/null
+++ b/doc/html/index.html
@@ -0,0 +1,177 @@
+<html>
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE2 documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
+ created by the 132html script.
+-->
+<head>
+<title>PCRE2 specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>Perl-compatible Regular Expressions (revised API: PCRE2)</h1>
+<p>
+The HTML documentation for PCRE2 consists of a number of pages that are listed
+below in alphabetical order. If you are new to PCRE2, please read the first one
+first.
+</p>
+
+<table>
+<tr><td><a href="pcre2.html">pcre2</a></td>
+ <td>&nbsp;&nbsp;Introductory page</td></tr>
+
+<tr><td><a href="pcre2-config.html">pcre2-config</a></td>
+ <td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
+
+<tr><td><a href="pcre2api.html">pcre2api</a></td>
+ <td>&nbsp;&nbsp;PCRE2's native API</td></tr>
+
+<tr><td><a href="pcre2build.html">pcre2build</a></td>
+ <td>&nbsp;&nbsp;Building PCRE2</td></tr>
+
+<tr><td><a href="pcre2callout.html">pcre2callout</a></td>
+ <td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
+
+<tr><td><a href="pcre2compat.html">pcre2compat</a></td>
+ <td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
+
+<tr><td><a href="pcre2demo.html">pcre2demo</a></td>
+ <td>&nbsp;&nbsp;A demonstration C program that uses the PCRE2 library</td></tr>
+
+<tr><td><a href="pcre2grep.html">pcre2grep</a></td>
+ <td>&nbsp;&nbsp;The <b>pcre2grep</b> command</td></tr>
+
+<tr><td><a href="pcre2jit.html">pcre2jit</a></td>
+ <td>&nbsp;&nbsp;Discussion of the just-in-time optimization support</td></tr>
+
+<tr><td><a href="pcre2limits.html">pcre2limits</a></td>
+ <td>&nbsp;&nbsp;Details of size and other limits</td></tr>
+
+<tr><td><a href="pcre2matching.html">pcre2matching</a></td>
+ <td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
+
+<tr><td><a href="pcre2partial.html">pcre2partial</a></td>
+ <td>&nbsp;&nbsp;Using PCRE2 for partial matching</td></tr>
+
+<tr><td><a href="pcre2pattern.html">pcre2pattern</a></td>
+ <td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE2</td></tr>
+
+<tr><td><a href="pcre2perform.html">pcre2perform</a></td>
+ <td>&nbsp;&nbsp;Some comments on performance</td></tr>
+
+<tr><td><a href="pcre2posix.html">pcre2posix</a></td>
+ <td>&nbsp;&nbsp;The POSIX API to the PCRE2 8-bit library</td></tr>
+
+<tr><td><a href="pcre2precompile.html">pcre2precompile</a></td>
+ <td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
+
+<tr><td><a href="pcre2sample.html">pcre2sample</a></td>
+ <td>&nbsp;&nbsp;Discussion of the pcre2demo program</td></tr>
+
+<tr><td><a href="pcre2stack.html">pcre2stack</a></td>
+ <td>&nbsp;&nbsp;Discussion of PCRE2's stack usage</td></tr>
+
+<tr><td><a href="pcre2syntax.html">pcre2syntax</a></td>
+ <td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
+
+<tr><td><a href="pcre2test.html">pcre2test</a></td>
+ <td>&nbsp;&nbsp;The <b>pcre2test</b> command for testing PCRE2</td></tr>
+
+<tr><td><a href="pcre2unicode.html">pcre2unicode</a></td>
+ <td>&nbsp;&nbsp;Discussion of Unicode and UTF-8/UTF-16/UTF-32 support</td></tr>
+</table>
+
+<p>
+There are also individual pages that summarize the interface for each function
+in the library. There is a single page for each triple of 8-bit/16-bit/32-bit
+functions.
+</p>
+
+<table>
+
+<tr><td><a href="pcre2_assign_jit_stack.html">pcre2_assign_jit_stack</a></td>
+ <td>&nbsp;&nbsp;Assign stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre2_compile.html">pcre2_compile</a></td>
+ <td>&nbsp;&nbsp;Compile a regular expression</td></tr>
+
+<tr><td><a href="pcre2_compile2.html">pcre2_compile2</a></td>
+ <td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
+
+<tr><td><a href="pcre2_config.html">pcre2_config</a></td>
+ <td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
+
+<tr><td><a href="pcre2_copy_named_substring.html">pcre2_copy_named_substring</a></td>
+ <td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
+
+<tr><td><a href="pcre2_copy_substring.html">pcre2_copy_substring</a></td>
+ <td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
+
+<tr><td><a href="pcre2_dfa_exec.html">pcre2_dfa_exec</a></td>
+ <td>&nbsp;&nbsp;Match a compiled pattern to a subject string
+ (DFA algorithm; <i>not</i> Perl compatible)</td></tr>
+
+<tr><td><a href="pcre2_exec.html">pcre2_exec</a></td>
+ <td>&nbsp;&nbsp;Match a compiled pattern to a subject string
+ (Perl compatible)</td></tr>
+
+<tr><td><a href="pcre2_free_study.html">pcre2_free_study</a></td>
+ <td>&nbsp;&nbsp;Free study data</td></tr>
+
+<tr><td><a href="pcre2_free_substring.html">pcre2_free_substring</a></td>
+ <td>&nbsp;&nbsp;Free extracted substring</td></tr>
+
+<tr><td><a href="pcre2_free_substring_list.html">pcre2_free_substring_list</a></td>
+ <td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
+
+<tr><td><a href="pcre2_fullinfo.html">pcre2_fullinfo</a></td>
+ <td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
+
+<tr><td><a href="pcre2_get_named_substring.html">pcre2_get_named_substring</a></td>
+ <td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
+
+<tr><td><a href="pcre2_get_stringnumber.html">pcre2_get_stringnumber</a></td>
+ <td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
+
+<tr><td><a href="pcre2_get_stringtable_entries.html">pcre2_get_stringtable_entries</a></td>
+ <td>&nbsp;&nbsp;Find table entries for given string name</td></tr>
+
+<tr><td><a href="pcre2_get_substring.html">pcre2_get_substring</a></td>
+ <td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
+
+<tr><td><a href="pcre2_get_substring_list.html">pcre2_get_substring_list</a></td>
+ <td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
+
+<tr><td><a href="pcre2_jit_exec.html">pcre2_jit_exec</a></td>
+ <td>&nbsp;&nbsp;Fast path interface to JIT matching</td></tr>
+
+<tr><td><a href="pcre2_jit_stack_alloc.html">pcre2_jit_stack_alloc</a></td>
+ <td>&nbsp;&nbsp;Create a stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre2_jit_stack_free.html">pcre2_jit_stack_free</a></td>
+ <td>&nbsp;&nbsp;Free a JIT matching stack</td></tr>
+
+<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
+ <td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
+
+<tr><td><a href="pcre2_pattern_to_host_byte_order.html">pcre2_pattern_to_host_byte_order</a></td>
+ <td>&nbsp;&nbsp;Convert compiled pattern to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_refcount.html">pcre2_refcount</a></td>
+ <td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
+
+<tr><td><a href="pcre2_study.html">pcre2_study</a></td>
+ <td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
+
+<tr><td><a href="pcre2_utf16_to_host_byte_order.html">pcre2_utf16_to_host_byte_order</a></td>
+ <td>&nbsp;&nbsp;Convert UTF-16 string to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_utf32_to_host_byte_order.html">pcre2_utf32_to_host_byte_order</a></td>
+ <td>&nbsp;&nbsp;Convert UTF-32 string to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_version.html">pcre2_version</a></td>
+ <td>&nbsp;&nbsp;Return PCRE2 version and release date</td></tr>
+</table>
+
+</html>
+
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
new file mode 100644
index 0000000..dd95b4c
--- /dev/null
+++ b/doc/html/pcre2api.html
@@ -0,0 +1,2659 @@
+<html>
+<head>
+<title>pcre2api specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2api man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<ul>
+<li><a name="TOC1" href="#SEC1">PCRE2 NATIVE API BASIC FUNCTIONS</a>
+<li><a name="TOC2" href="#SEC2">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a>
+<li><a name="TOC3" href="#SEC3">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a>
+<li><a name="TOC4" href="#SEC4">PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS</a>
+<li><a name="TOC5" href="#SEC5">PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS</a>
+<li><a name="TOC6" href="#SEC6">PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS</a>
+<li><a name="TOC7" href="#SEC7">PCRE2 NATIVE API JIT FUNCTIONS</a>
+<li><a name="TOC8" href="#SEC8">PCRE2 NATIVE API AUXILIARY FUNCTIONS</a>
+<li><a name="TOC9" href="#SEC9">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a>
+<li><a name="TOC10" href="#SEC10">PCRE2 API OVERVIEW</a>
+<li><a name="TOC11" href="#SEC11">NEWLINES</a>
+<li><a name="TOC12" href="#SEC12">MULTITHREADING</a>
+<li><a name="TOC13" href="#SEC13">PCRE2 CONTEXTS</a>
+<li><a name="TOC14" href="#SEC14">CHECKING BUILD-TIME OPTIONS</a>
+<li><a name="TOC15" href="#SEC15">COMPILING A PATTERN</a>
+<li><a name="TOC16" href="#SEC16">COMPILATION ERROR CODES</a>
+<li><a name="TOC17" href="#SEC17">JUST-IN-TIME (JIT) COMPILATION</a>
+<li><a name="TOC18" href="#SEC18">LOCALE SUPPORT</a>
+<li><a name="TOC19" href="#SEC19">INFORMATION ABOUT A COMPILED PATTERN</a>
+<li><a name="TOC20" href="#SEC20">THE MATCH DATA BLOCK</a>
+<li><a name="TOC21" href="#SEC21">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
+<li><a name="TOC22" href="#SEC22">NEWLINE HANDLING WHEN MATCHING</a>
+<li><a name="TOC23" href="#SEC23">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
+<li><a name="TOC24" href="#SEC24">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
+<li><a name="TOC25" href="#SEC25">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
+<li><a name="TOC26" href="#SEC26">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
+<li><a name="TOC27" href="#SEC27">DUPLICATE SUBPATTERN NAMES</a>
+<li><a name="TOC28" href="#SEC28">FINDING ALL POSSIBLE MATCHES</a>
+<li><a name="TOC29" href="#SEC29">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
+<li><a name="TOC30" href="#SEC30">SEE ALSO</a>
+<li><a name="TOC31" href="#SEC31">AUTHOR</a>
+<li><a name="TOC32" href="#SEC32">REVISION</a>
+</ul>
+<P>
+<b>#include &#60;pcre2.h&#62;</b>
+<br>
+<br>
+PCRE2 is a new API for PCRE. This document contains a description of all its
+functions. See the
+<a href="pcre2.html"><b>pcre2</b></a>
+document for an overview of all the PCRE2 documentation.
+</P>
+<br><a name="SEC1" href="#TOC1">PCRE2 NATIVE API BASIC FUNCTIONS</a><br>
+<P>
+<b>pcre2_code *pcre2_compile(PCRE2_SPTR <i>pattern</i>, PCRE2_SIZE <i>length</i>,</b>
+<b> uint32_t <i>options</i>, int *<i>errorcode</i>, PCRE2_SIZE *<i>erroroffset,</i></b>
+<b> pcre2_compile_context *<i>ccontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
+<br>
+<br>
+<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *<i>code</i>,</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
+<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
+<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
+<b> pcre2_match_context *<i>mcontext</i>);</b>
+<br>
+<br>
+<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
+<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
+<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
+<b> pcre2_match_context *<i>mcontext</i>,</b>
+<b> int *<i>workspace</i>, PCRE2_SIZE <i>wscount</i>);</b>
+<br>
+<br>
+<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
+</P>
+<br><a name="SEC2" href="#TOC1">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a><br>
+<P>
+<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
+</P>
+<br><a name="SEC3" href="#TOC1">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a><br>
+<P>
+<b>pcre2_general_context *pcre2_general_context_create(</b>
+<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
+<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
+<br>
+<br>
+<b>pcre2_general_context *pcre2_general_context_copy(</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_general_context_free(pcre2_general_context *<i>gcontext</i>);</b>
+</P>
+<br><a name="SEC4" href="#TOC1">PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS</a><br>
+<P>
+<b>pcre2_compile_context *pcre2_compile_context_create(</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_compile_context *pcre2_compile_context_copy(</b>
+<b> pcre2_compile_context *<i>ccontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> const unsigned char *<i>tables</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> int (*<i>guard_function</i>)(uint32_t));</b>
+</P>
+<br><a name="SEC5" href="#TOC1">PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS</a><br>
+<P>
+<b>pcre2_match_context *pcre2_match_context_create(</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_match_context *pcre2_match_context_copy(</b>
+<b> pcre2_match_context *<i>mcontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
+<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
+<b> void *<i>callout_data</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+<b>int pcre2_set_recursion_memory_management(</b>
+<b> pcre2_match_context *<i>mcontext</i>,</b>
+<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
+<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
+</P>
+<br><a name="SEC6" href="#TOC1">PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS</a><br>
+<P>
+<b>int pcre2_substring_copy_byname(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR *<i>buffer</i>, PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_copy_bynumber(pcre2_match_data *<i>match_data</i>,</b>
+<b> unsigned int <i>number</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
+<b> PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>void pcre2_substring_free(PCRE2_UCHAR *<i>buffer</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_get_byname(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR **<i>bufferptr</i>, PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_get_bynumber(pcre2_match_data *<i>match_data</i>,</b>
+<b> unsigned int <i>number</i>, PCRE2_UCHAR **<i>bufferptr</i>,</b>
+<b> PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_length_byname(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_SIZE *<i>length</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
+<b> unsigned int <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
+<b> PCRE2_SPTR <i>name</i>);</b>
+<br>
+<br>
+<b>void pcre2_substring_list_free(PCRE2_SPTR *<i>list</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
+</P>
+<br><a name="SEC7" href="#TOC1">PCRE2 NATIVE API JIT FUNCTIONS</a><br>
+<P>
+<b>int pcre2_jit_compile(pcre2_code *<i>code</i>, uint32_t <i>options</i>);</b>
+<br>
+<br>
+<b>int pcre2_jit_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
+<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
+<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
+<b> pcre2_match_context *<i>mcontext</i>, pcre2_jit_stack *<i>jit_stack</i>);</b>
+<br>
+<br>
+<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *<i>gcontext</i>,</b>
+<b> PCRE2_SIZE <i>startsize</i>, PCRE2_SIZE <i>maxsize</i>);</b>
+<br>
+<br>
+<b>void pcre2_jit_stack_assign(const pcre2_code *<i>code</i>,</b>
+<b> pcre2_jit_callback <i>callback_function</i>, void *<i>callback_data</i>);</b>
+<br>
+<br>
+<b>void pcre2_jit_stack_free(pcre2_jit_stack *<i>jit_stack</i>);</b>
+</P>
+<br><a name="SEC8" href="#TOC1">PCRE2 NATIVE API AUXILIARY FUNCTIONS</a><br>
+<P>
+<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
+<b> PCRE2_SIZE <i>bufflen</i>);</b>
+<br>
+<br>
+<b>const unsigned char *pcre2_maketables(pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>int pcre2_pattern_info(const pcre2_code *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
+<br>
+<br>
+<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>, PCRE2_SIZE <i>length</i>);</b>
+</P>
+<br><a name="SEC9" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br>
+<P>
+There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code
+units, respectively. However, there is just one header file, <b>pcre2.h</b>.
+This contains the function prototypes and other definitions for all three
+libraries. One, two, or all three can be installed simultaneously. On Unix-like
+systems the libraries are called <b>libpcre2-8</b>, <b>libpcre2-16</b>, and
+<b>libpcre2-32</b>, and they can also co-exist with the original PCRE libraries.
+</P>
+<P>
+Character strings are passed to and from a PCRE2 library as a sequence of
+unsigned integers in code units of the appropriate width. Every PCRE2 function
+comes in three different forms, one for each library, for example:
+<pre>
+ <b>pcre2_compile_8()</b>
+ <b>pcre2_compile_16()</b>
+ <b>pcre2_compile_32()</b>
+</pre>
+There are also three different sets of data types:
+<pre>
+ <b>PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32</b>
+ <b>PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32</b>
+</pre>
+The UCHAR types define unsigned code units of the appropriate widths. For
+example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR types are
+constant pointers to the equivalent UCHAR types, that is, they are pointers to
+vectors of unsigned code units.
+</P>
+<P>
+Many applications use only one code unit width. For their convenience, macros
+are defined whose names are the generic forms such as <b>pcre2_compile()</b> and
+PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to
+generate the appropriate width-specific function and macro names.
+PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it
+to be 8, 16, or 32 before including <b>pcre2.h</b> in order to make use of the
+generic names.
+</P>
+<P>
+Applications that use more than one code unit width can be linked with more
+than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before
+including <b>pcre2.h</b>, and then use the real function names. Any code that is
+to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is
+unknown should also use the real function names. (Unfortunately, it is not
+possible in C code to save and restore the value of a macro.)
+</P>
+<P>
+If PCRE2_CODE_UNIT_WIDTH is not defined before including <b>pcre2.h</b>, a
+compiler error occurs.
+</P>
+<P>
+When using multiple libraries in an application, you must take care when
+processing any particular pattern to use only functions from a single library.
+For example, if you want to run a match using a pattern that was compiled with
+<b>pcre2_compile_16()</b>, you must do so with <b>pcre2_match_16()</b>, not
+<b>pcre2_match_8()</b>.
+</P>
+<P>
+In the function summaries above, and in the rest of this document and other
+PCRE2 documents, functions and data types are described using their generic
+names, without the 8, 16, or 32 suffix.
+</P>
+<br><a name="SEC10" href="#TOC1">PCRE2 API OVERVIEW</a><br>
+<P>
+PCRE2 has its own native API, which is described in this document. There are
+also some wrapper functions for the 8-bit library that correspond to the
+POSIX regular expression API, but they do not give access to all the
+functionality. They are described in the
+<a href="pcre2posix.html"><b>pcre2posix</b></a>
+documentation. Both these APIs define a set of C function calls.
+</P>
+<P>
+The native API C data types, function prototypes, option values, and error
+codes are defined in the header file <b>pcre2.h</b>, which contains definitions
+of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the
+library. Applications can use these to include support for different releases
+of PCRE2.
+</P>
+<P>
+In a Windows environment, if you want to statically link an application program
+against a non-dll PCRE2 library, you must define PCRE2_STATIC before including
+<b>pcre2.h</b>.
+</P>
+<P>
+The functions <b>pcre2_compile()</b> and <b>pcre2_match()</b> are used for
+compiling and matching regular expressions in a Perl-compatible manner. A
+sample program that demonstrates the simplest way of using them is provided in
+the file called <i>pcre2demo.c</i> in the PCRE2 source distribution. A listing
+of this program is given in the
+<a href="pcre2demo.html"><b>pcre2demo</b></a>
+documentation, and the
+<a href="pcre2sample.html"><b>pcre2sample</b></a>
+documentation describes how to compile and run it.
+</P>
+<P>
+Just-in-time compiler support is an optional feature of PCRE2 that can be built
+in appropriate hardware environments. It greatly speeds up the matching
+performance of many patterns. Programs can request that it be used if
+available, by calling <b>pcre2_jit_compile()</b> after a pattern has been
+successfully compiled by <b>pcre2_compile()</b>. This does nothing if JIT
+support is not available.
+</P>
+<P>
+More complicated programs might need to make use of the specialist functions
+<b>pcre2_jit_stack_alloc()</b>, <b>pcre2_jit_stack_free()</b>, and
+<b>pcre2_jit_stack_assign()</b> in order to control the JIT code's memory usage.
+</P>
+<P>
+JIT matching is automatically used by <b>pcre2_match()</b> if it is available.
+There is also a direct interface for JIT matching, which gives improved
+performance. The JIT-specific functions are discussed in the
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
+documentation.
+</P>
+<P>
+A second matching function, <b>pcre2_dfa_match()</b>, which is not
+Perl-compatible, is also provided. This uses a different algorithm for the
+matching. The alternative algorithm finds all possible matches (at a given
+point in the subject), and scans the subject just once (unless there are
+lookbehind assertions). However, this algorithm does not return captured
+substrings. A description of the two matching algorithms and their advantages
+and disadvantages is given in the
+<a href="pcre2matching.html"><b>pcre2matching</b></a>
+documentation. There is no JIT support for <b>pcre2_dfa_match()</b>.
+</P>
+<P>
+In addition to the main compiling and matching functions, there are convenience
+functions for extracting captured substrings from a subject string that is
+matched by <b>pcre2_match()</b>. They are:
+<pre>
+ <b>pcre2_substring_copy_byname()</b>
+ <b>pcre2_substring_copy_bynumber()</b>
+ <b>pcre2_substring_get_byname()</b>
+ <b>pcre2_substring_get_bynumber()</b>
+ <b>pcre2_substring_list_get()</b>
+ <b>pcre2_substring_length_byname()</b>
+ <b>pcre2_substring_length_bynumber()</b>
+ <b>pcre2_substring_nametable_scan()</b>
+ <b>pcre2_substring_number_from_name()</b>
+</pre>
+<b>pcre2_substring_free()</b> and <b>pcre2_substring_list_free()</b> are also
+provided, to free the memory used for extracted strings.
+</P>
+<P>
+There are functions for finding out information about a compiled pattern
+(<b>pcre2_pattern_info()</b>) and about the configuration with which PCRE2 was
+built (<b>pcre2_config()</b>).
+<a name="newlines"></a></P>
+<br><a name="SEC11" href="#TOC1">NEWLINES</a><br>
+<P>
+PCRE2 supports five different conventions for indicating line breaks in
+strings: a single CR (carriage return) character, a single LF (linefeed)
+character, the two-character sequence CRLF, any of the three preceding, or any
+Unicode newline sequence. The Unicode newline sequences are the three just
+mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed,
+U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
+(paragraph separator, U+2029).
+</P>
+<P>
+Each of the first three conventions is used by at least one operating system as
+its standard newline sequence. When PCRE2 is built, a default can be specified.
+The default default is LF, which is the Unix standard. When PCRE2 is run, the
+default can be overridden, either when a pattern is compiled, or when it is
+matched.
+</P>
+<P>
+The newline convention can be changed when calling <b>pcre2_compile()</b>, or it
+can be specified by special text at the start of the pattern itself; this
+overrides any other settings. See the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+page for details of the special character sequences.
+</P>
+<P>
+In the PCRE2 documentation the word "newline" is used to mean "the character or
+pair of characters that indicate a line break". The choice of newline
+convention affects the handling of the dot, circumflex, and dollar
+metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
+recognized line ending sequence, the match position advancement for a
+non-anchored pattern. There is more detail about this in the
+<a href="#matchoptions">section on <b>pcre2_match()</b> options</a>
+below.
+</P>
+<P>
+The choice of newline convention does not affect the interpretation of
+the \n or \r escape sequences, nor does it affect what \R matches, which has
+its own separate control.
+</P>
+<br><a name="SEC12" href="#TOC1">MULTITHREADING</a><br>
+<P>
+In a multithreaded application it is important to keep thread-specific data
+separate from data that can be shared between threads. The PCRE2 library code
+itself is thread-safe: it contains no static or global variables. The API is
+designed to be fairly simple for non-threaded applications while at the same
+time ensuring that multithreaded applications can use it.
+</P>
+<P>
+There are several different blocks of data that are used to pass information
+between the application and the PCRE2 libraries.
+</P>
+<P>
+(1) A pointer to the compiled form of a pattern is returned to the user when
+<b>pcre2_compile()</b> is successful. The data in the compiled pattern is fixed,
+and does not change when the pattern is matched. Therefore, it is thread-safe,
+that is, the same compiled pattern can be used by more than one thread
+simultaneously. An application can compile all its patterns at the start,
+before forking off multiple threads that use them. However, if the just-in-time
+optimization feature is being used, it needs separate memory stack areas for
+each thread. See the
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
+documentation for more details.
+</P>
+<P>
+(2) The next section below introduces the idea of "contexts" in which PCRE2
+functions are called. A context is nothing more than a collection of parameters
+that control the way PCRE2 operates. Grouping a number of parameters together
+in a context is a convenient way of passing them to a PCRE2 function without
+using lots of arguments. The parameters that are stored in contexts are in some
+sense "advanced features" of the API. Many straightforward applications will
+not need to use contexts.
+</P>
+<P>
+In a multithreaded application, if the parameters in a context are values that
+are never changed, the same context can be used by all the threads. However, if
+any thread needs to change any value in a context, it must make its own
+thread-specific copy.
+</P>
+<P>
+(3) The matching functions need a block of memory for working space and for
+storing the results of a match. This includes details of what was matched, as
+well as additional information such as the name of a (*MARK) setting. Each
+thread must provide its own version of this memory.
+</P>
+<br><a name="SEC13" href="#TOC1">PCRE2 CONTEXTS</a><br>
+<P>
+Some PCRE2 functions have a lot of parameters, many of which are used only by
+specialist applications, for example, those that use custom memory management
+or non-standard character tables. To keep function argument lists at a
+reasonable size, and at the same time to keep the API extensible, "uncommon"
+parameters are passed to certain functions in a <b>context</b> instead of
+directly. A context is just a block of memory that holds the parameter values.
+Applications that do not need to adjust any of the context parameters can pass
+NULL when a context pointer is required.
+</P>
+<P>
+There are three different types of context: a general context that is relevant
+for several PCRE2 operations, a compile-time context, and a match-time context.
+</P>
+<br><b>
+The general context
+</b><br>
+<P>
+At present, this context just contains pointers to (and data for) external
+memory management functions that are called from several places in the PCRE2
+library. The context is named `general' rather than specifically `memory'
+because in future other fields may be added. If you do not want to supply your
+own custom memory management functions, you do not need to bother with a
+general context. A general context is created by:
+<b>pcre2_general_context *pcre2_general_context_create(</b>
+<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
+<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
+<br>
+<br>
+The two function pointers specify custom memory management functions, whose
+prototypes are:
+<pre>
+ <b>void *private_malloc(PCRE2_SIZE, void *);</b>
+ <b>void private_free(void *, void *);</b>
+</pre>
+Whenever code in PCRE2 calls these functions, the final argument is the value
+of <i>memory_data</i>. Either of the first two arguments of the creation
+function may be NULL, in which case the system memory management functions
+<i>malloc()</i> and <i>free()</i> are used. (This is not currently useful, as
+there are no other fields in a general context, but in future there might be.)
+The <i>private_malloc()</i> function is used (if supplied) to obtain memory for
+storing the context, and all three values are saved as part of the context.
+</P>
+<P>
+Whenever PCRE2 creates a data block of any kind, the block contains a pointer
+to the <i>free()</i> function that matches the <i>malloc()</i> function that was
+used. When the time comes to free the block, this function is called.
+</P>
+<P>
+A general context can be copied by calling:
+<b>pcre2_general_context *pcre2_general_context_copy(</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+The memory used for a general context should be freed by calling:
+<b>void pcre2_general_context_free(pcre2_general_context *<i>gcontext</i>);</b>
+<a name="compilecontext"></a></P>
+<br><b>
+The compile context
+</b><br>
+<P>
+A compile context is required if you want to change the default values of any
+of the following compile-time parameters:
+<pre>
+ What \R matches (Unicode newlines or CR, LF, CRLF only);
+ PCRE2's character tables;
+ The newline character sequence;
+ The compile time nested parentheses limit;
+ An external function for stack checking.
+</pre>
+A compile context is also required if you are using custom memory management.
+If none of these apply, just pass NULL as the context argument of
+<i>pcre2_compile()</i>.
+</P>
+<P>
+A compile context is created, copied, and freed by the following functions:
+<b>pcre2_compile_context *pcre2_compile_context_create(</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_compile_context *pcre2_compile_context_copy(</b>
+<b> pcre2_compile_context *<i>ccontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b>
+<br>
+<br>
+A compile context is created with default values for its parameters. These can
+be changed by calling the following functions, which return 0 on success, or
+PCRE2_ERROR_BADDATA if invalid data is detected.
+<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF,
+or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
+ending sequence. The value of this parameter does not affect what is compiled;
+it is just saved with the compiled pattern. The value is used by the JIT
+compiler and by the two interpreted matching functions, <i>pcre2_match()</i> and
+<i>pcre2_dfa_match()</i>. You can change the value when calling these functions,
+but doing so disables the use of JIT.
+<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> const unsigned char *<i>tables</i>);</b>
+<br>
+<br>
+The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
+argument is a general context. This function builds a set of character tables
+in the current locale.
+<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+This specifies which characters or character sequences are to be recognized as
+newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
+PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
+sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or
+PCRE2_NEWLINE_ANY (any Unicode newline sequence).
+</P>
+<P>
+When a pattern is compiled with the PCRE2_EXTENDED option, the value of this
+parameter affects the recognition of white space and the end of internal
+comments starting with #. The value is saved with the compiled pattern for
+subsequent use by the JIT compiler and by the two interpreted matching
+functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>. You can change the
+value when calling these functions, but doing so disables the use of JIT.
+<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
+depth of parenthesis nesting in a pattern. This limit stops rogue patterns
+using up too much system stack when being compiled.
+<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
+<b> int (*<i>guard_function</i>)(uint32_t));</b>
+<br>
+<br>
+There is at least one application that runs PCRE2 in threads with very limited
+system stack, where running out of stack is to be avoided at all costs. The
+parenthesis limit above cannot take account of how much stack is actually
+available. For a finer control, you can supply a function that is called
+whenever <b>pcre2_compile()</b> starts to compile a parenthesized part of a
+pattern. The argument to the function gives the current depth of nesting. The
+function should return zero if all is well, or non-zero to force an error.
+<a name="matchcontext"></a></P>
+<br><b>
+The match context
+</b><br>
+<P>
+A match context is required if you want to change the default values of any
+of the following match-time parameters:
+<pre>
+ What \R matches (Unicode newlines or CR, LF, CRLF only);
+ A callout function;
+ The limit for calling <i>match()</i>;
+ The limit for calling <i>match()</i> recursively;
+ The newline character sequence;
+</pre>
+A match context is also required if you are using custom memory management.
+If none of these apply, just pass NULL as the context argument of
+<b>pcre2_match()</b>, <b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>.
+Changing the newline value or what \R matches at match time disables the use
+of JIT via <b>pcre2_match()</b>.
+</P>
+<P>
+A match context is created, copied, and freed by the following functions:
+<b>pcre2_match_context *pcre2_match_context_create(</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_match_context *pcre2_match_context_copy(</b>
+<b> pcre2_match_context *<i>mcontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b>
+<br>
+<br>
+A match context is created with default values for its parameters. These can
+be changed by calling the following functions, which return 0 on success, or
+PCRE2_ERROR_BADDATA if invalid data is detected.
+<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF,
+or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
+ending sequence. If you want to make use of JIT matching, you should not use
+this function, but instead set the value in a compile context.
+<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
+<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
+<b> void *<i>callout_data</i>);</b>
+<br>
+<br>
+This sets up a "callout" function, which PCRE2 will call at specified points
+during a matching operation. Details are given in the
+<a href="pcre2callout.html"><b>pcre2callout</b></a>
+documentation.
+<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+The <i>match_limit</i> parameter provides a means of preventing PCRE2 from using
+up too many resources when processing patterns that are not going to match, but
+which have a very large number of possibilities in their search trees. The
+classic example is a pattern that uses nested unlimited repeats.
+</P>
+<P>
+Internally, <b>pcre2_match()</b> uses a function called <b>match()</b>, which it
+calls repeatedly (sometimes recursively). The limit set by <i>match_limit</i> is
+imposed on the number of times this function is called during a match, which
+has the effect of limiting the amount of backtracking that can take place. For
+patterns that are not anchored, the count restarts from zero for each position
+in the subject string. This limit is not relevant to <b>pcre2_dfa_match()</b>,
+which ignores it.
+</P>
+<P>
+When <b>pcre2_match()</b> is called with a pattern that was successfully processed
+with <b>pcre2_jit_compile()</b>, the way that the matching is executed is
+entirely different. However, there is still the possibility of runaway matching
+that goes on for a very long time, and so the <i>match_limit</i> value is also
+used in this case (but in a different way) to limit how long the matching can
+continue.
+</P>
+<P>
+The default value for the limit can be set when PCRE2 is built; the default
+default is 10 million, which handles all but the most extreme cases. If the
+limit is exceeded, <b>pcre2_match()</b> returns PCRE2_ERROR_MATCHLIMIT. A value
+for the match limit may also be supplied by an item at the start of a pattern
+of the form
+<pre>
+ (*LIMIT_MATCH=ddd)
+</pre>
+where ddd is a decimal number. However, such a setting is ignored unless ddd is
+less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
+limit is set, less than the default.
+<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+The <i>recursion_limit</i> parameter is similar to <i>match_limit</i>, but
+instead of limiting the total number of times that <b>match()</b> is called, it
+limits the depth of recursion. The recursion depth is a smaller number than the
+total number of calls, because not all calls to <b>match()</b> are recursive.
+This limit is of use only if it is set smaller than <i>match_limit</i>.
+</P>
+<P>
+Limiting the recursion depth limits the amount of system stack that can be
+used, or, when PCRE2 has been compiled to use memory on the heap instead of the
+stack, the amount of heap memory that can be used. This limit is not relevant,
+and is ignored, when matching is done using JIT compiled code or by the
+<b>pcre2_dfa_match()</b> function.
+</P>
+<P>
+The default value for <i>recursion_limit</i> can be set when PCRE2 is built; the
+default default is the same value as the default for <i>match_limit</i>. If the
+limit is exceeded, <b>pcre2_match()</b> returns PCRE2_ERROR_RECURSIONLIMIT. A
+value for the recursion limit may also be supplied by an item at the start of a
+pattern of the form
+<pre>
+ (*LIMIT_RECURSION=ddd)
+</pre>
+where ddd is a decimal number. However, such a setting is ignored unless ddd is
+less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
+limit is set, less than the default.
+<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
+<b> uint32_t <i>value</i>);</b>
+<br>
+<br>
+This specifies which characters or character sequences are to be recognized as
+newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
+PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
+sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or
+PCRE2_NEWLINE_ANY (any Unicode newline sequence). If you want to make use of
+JIT matching, you should not use this function, but instead set the value in a
+compile context.
+<b>int pcre2_set_recursion_memory_management(</b>
+<b> pcre2_match_context *<i>mcontext</i>,</b>
+<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
+<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
+<br>
+<br>
+This function sets up two additional custom memory management functions for use
+by <b>pcre2_match()</b> when PCRE2 is compiled to use the heap for remembering
+backtracking data, instead of recursive function calls that use the system
+stack. There is a discussion about PCRE2's stack usage in the
+<a href="pcre2stack.html"><b>pcre2stack</b></a>
+documentation. See the
+<a href="pcre2build.html"><b>pcre2build</b></a>
+documentation for details of how to build PCRE2. Using the heap for recursion
+is a non-standard way of building PCRE2, for use in environments that have
+limited stacks. Because of the greater use of memory management,
+<b>pcre2_match()</b> runs more slowly. Functions that are different to the
+general custom memory functions are provided so that special-purpose external
+code can be used for this case, because the memory blocks are all the same
+size. The blocks are retained by <b>pcre2_match()</b> until it is about to exit
+so that they can be re-used when possible during the match. In the absence of
+these functions, the normal custom memory management functions are used, if
+supplied, otherwise the system functions.
+</P>
+<br><a name="SEC14" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
+<P>
+<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>, PCRE2_SIZE <i>length</i>);</b>
+</P>
+<P>
+The function <b>pcre2_config()</b> makes it possible for a PCRE2 client to
+discover which optional features have been compiled into the PCRE2 library. The
+<a href="pcre2build.html"><b>pcre2build</b></a>
+documentation has more details about these optional features.
+</P>
+<P>
+The first argument for <b>pcre2_config()</b> specifies which information is
+required. The second argument is a pointer to memory into which the information
+is placed, with the final argument giving the length of this memory in bytes.
+For calls that return numerical values, <i>where</i> should point to
+appropriately aligned memory, with <i>length</i> set to at least the size of
+the data type (as given by "sizeof").
+</P>
+<P>
+The returned value from <b>pcre2_config()</b> is zero on success, or the
+negative error code PCRE2_ERROR_BADOPTION if the value in the first argument is
+not recognized. The following information is available:
+<pre>
+ PCRE2_CONFIG_BSR
+</pre>
+The output is an integer whose value indicates what character sequences the \R
+escape sequence matches by default. A value of 0 means that \R matches any
+Unicode line ending sequence; a value of 1 means that \R matches only CR, LF,
+or CRLF. The default can be overridden when a pattern is compiled or matched.
+<pre>
+ PCRE2_CONFIG_JIT
+</pre>
+The output is an integer that is set to one if support for just-in-time
+compiling is available; otherwise it is set to zero.
+<pre>
+ PCRE2_CONFIG_JITTARGET
+</pre>
+FIXME: this needs sorting out once JIT is implemented.
+If JIT support is available, the string contains the name of the architecture
+for which the JIT compiler is configured, for example "x86 32bit (little endian
++ unaligned)". If JIT support is not available, FIXME.
+<pre>
+ PCRE2_CONFIG_LINKSIZE
+</pre>
+The output is an integer that contains the number of bytes used for internal
+linkage in compiled regular expressions. When PCRE2 is configured, the value
+can be set to 2, 3, or 4, with the default being 2. This is the value that is
+returned by <b>pcre2_config()</b>. However, when the 16-bit library is compiled,
+a value of 3 is rounded up to 4, and when the 32-bit library is compiled,
+internal linkages always use 4 bytes, so the configured value is not relevant.
+</P>
+<P>
+The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all
+but the most massive patterns, since it allows the size of the compiled pattern
+to be up to 64K code units. Larger values allow larger regular expressions to
+be compiled by those two libraries, but at the expense of slower matching.
+<pre>
+ PCRE2_CONFIG_MATCHLIMIT
+</pre>
+The output is an unsigned long integer that gives the default limit for the
+number of internal matching function calls in a <b>pcre2_match()</b> execution.
+Further details are given with <b>pcre2_match()</b> below.
+<pre>
+ PCRE2_CONFIG_NEWLINE
+</pre>
+The output is an integer whose value specifies the default character sequence
+that is recognized as meaning "newline". The values are:
+<pre>
+ 1 Carriage return (CR)
+ 2 Linefeed (LF)
+ 3 Carriage return, linefeed (CRLF)
+ 4 Any Unicode line ending
+ 5 Any of CR, LF, or CRLF
+</pre>
+The default should normally correspond to the standard sequence for your
+operating system.
+<pre>
+ PCRE2_CONFIG_PARENSLIMIT
+</pre>
+The output is an unsigned long integer that gives the maximum depth of nesting
+of parentheses (of any kind) in a pattern. This limit is imposed to cap the
+amount of system stack used when a pattern is compiled. It is specified when
+PCRE2 is built; the default is 250. This limit does not take into account the
+stack that may already be used by the calling application. For finer control
+over compilation stack usage, see <b>pcre2_set_compile_recursion_guard()</b>.
+<pre>
+ PCRE2_CONFIG_RECURSIONLIMIT
+</pre>
+The output is an unsigned long integer that gives the default limit for the
+depth of recursion when calling the internal matching function in a
+<b>pcre2_match()</b> execution. Further details are given with
+<b>pcre2_match()</b> below.
+<pre>
+ PCRE2_CONFIG_STACKRECURSE
+</pre>
+The output is an integer that is set to one if internal recursion when running
+<b>pcre2_match()</b> is implemented by recursive function calls that use the
+system stack to remember their state. This is the usual way that PCRE2 is
+compiled. The output is zero if PCRE2 was compiled to use blocks of data on the
+heap instead of recursive function calls.
+<pre>
+ PCRE2_CONFIG_UNICODE_VERSION
+</pre>
+The <i>where</i> argument should point to a buffer that is at least 24 code
+units long. If PCRE2 has been compiled without Unicode support, this is filled
+with the text "Unicode not supported". Otherwise, the Unicode version string
+(for example, "7.0.0") is returned. The string is zero-terminated.
+<pre>
+ PCRE2_CONFIG_UNICODE
+</pre>
+The output is an integer that is set to one if Unicode support is available;
+otherwise it is set to zero. Unicode support implies UTF support.
+<pre>
+ PCRE2_CONFIG_VERSION
+</pre>
+The <i>where</i> argument should point to a buffer that is at least 12 code
+units long. It is filled with the PCRE2 version string, zero-terminated.
+</P>
+<br><a name="SEC15" href="#TOC1">COMPILING A PATTERN</a><br>
+<P>
+<b>pcre2_code *pcre2_compile(PCRE2_SPTR <i>pattern</i>, PCRE2_SIZE <i>length</i>,</b>
+<b> uint32_t <i>options</i>, int *<i>errorcode</i>, PCRE2_SIZE *<i>erroroffset</i>,</b>
+<b> pcre2_compile_context *<i>ccontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
+</P>
+<P>
+This function compiles a pattern, defined by a pointer to a string of code
+units and a length, into an internal form. If the pattern is zero-terminated,
+the length should be specified as PCRE2_ZERO_TERMINATED. The function returns a
+pointer to a block of memory that contains the compiled pattern and related
+data. The caller must free the memory by calling <b>pcre2_code_free()</b> when
+it is no longer needed.
+</P>
+<P>
+If the compile context argument <i>ccontext</i> is NULL, the memory is obtained
+by calling <b>malloc()</b>. Otherwise, it is obtained from the same memory
+function that was used for the compile context.
+</P>
+<P>
+The <i>options</i> argument contains various bit settings that affect the
+compilation. It should be zero if no options are required. The available
+options are described below. Some of them (in particular, those that are
+compatible with Perl, but some others as well) can also be set and unset from
+within the pattern (see the detailed description in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation).
+</P>
+<P>
+For those options that can be different in different parts of the pattern, the
+contents of the <i>options</i> argument specify their settings at the start of
+compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and
+PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as well as
+at compile time.
+</P>
+<P>
+Other, less frequently required compile-time parameters (for example, the
+newline setting) can be provided in a compile context (as described
+<a href="#compilecontext">above).</a>
+</P>
+<P>
+If <i>errorcode</i> or <i>erroroffset</i> is NULL, <b>pcre2_compile()</b> returns
+NULL immediately. Otherwise, if compilation of a pattern fails,
+<b>pcre2_compile()</b> returns NULL, having set these variables to an error code
+and an offset (number of code units) within the pattern, respectively. The
+<b>pcre2_get_error_message()</b> function provides a textual message for each
+error code. Compilation errors are positive numbers, but UTF formatting errors
+are negative numbers. For an invalid UTF-8 or UTF-16 string, the offset is that
+of the first code unit of the failing character.
+</P>
+<P>
+Some errors are not detected until the whole pattern has been scanned; in these
+cases, the offset passed back is the length of the pattern. Note that the
+offset is in code units, not characters, even in a UTF mode. It may sometimes
+point into the middle of a UTF-8 or UTF-16 character.
+</P>
+<P>
+This code fragment shows a typical straightforward call to
+<b>pcre2_compile()</b>:
+<pre>
+ pcre2_code *re;
+ PCRE2_SIZE erroffset;
+ int errorcode;
+ re = pcre2_compile(
+ "^A.*Z", /* the pattern */
+ PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */
+ 0, /* default options */
+ &errorcode, /* for error code */
+ &erroffset, /* for error offset */
+ NULL); /* no compile context */
+</pre>
+The following names for option bits are defined in the <b>pcre2.h</b> header
+file:
+<pre>
+ PCRE2_ANCHORED
+</pre>
+If this bit is set, the pattern is forced to be "anchored", that is, it is
+constrained to match only at the first matching point in the string that is
+being searched (the "subject string"). This effect can also be achieved by
+appropriate constructs in the pattern itself, which is the only way to do it in
+Perl.
+<pre>
+ PCRE2_ALLOW_EMPTY_CLASS
+</pre>
+By default, for compatibility with Perl, a closing square bracket that
+immediately follows an opening one is treated as a data character for the
+class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which
+therefore contains no characters and so can never match.
+<pre>
+ PCRE2_ALT_BSUX
+</pre>
+This option requests alternative handling of three escape sequences, which
+makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set:
+</P>
+<P>
+(1) \U matches an upper case "U" character; by default \U causes a compile
+time error (Perl uses \U to upper case subsequent characters).
+</P>
+<P>
+(2) \u matches a lower case "u" character unless it is followed by four
+hexadecimal digits, in which case the hexadecimal number defines the code point
+to match. By default, \u causes a compile time error (Perl uses it to upper
+case the following character).
+</P>
+<P>
+(3) \x matches a lower case "x" character unless it is followed by two
+hexadecimal digits, in which case the hexadecimal number defines the code point
+to match. By default, as in Perl, a hexadecimal number is always expected after
+\x, but it may have zero, one, or two digits (so, for example, \xz matches a
+binary zero character followed by z).
+<pre>
+ PCRE2_AUTO_CALLOUT
+</pre>
+If this bit is set, <b>pcre2_compile()</b> automatically inserts callout items,
+all with number 255, before each pattern item. For discussion of the callout
+facility, see the
+<a href="pcre2callout.html"><b>pcre2callout</b></a>
+documentation.
+<pre>
+ PCRE2_CASELESS
+</pre>
+If this bit is set, letters in the pattern match both upper and lower case
+letters in the subject. It is equivalent to Perl's /i option, and it can be
+changed within a pattern by a (?i) option setting.
+<pre>
+ PCRE2_DOLLAR_ENDONLY
+</pre>
+If this bit is set, a dollar metacharacter in the pattern matches only at the
+end of the subject string. Without this option, a dollar also matches
+immediately before a newline at the end of the string (but not before any other
+newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is
+set. There is no equivalent to this option in Perl, and no way to set it within
+a pattern.
+<pre>
+ PCRE2_DOTALL
+</pre>
+If this bit is set, a dot metacharacter in the pattern matches any character,
+including one that indicates a newline. However, it only ever matches one
+character, even if newlines are coded as CRLF. Without this option, a dot does
+not match when the current position in the subject is at a newline. This option
+is equivalent to Perl's /s option, and it can be changed within a pattern by a
+(?s) option setting. A negative class such as [^a] always matches newline
+characters, independent of the setting of this option.
+<pre>
+ PCRE2_DUPNAMES
+</pre>
+If this bit is set, names used to identify capturing subpatterns need not be
+unique. This can be helpful for certain types of pattern when it is known that
+only one instance of the named subpattern can ever be matched. There are more
+details of named subpatterns below; see also the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation.
+<pre>
+ PCRE2_EXTENDED
+</pre>
+If this bit is set, most white space characters in the pattern are totally
+ignored except when escaped or inside a character class. However, white space
+is not allowed within sequences such as (?&#62; that introduce various
+parenthesized subpatterns, nor within numerical quantifiers such as {1,3}.
+Ignorable white space is permitted between an item and a following quantifier
+and between a quantifier and a following + that indicates possessiveness.
+</P>
+<P>
+PCRE2_EXTENDED also causes characters between an unescaped # outside a
+character class and the next newline, inclusive, to be ignored, which makes it
+possible to include comments inside complicated patterns. Note that the end of
+this type of comment is a literal newline sequence in the pattern; escape
+sequences that happen to represent a newline do not count. PCRE2_EXTENDED is
+equivalent to Perl's /x option, and it can be changed within a pattern by a
+(?x) option setting.
+</P>
+<P>
+Which characters are interpreted as newlines can be specified by a setting in
+the compile context that is passed to <b>pcre2_compile()</b> or by a special
+sequence at the start of the pattern, as described in the section entitled
+<a href="pcre2pattern.html#newlines">"Newline conventions"</a>
+in the <b>pcre2pattern</b> documentation. A default is defined when PCRE2 is
+built.
+<pre>
+ PCRE2_FIRSTLINE
+</pre>
+If this option is set, an unanchored pattern is required to match before or at
+the first newline in the subject string, though the matched text may continue
+over the newline.
+<pre>
+ PCRE2_MATCH_UNSET_BACKREF
+</pre>
+If this option is set, a back reference to an unset subpattern group matches an
+empty string (by default this causes the current matching alternative to fail).
+A pattern such as (\1)(a) succeeds when this option is set (assuming it can
+find an "a" in the subject), whereas it fails by default, for Perl
+compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka
+JavaScript).
+<pre>
+ PCRE2_MULTILINE
+</pre>
+By default, for the purposes of matching "start of line" and "end of line",
+PCRE2 treats the subject string as consisting of a single line of characters,
+even if it actually contains newlines. The "start of line" metacharacter (^)
+matches only at the start of the string, and the "end of line" metacharacter
+($) matches only at the end of the string, or before a terminating newline
+(except when PCRE2_DOLLAR_ENDONLY is set). Note, however, that unless
+PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a
+newline. This behaviour (for ^, $, and dot) is the same as Perl.
+</P>
+<P>
+When PCRE2_MULTILINE is set, the "start of line" and "end of line"
+constructs match immediately following or immediately before internal newlines
+in the subject string, respectively, as well as at the very start and end. This
+is equivalent to Perl's /m option, and it can be changed within a pattern by a
+(?m) option setting. If there are no newlines in a subject string, or no
+occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect.
+<pre>
+ PCRE2_NEVER_UCP
+</pre>
+This option locks out the use of Unicode properties for handling \B, \b, \D,
+\d, \S, \s, \W, \w, and some of the POSIX character classes, as described
+for the PCRE2_UCP option below. In particular, it prevents the creator of the
+pattern from enabling this facility by starting the pattern with (*UCP). This
+may be useful in applications that process patterns from external sources. The
+option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
+<pre>
+ PCRE2_NEVER_UTF
+</pre>
+This option locks out interpretation of the pattern as UTF-8, UTF-16, or
+UTF-32, depending on which library is in use. In particular, it prevents the
+creator of the pattern from switching to UTF interpretation by starting the
+pattern with (*UTF). This may be useful in applications that process patterns
+from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes
+an error.
+<pre>
+ PCRE2_NO_AUTO_CAPTURE
+</pre>
+If this option is set, it disables the use of numbered capturing parentheses in
+the pattern. Any opening parenthesis that is not followed by ? behaves as if it
+were followed by ?: but named parentheses can still be used for capturing (and
+they acquire numbers in the usual way). There is no equivalent of this option
+in Perl.
+<pre>
+ PCRE2_NO_AUTO_POSSESS
+</pre>
+If this option is set, it disables "auto-possessification", which is an
+optimization that, for example, turns a+b into a++b in order to avoid
+backtracks into a+ that can never be successful. However, if callouts are in
+use, auto-possessification means that some callouts are never taken. You can
+set this option if you want the matching functions to do a full unoptimized
+search and run all the callouts, but it is mainly provided for testing
+purposes.
+<pre>
+ PCRE2_NO_START_OPTIMIZE
+</pre>
+This is an option that acts at matching time; that is, it is really an option
+for <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. If it is set at compile
+time, it is remembered with the compiled pattern and assumed at matching time.
+This is necessary if you want to use JIT execution, because the JIT compiler
+needs to know whether or not this option is set. For details, see the
+discussion of PCRE2_NO_START_OPTIMIZE in the section on <b>pcre2_match()</b>
+options
+<a href="#matchoptions">below.</a>
+<pre>
+ PCRE2_NO_UTF_CHECK
+</pre>
+When PCRE2_UTF is set, the validity of the pattern as a UTF string is
+automatically checked. There are discussions about the validity of
+<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a>
+<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a>
+and
+<a href="pcre2unicode.html#utf32strings">UTF-32 strings</a>
+in the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+document.
+If an invalid UTF sequence is found, <b>pcre2_compile()</b> returns a negative
+error code.
+</P>
+<P>
+If you know that your pattern is valid, and you want to skip this check for
+performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set,
+the effect of passing an invalid UTF string as a pattern is undefined. It may
+cause your program to crash or loop. Note that this option can also be passed
+to <b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>, to suppress validity
+checking of the subject string.
+<pre>
+ PCRE2_UCP
+</pre>
+This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
+\w, and some of the POSIX character classes. By default, only ASCII characters
+are recognized, but if PCRE2_UCP is set, Unicode properties are used instead to
+classify characters. More details are given in the section on
+<a href="pcre2pattern.html#genericchartypes">generic character types</a>
+in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+page. If you set PCRE2_UCP, matching one of the items it affects takes much
+longer. The option is available only if PCRE2 has been compiled with UTF
+support.
+<pre>
+ PCRE2_UNGREEDY
+</pre>
+This option inverts the "greediness" of the quantifiers so that they are not
+greedy by default, but become greedy if followed by "?". It is not compatible
+with Perl. It can also be set by a (?U) option setting within the pattern.
+<pre>
+ PCRE2_UTF
+</pre>
+This option causes PCRE2 to regard both the pattern and the subject strings
+that are subsequently processed as strings of UTF characters instead of
+single-code-unit strings. However, it is available only when PCRE2 is built to
+include UTF support. If not, the use of this option provokes an error. Details
+of how this option changes the behaviour of PCRE2 are given in the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+page.
+</P>
+<br><a name="SEC16" href="#TOC1">COMPILATION ERROR CODES</a><br>
+<P>
+There are over 80 positive error codes that <b>pcre2_compile()</b> may return if
+it finds an error in the pattern. There are also some negative error codes that
+are used for invalid UTF strings. These are the same as given by
+<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>, and are described in the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+page. The <b>pcre2_get_error_message()</b> function can be called to obtain a
+textual error message from any error code.
+</P>
+<br><a name="SEC17" href="#TOC1">JUST-IN-TIME (JIT) COMPILATION</a><br>
+<P>
+<b>int pcre2_jit_compile(pcre2_code *<i>code</i>, uint32_t <i>options</i>);</b>
+<br>
+<br>
+<b>int pcre2_jit_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
+<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
+<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
+<b> pcre2_match_context *<i>mcontext</i>, pcre2_jit_stack *<i>jit_stack</i>);</b>
+<br>
+<br>
+<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *<i>gcontext</i>,</b>
+<b> PCRE2_SIZE <i>startsize</i>, PCRE2_SIZE <i>maxsize</i>);</b>
+<br>
+<br>
+<b>void pcre2_jit_stack_assign(const pcre2_code *<i>code</i>,</b>
+<b> pcre2_jit_callback <i>callback_function</i>, void *<i>callback_data</i>);</b>
+<br>
+<br>
+<b>void pcre2_jit_stack_free(pcre2_jit_stack *<i>jit_stack</i>);</b>
+</P>
+<P>
+These functions provide support for JIT compilation, which, if the just-in-time
+compiler is available, further processes a compiled pattern into machine code
+that executes much faster than the <b>pcre2_match()</b> interpretive matching
+function. Full details are given in the
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
+documentation.
+</P>
+<P>
+JIT compilation is a heavyweight optimization. It can take some time for
+patterns to be analyzed, and for one-off matches and simple patterns the
+benefit of faster execution might be offset by a much slower compilation time.
+Most, but not all, patterns can be optimized by the JIT compiler.
+<a name="localesupport"></a></P>
+<br><a name="SEC18" href="#TOC1">LOCALE SUPPORT</a><br>
+<P>
+PCRE2 handles caseless matching, and determines whether characters are letters,
+digits, or whatever, by reference to a set of tables, indexed by character code
+point. When running in UTF-8 mode, or using the 16-bit or 32-bit libraries,
+this applies only to characters with code points less than 256. By default,
+higher-valued code points never match escapes such as \w or \d. However, if
+PCRE2 is built with UTF support, all characters can be tested with \p and \P,
+or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled;
+this causes \w and friends to use Unicode property support instead of the
+built-in tables.
+</P>
+<P>
+The use of locales with Unicode is discouraged. If you are handling characters
+with code points greater than 128, you should either use Unicode support, or
+use locales, but not try to mix the two.
+</P>
+<P>
+PCRE2 contains an internal set of character tables that are used by default.
+These are sufficient for many applications. Normally, the internal tables
+recognize only ASCII characters. However, when PCRE2 is built, it is possible
+to cause the internal tables to be rebuilt in the default "C" locale of the
+local system, which may cause them to be different.
+</P>
+<P>
+The internal tables can be overridden by tables supplied by the application
+that calls PCRE2. These may be created in a different locale from the default.
+As more and more applications change to using Unicode, the need for this locale
+support is expected to die away.
+</P>
+<P>
+External tables are built by calling the <b>pcre2_maketables()</b> function, in
+the relevant locale. The result can be passed to <b>pcre2_compile()</b> as often
+as necessary, by creating a compile context and calling
+<b>pcre2_set_character_tables()</b> to set the tables pointer therein. For
+example, to build and use tables that are appropriate for the French locale
+(where accented characters with values greater than 128 are treated as
+letters), the following code could be used:
+<pre>
+ setlocale(LC_CTYPE, "fr_FR");
+ tables = pcre2_maketables(NULL);
+ ccontext = pcre2_compile_context_create(NULL);
+ pcre2_set_character_tables(ccontext, tables);
+ re = pcre2_compile(..., ccontext);
+</pre>
+The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
+are using Windows, the name for the French locale is "french". It is the
+caller's responsibility to ensure that the memory containing the tables remains
+available for as long as it is needed.
+</P>
+<P>
+The pointer that is passed (via the compile context) to <b>pcre2_compile()</b>
+is saved with the compiled pattern, and the same tables are used by
+<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b>. Thus, for any single pattern,
+compilation and matching both happen in the same locale, but different patterns
+can be processed in different locales.
+<a name="infoaboutpattern"></a></P>
+<br><a name="SEC19" href="#TOC1">INFORMATION ABOUT A COMPILED PATTERN</a><br>
+<P>
+<b>int pcre2_pattern_info(const pcre2_code *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
+</P>
+<P>
+The <b>pcre2_pattern_info()</b> function returns information about a compiled
+pattern. The first argument is a pointer to the compiled pattern. The second
+argument specifies which piece of information is required, and the third
+argument is a pointer to a variable to receive the data. The yield of the
+function is zero for success, or one of the following negative numbers:
+<pre>
+ PCRE2_ERROR_NULL the argument <i>code</i> was NULL
+ the argument <i>where</i> was NULL
+ PCRE2_ERROR_BADMAGIC the "magic number" was not found
+ PCRE2_ERROR_BADOPTION the value of <i>what</i> was invalid
+ PCRE2_ERROR_UNSET the requested field is not set
+</pre>
+The "magic number" is placed at the start of each compiled pattern as a simple
+check against passing an arbitrary memory pointer.
+Here is
+a typical call of <b>pcre2_pattern_info()</b>, to obtain the length of the compiled
+pattern:
+<pre>
+ int rc;
+ size_t length;
+ rc = pcre2_pattern_info(
+ re, /* result of pcre2_compile() */
+ PCRE2_INFO_SIZE, /* what is required */
+ &length); /* where to put the data */
+</pre>
+The possible values for the second argument are defined in <b>pcre2.h</b>, and
+are as follows:
+<pre>
+ PCRE2_INFO_ALLOPTIONS
+ PCRE2_INFO_ARGOPTIONS
+</pre>
+Return a copy of the pattern's options. The third argument should point to a
+<b>uint32_t</b> variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
+were passed to <b>pcre2_compile()</b>, whereas PCRE2_INFO_ALLOPTIONS returns
+the compile options as modified by any top-level option settings at the start
+of the pattern itself. In other words, they are the options that will be in
+force when matching starts. For example, if the pattern /(?im)abc(?-i)d/ is
+compiled with the PCRE2_EXTENDED option, the result is PCRE2_CASELESS,
+PCRE2_MULTILINE, and PCRE2_EXTENDED.
+</P>
+<P>
+A pattern is automatically anchored by PCRE2 if all of its top-level
+alternatives begin with one of the following:
+<pre>
+ ^ unless PCRE2_MULTILINE is set
+ \A always
+ \G always
+ .* if PCRE2_DOTALL is set and there are no back references to the subpattern in which .* appears
+</pre>
+For such patterns, the PCRE2_ANCHORED bit is set in the options returned for
+PCRE2_INFO_ALLOPTIONS.
+<pre>
+ PCRE2_INFO_BACKREFMAX
+</pre>
+Return the number of the highest back reference in the pattern. The third
+argument should point to a <b>uint32_t</b> variable. Zero is returned if there
+are no back references.
+<pre>
+ PCRE2_INFO_BSR
+</pre>
+The output is a uint32_t whose value indicates what character sequences the \R
+escape sequence matches by default. A value of 0 means that \R matches any
+Unicode line ending sequence; a value of 1 means that \R matches only CR, LF,
+or CRLF. The default can be overridden when a pattern is matched.
+<pre>
+ PCRE2_INFO_CAPTURECOUNT
+</pre>
+Return the number of capturing subpatterns in the pattern. The third argument
+should point to a <b>uint32_t</b> variable.
+<pre>
+ PCRE2_INFO_FIRSTCODETYPE
+</pre>
+Return information about the first code unit of any matched string, for a
+non-anchored pattern. The third argument should point to a <b>uint32_t</b>
+variable.
+</P>
+<P>
+If there is a fixed first value, for example, the letter "c" from a pattern
+such as (cat|cow|coyote), 1 is returned, and the character value can be
+retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, and
+if either
+<br>
+<br>
+(a) the pattern was compiled with the PCRE2_MULTILINE option, and every branch
+starts with "^", or
+<br>
+<br>
+(b) every branch of the pattern starts with ".*" and PCRE2_DOTALL is not set
+(if it were set, the pattern would be anchored),
+<br>
+<br>
+2 is returned, indicating that the pattern matches only at the start of a
+subject string or after any newline within the string. Otherwise 0 is
+returned. For anchored patterns, 0 is returned.
+<pre>
+ PCRE2_INFO_FIRSTCODEUNIT
+</pre>
+Return the value of the first code unit of any matched string in the situation
+where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third
+argument should point to a <b>uint32_t</b> variable. In the 8-bit library, the
+value is always less than 256. In the 16-bit library the value can be up to
+0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff,
+and up to 0xffffffff when not using UTF-32 mode.
+<pre>
+ PCRE2_INFO_FIRSTBITMAP
+</pre>
+In the absence of a single first code unit for a non-anchored pattern,
+<b>pcre2_compile()</b> may construct a 256-bit table that defines a fixed set of
+values for the first code unit in any match. For example, a pattern that starts
+with [abc] results in a table with three bits set. When code unit values
+greater than 255 are supported, the flag bit for 255 means "any code unit of
+value 255 or above". If such a table was constructed, a pointer to it is
+returned. Otherwise NULL is returned. The third argument should point to a
+<b>const uint8_t *</b> variable.
+<pre>
+ PCRE2_INFO_HASCRORLF
+</pre>
+Return 1 if the pattern contains any explicit matches for CR or LF characters,
+otherwise 0. The third argument should point to a <b>uint32_t</b> variable. An
+explicit match is either a literal CR or LF character, or \r or \n.
+<pre>
+ PCRE2_INFO_JCHANGED
+</pre>
+Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
+0. The third argument should point to a <b>uint32_t</b> variable. (?J) and
+(?-J) set and unset the local PCRE2_DUPNAMES option, respectively.
+<pre>
+ PCRE2_INFO_JITSIZE
+</pre>
+If the compiled pattern was successfully processed by
+<b>pcre2_jit_compile()</b>, return the size of the JIT compiled code, otherwise
+return zero. The third argument should point to a <b>size_t</b> variable.
+<pre>
+ PCRE2_INFO_LASTCODETYPE
+</pre>
+Returns 1 if there is a rightmost literal code unit that must exist in any
+matched string, other than at its start. The third argument should point to a
+<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
+returned, the code unit value itself can be retrieved using
+PCRE2_INFO_LASTCODEUNIT.
+</P>
+<P>
+For anchored patterns, a last literal value is recorded only if it follows
+something of variable length. For example, for the pattern /^a\d+z\d+/ the
+returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for
+/^a\dz\d/ the returned value is 0.
+<pre>
+ PCRE2_INFO_LASTCODEUNIT
+</pre>
+Return the value of the rightmost literal data unit that must exist in any
+matched string, other than at its start, if such a value has been recorded. The
+third argument should point to a <b>uint32_t</b> variable. If there is no such
+value, 0 is returned.
+<pre>
+ PCRE2_INFO_MATCHEMPTY
+</pre>
+Return 1 if the pattern can match an empty string, otherwise 0. The third
+argument should point to a <b>uint32_t</b> variable.
+<pre>
+ PCRE2_INFO_MATCHLIMIT
+</pre>
+If the pattern set a match limit by including an item of the form
+(*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument
+should point to an unsigned 32-bit integer. If no such value has been set, the
+call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
+<pre>
+ PCRE2_INFO_MAXLOOKBEHIND
+</pre>
+Return the number of characters (not code units) in the longest lookbehind
+assertion in the pattern. The third argument should point to an unsigned 32-bit
+integer. This information is useful when doing multi-segment matching using the
+partial matching facilities. Note that the simple assertions \b and \B
+require a one-character lookbehind. \A also registers a one-character
+lookbehind, though it does not actually inspect the previous character. This is
+to ensure that at least one character from the old segment is retained when a
+new segment is processed. Otherwise, if there are no lookbehinds in the
+pattern, \A might match incorrectly at the start of a new segment.
+<pre>
+ PCRE2_INFO_MINLENGTH
+</pre>
+If a minimum length for matching subject strings was computed, its value is
+returned. Otherwise the returned value is 0. The value is a number of
+characters, which in UTF mode may be different from the number of code units.
+The third argument should point to a <b>uint32_t</b> variable. The value is a
+lower bound to the length of any matching string. There may not be any strings
+of that length that do actually match, but every string that does match is at
+least that long.
+<pre>
+ PCRE2_INFO_NAMECOUNT
+ PCRE2_INFO_NAMEENTRYSIZE
+ PCRE2_INFO_NAMETABLE
+</pre>
+PCRE2 supports the use of named as well as numbered capturing parentheses. The
+names are just an additional way of identifying the parentheses, which still
+acquire numbers. Several convenience functions such as
+<b>pcre2_substring_get_byname()</b> are provided for extracting captured
+substrings by name. It is also possible to extract the data directly, by first
+converting the name to a number in order to access the correct pointers in the
+output vector (described with <b>pcre2_match()</b> below). To do the conversion,
+you need to use the name-to-number map, which is described by these three
+values.
+</P>
+<P>
+The map consists of a number of fixed-size entries. PCRE2_INFO_NAMECOUNT gives
+the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each
+entry; both of these return a <b>uint32_t</b> value. The entry size depends on
+the length of the longest name. PCRE2_INFO_NAMETABLE returns a pointer to the
+first entry of the table. This is a PCRE2_SPTR pointer to a block of code
+units. In the 8-bit library, the first two bytes of each entry are the number
+of the capturing parenthesis, most significant byte first. In the 16-bit
+library, the pointer points to 16-bit data units, the first of which contains
+the parenthesis number. In the 32-bit library, the pointer points to 32-bit
+data units, the first of which contains the parenthesis number. The rest of the
+entry is the corresponding name, zero terminated.
+</P>
+<P>
+The names are in alphabetical order. If (?| is used to create multiple groups
+with the same number, as described in the
+<a href="pcre2pattern.html#dupsubpatternnumber">section on duplicate subpattern numbers</a>
+in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+page, the groups may be given the same name, but there is only one entry in the
+table. Different names for groups of the same number are not permitted.
+</P>
+<P>
+Duplicate names for subpatterns with different numbers are permitted, but only
+if PCRE2_DUPNAMES is set. They appear in the table in the order in which they
+were found in the pattern. In the absence of (?| this is the order of
+increasing number; when (?| is used this is not necessarily the case because
+later subpatterns may have lower numbers.
+</P>
+<P>
+As a simple example of the name/number table, consider the following pattern
+after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white
+space - including newlines - is ignored):
+<pre>
+ (?&#60;date&#62; (?&#60;year&#62;(\d\d)?\d\d) - (?&#60;month&#62;\d\d) - (?&#60;day&#62;\d\d) )
+</pre>
+There are four named subpatterns, so the table has four entries, and each entry
+in the table is eight bytes long. The table is as follows, with non-printing
+bytes shown in hexadecimal, and undefined bytes shown as ??:
+<pre>
+ 00 01 d a t e 00 ??
+ 00 05 d a y 00 ?? ??
+ 00 04 m o n t h 00
+ 00 02 y e a r 00 ??
+</pre>
+When writing code to extract data from named subpatterns using the
+name-to-number map, remember that the length of the entries is likely to be
+different for each compiled pattern.
+<pre>
+ PCRE2_INFO_NEWLINE
+</pre>
+The output is a <b>uint32_t</b> whose value specifies the default character
+sequence that will be recognized as meaning "newline" while matching. The
+values are:
+<pre>
+ 1 Carriage return (CR)
+ 2 Linefeed (LF)
+ 3 Carriage return, linefeed (CRLF)
+ 4 Any Unicode line ending
+ 5 Any of CR, LF, or CRLF
+</pre>
+The default can be overridden when a pattern is matched.
+<pre>
+ PCRE2_INFO_RECURSIONLIMIT
+</pre>
+If the pattern set a recursion limit by including an item of the form
+(*LIMIT_RECURSION=nnnn) at the start, the value is returned. The third
+argument should point to an unsigned 32-bit integer. If no such value has been
+set, the call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
+<pre>
+ PCRE2_INFO_SIZE
+</pre>
+Return the size of the compiled pattern in bytes (for all three libraries). The
+third argument should point to a <b>size_t</b> variable. This value does not
+include the size of the <b>pcre2_code</b> structure that is returned by
+<b>pcre2_compile()</b>. The value that is used when <b>pcre2_compile()</b> is
+getting memory in which to place the compiled data is the value returned by
+this option plus the size of the <b>pcre2_code</b> structure. Processing a
+pattern with the JIT compiler does not alter the value returned by this option.
+<a name="matchdatablock"></a></P>
+<br><a name="SEC20" href="#TOC1">THE MATCH DATA BLOCK</a><br>
+<P>
+<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *<i>code</i>,</b>
+<b> pcre2_general_context *<i>gcontext</i>);</b>
+<br>
+<br>
+<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
+</P>
+<P>
+Information about successful and unsuccessful matches is placed in a match
+data block, which is an opaque structure that is accessed by function calls. In
+particular, the match data block contains a vector of offsets into the subject
+string that define the matched part of the subject and any substrings that were
+captured. This is known as the <i>ovector</i>.
+</P>
+<P>
+Before calling <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b> you must create a
+match data block by calling one of the creation functions above. For
+<b>pcre2_match_data_create()</b>, the first argument is the number of pairs of
+offsets in the <i>ovector</i>. One pair of offsets is required to identify the
+string that matched the whole pattern, with another pair for each captured
+substring. For example, a value of 4 creates enough space to record the
+matched portion of the subject plus three captured substrings.
+</P>
+<P>
+For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
+pointer to a compiled pattern. In this case the ovector is created to be
+exactly the right size to hold all the substrings a pattern might capture.
+</P>
+<P>
+The second argument of both these functions is a pointer to a general context,
+which can specify custom memory management for obtaining the memory for the
+match data block. If you are not using custom memory management, pass NULL.
+</P>
+<P>
+A match data block can be used many times, with the same or different compiled
+patterns. When it is no longer needed, it should be freed by calling
+<b>pcre2_match_data_free()</b>. How to extract information from a match data
+block after a match operation is described in the sections on
+<a href="#matchedstrings">matched strings</a>
+and
+<a href="#matchotherdata">other match data</a>
+below.
+</P>
+<br><a name="SEC21" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
+<P>
+<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
+<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
+<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
+<b> pcre2_match_context *<i>mcontext</i>);</b>
+</P>
+<P>
+The function <b>pcre2_match()</b> is called to match a subject string against a
+compiled pattern, which is passed in the <i>code</i> argument. You can call
+<b>pcre2_match()</b> with the same <i>code</i> argument as many times as you
+like, in order to find multiple matches in the subject string or to match
+different subject strings with the same pattern.
+</P>
+<P>
+This function is the main matching facility of the library, and it operates in
+a Perl-like manner. For specialist use there is also an alternative matching
+function, which is described
+<a href="#dfamatch">below</a>
+in the section about the <b>pcre2_dfa_match()</b> function.
+</P>
+<P>
+Here is an example of a simple call to <b>pcre2_match()</b>:
+<pre>
+ pcre2_match_data *match_data = pcre2_match_data_create(4, NULL);
+ int rc = pcre2_match(
+ re, /* result of pcre2_compile() */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ match_data, /* the match data block */
+ NULL); /* a match context; NULL means use defaults */
+</pre>
+If the subject string is zero-terminated, the length can be given as
+PCRE2_ZERO_TERMINATED. A match context must be provided if certain less common
+matching parameters are to be changed. For details, see the section on
+<a href="#matchcontext">the match context</a>
+above.
+</P>
+<br><b>
+The string to be matched by <b>pcre2_match()</b>
+</b><br>
+<P>
+The subject string is passed to <b>pcre2_match()</b> as a pointer in
+<i>subject</i>, a length in <i>length</i>, and a starting offset in
+<i>startoffset</i>. The length and offset are in code units, not characters.
+That is, they are in bytes for the 8-bit library, 16-bit code units for the
+16-bit library, and 32-bit code units for the 32-bit library, whether or not
+UTF processing is enabled.
+</P>
+<P>
+If <i>startoffset</i> is greater than the length of the subject,
+<b>pcre2_match()</b> returns PCRE2_ERROR_BADOFFSET. When the starting offset is
+zero, the search for a match starts at the beginning of the subject, and this
+is by far the most common case. In UTF-8 or UTF-16 mode, the starting offset
+must point to the start of a character, or to the end of the subject (in UTF-32
+mode, one code unit equals one character, so all offsets are valid). Like the
+pattern string, the subject may contain binary zeroes.
+</P>
+<P>
+A non-zero starting offset is useful when searching for another match in the
+same subject by calling <b>pcre2_match()</b> again after a previous success.
+Setting <i>startoffset</i> differs from passing over a shortened string and
+setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of
+lookbehind. For example, consider the pattern
+<pre>
+ \Biss\B
+</pre>
+which finds occurrences of "iss" in the middle of words. (\B matches only if
+the current position in the subject is not a word boundary.) When applied to
+the string "Mississippi" the first call to <b>pcre2_match()</b> finds the first
+occurrence. If <b>pcre2_match()</b> is called again with just the remainder of
+the subject, namely "issippi", it does not match, because \B is always false at
+the start of the subject, which is deemed to be a word boundary. However, if
+<b>pcre2_match()</b> is passed the entire string again, but with
+<i>startoffset</i> set to 4, it finds the second occurrence of "iss" because it
+is able to look behind the starting point to discover that it is preceded by a
+letter.
+</P>
+<P>
+Finding all the matches in a subject is tricky when the pattern can match an
+empty string. It is possible to emulate Perl's /g behaviour by first trying the
+match again at the same offset, with the PCRE2_NOTEMPTY_ATSTART and
+PCRE2_ANCHORED options, and then if that fails, advancing the starting offset
+and trying an ordinary match again. There is some code that demonstrates how to
+do this in the
+<a href="pcre2demo.html"><b>pcre2demo</b></a>
+sample program. In the most general case, you have to check to see if the
+newline convention recognizes CRLF as a newline, and if so, and the current
+character is CR followed by LF, advance the starting offset by two characters
+instead of one.
+</P>
+<P>
+If a non-zero starting offset is passed when the pattern is anchored, one
+attempt to match at the given offset is made. This can only succeed if the
+pattern does not require the match to be at the start of the subject.
+<a name="matchoptions"></a></P>
+<br><b>
+Option bits for <b>pcre2_match()</b>
+</b><br>
+<P>
+The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
+zero. The only bits that may be set are PCRE2_ANCHORED,
+PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
+PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and
+PCRE2_PARTIAL_SOFT. Their action is described below.
+</P>
+<P>
+If the pattern was successfully processed by the just-in-time (JIT) compiler,
+the only supported options for matching using the JIT code are PCRE2_NOTBOL,
+PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
+PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. If an unsupported option is used,
+JIT matching is disabled and the normal interpretive code in
+<b>pcre2_match()</b> is run.
+<pre>
+ PCRE2_ANCHORED
+</pre>
+The PCRE2_ANCHORED option limits <b>pcre2_match()</b> to matching at the first
+matching position. If a pattern was compiled with PCRE2_ANCHORED, or turned out
+to be anchored by virtue of its contents, it cannot be made unanchored at
+matching time. Note that setting the option at match time disables JIT
+matching.
+<pre>
+ PCRE2_NOTBOL
+</pre>
+This option specifies that the first character of the subject string is not the
+beginning of a line, so the circumflex metacharacter should not match before
+it. Setting this without PCRE2_MULTILINE (at compile time) causes circumflex
+never to match. This option affects only the behaviour of the circumflex
+metacharacter. It does not affect \A.
+<pre>
+ PCRE2_NOTEOL
+</pre>
+This option specifies that the end of the subject string is not the end of a
+line, so the dollar metacharacter should not match it nor (except in multiline
+mode) a newline immediately before it. Setting this without PCRE2_MULTILINE (at
+compile time) causes dollar never to match. This option affects only the
+behaviour of the dollar metacharacter. It does not affect \Z or \z.
+<pre>
+ PCRE2_NOTEMPTY
+</pre>
+An empty string is not considered to be a valid match if this option is set. If
+there are alternatives in the pattern, they are tried. If all the alternatives
+match the empty string, the entire match fails. For example, if the pattern
+<pre>
+ a?b?
+</pre>
+is applied to a string not beginning with "a" or "b", it matches an empty
+string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not
+valid, so PCRE2 searches further into the string for occurrences of "a" or "b".
+<pre>
+ PCRE2_NOTEMPTY_ATSTART
+</pre>
+This is like PCRE2_NOTEMPTY, except that an empty string match that is not at
+the start of the subject is permitted. If the pattern is anchored, such a match
+can occur only if the pattern contains \K.
+<pre>
+ PCRE2_NO_START_OPTIMIZE
+</pre>
+There are a number of optimizations that <b>pcre2_match()</b> uses at the start
+of a match, in order to speed up the process. For example, if it is known that
+an unanchored match must start with a specific character, it searches the
+subject for that character, and fails immediately if it cannot find it, without
+actually running the main matching function. This means that a special item
+such as (*COMMIT) at the start of a pattern is not considered until after a
+suitable starting point for the match has been found. Also, when callouts or
+(*MARK) items are in use, these "start-up" optimizations can cause them to be
+skipped if the pattern is never actually used. The start-up optimizations are
+in effect a pre-scan of the subject that takes place before the pattern is run.
+</P>
+<P>
+The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
+possibly causing performance to suffer, but ensuring that in cases where the
+result is "no match", the callouts do occur, and that items such as (*COMMIT)
+and (*MARK) are considered at every possible starting position in the subject
+string. If PCRE2_NO_START_OPTIMIZE is set at compile time, it cannot be unset
+at matching time. The use of PCRE2_NO_START_OPTIMIZE at matching time (that is,
+passing it to <b>pcre2_match()</b>) disables JIT execution; in this situation,
+matching is always done interpretively.
+</P>
+<P>
+Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching operation.
+Consider the pattern
+<pre>
+ (*COMMIT)ABC
+</pre>
+When this is compiled, PCRE2 records the fact that a match must start with the
+character "A". Suppose the subject string is "DEFABC". The start-up
+optimization scans along the subject, finds "A" and runs the first match
+attempt from there. The (*COMMIT) item means that the pattern must match the
+current starting position, which in this case, it does. However, if the same
+match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
+subject string does not happen. The first match attempt is run starting from
+"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
+the overall result is "no match". There are also other start-up optimizations.
+For example, a minimum length for the subject may be recorded. Consider the
+pattern
+<pre>
+ (*MARK:A)(X|Y)
+</pre>
+The minimum length for a match is one character. If the subject is "ABC", there
+will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
+string at the end of the subject does not take place, because PCRE2 knows that
+the subject is now too short, and so the (*MARK) is never encountered. In this
+case, the optimization does not affect the overall match result, which is still
+"no match", but it does affect the auxiliary information that is returned.
+<pre>
+ PCRE2_NO_UTF_CHECK
+</pre>
+When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
+string is checked by default when <b>pcre2_match()</b> is subsequently called.
+The entire string is checked before any other processing takes place, and a
+negative error code is returned if the check fails. There are several UTF error
+codes for each code unit width, corresponding to different problems with the
+code unit sequence. The value of <i>startoffset</i> is also checked, to ensure
+that it points to the start of a character or to the end of the subject. There
+are discussions about the validity of
+<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a>
+<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a>
+and
+<a href="pcre2unicode.html#utf32strings">UTF-32 strings</a>
+in the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+page.
+</P>
+<P>
+If you know that your subject is valid, and you want to skip these checks for
+performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling
+<b>pcre2_match()</b>. You might want to do this for the second and subsequent
+calls to <b>pcre2_match()</b> if you are making repeated calls to find all the
+matches in a single subject string.
+</P>
+<P>
+NOTE: When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid string
+as a subject, or an invalid value of <i>startoffset</i>, is undefined. Your
+program may crash or loop indefinitely.
+<pre>
+ PCRE2_PARTIAL_HARD
+ PCRE2_PARTIAL_SOFT
+</pre>
+These options turn on the partial matching feature. A partial match occurs if
+the end of the subject string is reached successfully, but there are not enough
+subject characters to complete the match. If this happens when
+PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set, matching continues by
+testing any remaining alternatives. Only if no complete match can be found is
+PCRE2_ERROR_PARTIAL returned instead of PCRE2_ERROR_NOMATCH. In other words,
+PCRE2_PARTIAL_SOFT says that the caller is prepared to handle a partial match,
+but only if no complete match can be found.
+</P>
+<P>
+If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if
+a partial match is found, <b>pcre2_match()</b> immediately returns
+PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other
+words, when PCRE2_PARTIAL_HARD is set, a partial match is considered to be more
+important than an alternative complete match.
+</P>
+<P>
+There is a more detailed discussion of partial and multi-segment matching, with
+examples, in the
+<a href="pcre2partial.html"><b>pcre2partial</b></a>
+documentation.
+</P>
+<br><a name="SEC22" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
+<P>
+When PCRE2 is built, a default newline convention is set; this is usually the
+standard convention for the operating system. The default can be overridden in
+either a
+<a href="#compilecontext">compile context</a>
+or a
+<a href="#matchcontext">match context.</a>
+However, changing the newline convention at match time disables JIT matching.
+During matching, the newline choice affects the behaviour of the dot,
+circumflex, and dollar metacharacters. It may also alter the way the match
+position is advanced after a match failure for an unanchored pattern.
+</P>
+<P>
+When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set,
+and a match attempt for an unanchored pattern fails when the current position
+is at a CRLF sequence, and the pattern contains no explicit matches for CR or
+LF characters, the match position is advanced by two characters instead of one,
+in other words, to after the CRLF.
+</P>
+<P>
+The above rule is a compromise that makes the most common cases work as
+expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is
+not set), it does not match the string "\r\nA" because, after failing at the
+start, it skips both the CR and the LF before retrying. However, the pattern
+[\r\n]A does match that string, because it contains an explicit CR or LF
+reference, and so advances only by one character after the first failure.
+</P>
+<P>
+An explicit match for CR or LF is either a literal appearance of one of those
+characters in the pattern, or one of the \r or \n escape sequences. Implicit
+matches such as [^X] do not count, nor does \s (which includes CR and LF in
+the characters that it matches).
+</P>
+<P>
+Notwithstanding the above, anomalous effects may still occur when CRLF is a
+valid newline sequence and explicit \r or \n escapes appear in the pattern.
+<a name="matchedstrings"></a></P>
+<br><a name="SEC23" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
+<P>
+<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b>
+</P>
+<P>
+In general, a pattern matches a certain portion of the subject, and in
+addition, further substrings from the subject may be picked out by
+parenthesized parts of the pattern. Following the usage in Jeffrey Friedl's
+book, this is called "capturing" in what follows, and the phrase "capturing
+subpattern" is used for a fragment of a pattern that picks out a substring.
+PCRE2 supports several other kinds of parenthesized subpattern that do not
+cause substrings to be captured. The <b>pcre2_pattern_info()</b> function can be
+used to find out how many capturing subpatterns there are in a compiled
+pattern.
+</P>
+<P>
+The overall matched string and any captured substrings are returned to the
+caller via a vector of PCRE2_SIZE values, called the <b>ovector</b>. This is
+contained within the
+<a href="#matchdatablock">match data block.</a>
+You can obtain direct access to the ovector by calling
+<b>pcre2_get_ovector_pointer()</b> to find its address, and
+<b>pcre2_get_ovector_count()</b> to find the number of pairs of values it
+contains. Alternatively, you can use the auxiliary functions for accessing
+captured substrings
+<a href="#extractbynumber">by number</a>
+or
+<a href="#extractbyname">by name</a>
+(see below).
+</P>
+<P>
+Within the ovector, the first in each pair of values is set to the offset of
+the first code unit of a substring, and the second is set to the offset of the
+first code unit after the end of a substring. These values are always code unit
+offsets, not character offsets. That is, they are byte offsets in the 8-bit
+library, 16-bit offsets in the 16-bit library, and 32-bit offsets in the 32-bit
+library.
+</P>
+<P>
+The first pair of offsets (that is, <i>ovector[0]</i> and <i>ovector[1]</i>)
+identifies the portion of the subject string that was matched by the entire
+pattern. The next pair is used for the first capturing subpattern, and so on.
+The value returned by <b>pcre2_match()</b> is one more than the highest numbered
+pair that has been set. For example, if two substrings have been captured, the
+returned value is 3. If there are no capturing subpatterns, the return value
+from a successful match is 1, indicating that just the first pair of offsets
+has been set.
+</P>
+<P>
+If a capturing subpattern is matched repeatedly within a single match
+operation, it is the last portion of the string that it matched that is
+returned.
+</P>
+<P>
+If the ovector is too small to hold all the captured substring offsets, as much
+as possible is filled in, and the function returns a value of zero. If neither
+the actual string matched nor any captured substrings are of interest,
+<b>pcre2_match()</b> may be called with a match data block whose ovector is of
+zero length. However, if the pattern contains back references and the
+<i>ovector</i> is not big enough to remember the related substrings, PCRE2 has
+to get additional memory for use during matching. Thus it is usually advisable
+to set up a match data block containing an ovector of reasonable size.
+</P>
+<P>
+It is possible for capturing subpattern number <i>n+1</i> to match some part of
+the subject when subpattern <i>n</i> has not been used at all. For example, if
+the string "abc" is matched against the pattern (a|(z))(bc) the return from the
+function is 4, and subpatterns 1 and 3 are matched, but 2 is not. When this
+happens, both values in the offset pairs corresponding to unused subpatterns
+are set to PCRE2_UNSET.
+</P>
+<P>
+Offset values that correspond to unused subpatterns at the end of the
+expression are also set to PCRE2_UNSET. For example, if the string "abc" is
+matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not matched.
+The return from the function is 2, because the highest used capturing
+subpattern number is 1. The offsets for the second and third capturing
+subpatterns (assuming the vector is large enough, of course) are set to
+PCRE2_UNSET.
+</P>
+<P>
+Elements in the ovector that do not correspond to capturing parentheses in the
+pattern are never changed. That is, if a pattern contains <i>n</i> capturing
+parentheses, no more than <i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by
+<b>pcre2_match()</b>. The other elements retain whatever values they previously
+had.
+<a name="matchotherdata"></a></P>
+<br><b>
+Other information about the match
+</b><br>
+<P>
+<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
+<br>
+<br>
+<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
+</P>
+<P>
+In addition to the offsets in the ovector, other information about a match is
+retained in the match data block and can be retrieved by the above functions.
+</P>
+<P>
+When a (*MARK) name is to be passed back, <b>pcre2_get_mark()</b> returns a
+pointer to the zero-terminated name, which is within the compiled pattern.
+Otherwise NULL is returned. A (*MARK) name may be available after a failed
+match or a partial match, as well as after a successful one.
+</P>
+<P>
+The other three functions yield values that give information about the part of
+the subject string that was inspected during a successful match or a partial
+match. Their results are undefined after a failed match. They return the
+following values, respectively:
+<br>
+<br>
+(1) The offset of the leftmost character that was inspected during the match.
+This can be earlier than the point at which the match started if the pattern
+contains lookbehind assertions or \b or \B at the start.
+<br>
+<br>
+(2) The offset of the character that follows the rightmost character that was
+inspected during the match. This can be after the end of the match if the
+pattern contains lookahead assertions.
+<br>
+<br>
+(3) The offset of the character at which the successful or partial match
+started. This can be different to the value of <i>ovector[0]</i> if the pattern
+contains the \K escape sequence.
+</P>
+<P>
+For example, if the pattern (?&#60;=abc)xx\Kyy(?=def) is matched against the
+string "123abcxxyydef123", the resulting offsets are:
+<pre>
+ ovector[0] 8
+ ovector[1] 10
+ leftchar 3
+ rightchar 13
+ startchar 6
+</pre>
+The <b>allusedtext</b> modifier in <b>pcre2test</b> can be used to display a
+longer string that shows the leftmost and rightmost characters in a match
+instead of just the matched string.
+<a name="errorlist"></a></P>
+<br><b>
+Error return values from <b>pcre2_match()</b>
+</b><br>
+<P>
+If <b>pcre2_match()</b> fails, it returns a negative number. This can be
+converted to a text string by calling <b>pcre2_get_error_message()</b>. Negative
+error codes are also returned by other functions, and are documented with them.
+The codes are given names in the header file. If UTF checking is in force and
+an invalid UTF subject string is detected, one of a number of UTF-specific
+negative error codes is returned. Details are given in the
+<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
+page. The following are the other errors that may be returned by
+<b>pcre2_match()</b>:
+<pre>
+ PCRE2_ERROR_NOMATCH
+</pre>
+The subject string did not match the pattern.
+<pre>
+ PCRE2_ERROR_PARTIAL
+</pre>
+The subject string did not match, but it did match partially. See the
+<a href="pcre2partial.html"><b>pcre2partial</b></a>
+documentation for details of partial matching.
+<pre>
+ PCRE2_ERROR_BADMAGIC
+</pre>
+PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to
+catch the case when it is passed a junk pointer. This is the error that is
+returned when the magic number is not present.
+<pre>
+ PCRE2_ERROR_BADMODE
+</pre>
+This error is given when a pattern that was compiled by the 8-bit library is
+passed to a 16-bit or 32-bit library function, or vice versa.
+<pre>
+ PCRE2_ERROR_BADOFFSET
+</pre>
+The value of <i>startoffset</i> was greater than the length of the subject.
+<pre>
+ PCRE2_ERROR_BADOPTION
+</pre>
+An unrecognized bit was set in the <i>options</i> argument.
+<pre>
+ PCRE2_ERROR_BADUTFOFFSET
+</pre>
+The UTF code unit sequence that was passed as a subject was checked and found
+to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of
+<i>startoffset</i> did not point to the beginning of a UTF character or the end
+of the subject.
+<pre>
+ PCRE2_ERROR_CALLOUT
+</pre>
+This error is never generated by <b>pcre2_match()</b> itself. It is provided for
+use by callout functions that want to cause <b>pcre2_match()</b> to return a
+distinctive error code. See the
+<a href="pcre2callout.html"><b>pcre2callout</b></a>
+documentation for details.
+<pre>
+ PCRE2_ERROR_INTERNAL
+</pre>
+An unexpected internal error has occurred. This error could be caused by a bug
+in PCRE2 or by overwriting of the compiled pattern.
+<pre>
+ PCRE2_ERROR_JIT_BADOPTION
+</pre>
+This error is returned when a pattern that was successfully studied using JIT
+is being matched, but the matching mode (partial or complete match) does not
+correspond to any JIT compilation mode. When the JIT fast path function is
+used, this error may be also given for invalid options. See the
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
+documentation for more details.
+<pre>
+ PCRE2_ERROR_JIT_STACKLIMIT
+</pre>
+This error is returned when a pattern that was successfully studied using JIT
+is being matched, but the memory available for the just-in-time processing
+stack is not large enough. See the
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
+documentation for more details.
+<pre>
+ PCRE2_ERROR_MATCHLIMIT
+</pre>
+The backtracking limit was reached.
+<pre>
+ PCRE2_ERROR_NOMEMORY
+</pre>
+If a pattern contains back references, but the ovector is not big enough to
+remember the referenced substrings, PCRE2 gets a block of memory at the start
+of matching to use for this purpose. There are some other special cases where
+extra memory is needed during matching. This error is given when memory cannot
+be obtained.
+<pre>
+ PCRE2_ERROR_NULL
+</pre>
+Either the <i>code</i>, <i>subject</i>, or <i>match_data</i> argument was passed
+as NULL.
+<pre>
+ PCRE2_ERROR_RECURSELOOP
+</pre>
+This error is returned when <b>pcre2_match()</b> detects a recursion loop within
+the pattern. Specifically, it means that either the whole pattern or a
+subpattern has been called recursively for the second time at the same position
+in the subject string. Some simple patterns that might do this are detected and
+faulted at compile time, but more complicated cases, in particular mutual
+recursions between two different subpatterns, cannot be detected until run
+time.
+<pre>
+ PCRE2_ERROR_RECURSIONLIMIT
+</pre>
+The internal recursion limit was reached.
+<a name="extractbynumber"></a></P>
+<br><a name="SEC24" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
+<P>
+<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
+<b> unsigned int <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_copy_bynumber(pcre2_match_data *<i>match_data</i>,</b>
+<b> unsigned int <i>number</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
+<b> PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_get_bynumber(pcre2_match_data *<i>match_data</i>,</b>
+<b> unsigned int <i>number</i>, PCRE2_UCHAR **<i>bufferptr</i>,</b>
+<b> PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>void pcre2_substring_free(PCRE2_UCHAR *<i>buffer</i>);</b>
+</P>
+<P>
+Captured substrings can be accessed directly by using the ovector as described
+<a href="#matchedstrings">above.</a>
+For convenience, auxiliary functions are provided for extracting captured
+substrings as new, separate, zero-terminated strings. The functions in this
+section identify substrings by number. The next section describes similar
+functions for extracting substrings by name. A substring that contains a binary
+zero is correctly extracted and has a further zero added on the end, but the
+result is not, of course, a C string.
+</P>
+<P>
+You can find the length in code units of a captured substring without
+extracting it by calling <b>pcre2_substring_length_bynumber()</b>. The first
+argument is a pointer to the match data block, the second is the group number,
+and the third is a pointer to a variable into which the length is placed.
+</P>
+<P>
+The <b>pcre2_substring_copy_bynumber()</b> function copies one string into a
+supplied buffer, whereas <b>pcre2_substring_get_bynumber()</b> copies it into
+new memory, obtained using the same memory allocation function that was used
+for the match data block. The first two arguments of these functions are a
+pointer to the match data block and a capturing group number. A group number of
+zero extracts the substring that matched the entire pattern, and higher values
+extract the captured substrings.
+</P>
+<P>
+The final arguments of <b>pcre2_substring_copy_bynumber()</b> are a pointer to
+the buffer and a pointer to a variable that contains its length in code units.
+This is updated to contain the actual number of code units used, excluding the
+terminating zero.
+</P>
+<P>
+For <b>pcre2_substring_get_bynumber()</b> the third and fourth arguments point
+to variables that are updated with a pointer to the new memory and the number
+of code units that comprise the substring, again excluding the terminating
+zero. When the substring is no longer needed, the memory should be freed by
+calling <b>pcre2_substring_free()</b>.
+</P>
+<P>
+The return value from these functions is zero for success, or one of these
+error codes:
+<pre>
+ PCRE2_ERROR_NOMEMORY
+</pre>
+The buffer was too small for <b>pcre2_substring_copy_bynumber()</b>, or the
+attempt to get memory failed for <b>pcre2_substring_get_bynumber()</b>.
+<pre>
+ PCRE2_ERROR_NOSUBSTRING
+</pre>
+No substring with the given number was captured. This could be because there is
+no capturing group of that number in the pattern, or because the group with
+that number did not participate in the match, or because the ovector was too
+small to capture that group.
+</P>
+<br><a name="SEC25" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
+<P>
+<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
+<br>
+<br>
+<b>void pcre2_substring_list_free(PCRE2_SPTR *<i>list</i>);</b>
+</P>
+<P>
+The <b>pcre2_substring_list_get()</b> function extracts all available substrings
+and builds a list of pointers to them, and a second list that contains their
+lengths (in code units), excluding a terminating zero that is added to each of
+them. All this is done in a single block of memory that is obtained using the
+same memory allocation function that was used to get the match data block.
+</P>
+<P>
+The address of the memory block is returned via <i>listptr</i>, which is also
+the start of the list of string pointers. The end of the list is marked by a
+NULL pointer. The address of the list of lengths is returned via
+<i>lengthsptr</i>. If your strings do not contain binary zeros and you do not
+therefore need the lengths, you may supply NULL as the <b>lengthsptr</b>
+argument to disable the creation of a list of lengths. The yield of the
+function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the memory block
+could not be obtained. When the list is no longer needed, it should be freed by
+calling <b>pcre2_substring_list_free()</b>.
+</P>
+<P>
+If this function encounters a substring that is unset, which can happen when
+capturing subpattern number <i>n+1</i> matches some part of the subject, but
+subpattern <i>n</i> has not been used at all, it returns an empty string. This
+can be distinguished from a genuine zero-length substring by inspecting the
+appropriate offset in the ovector, which contains PCRE2_UNSET for unset
+substrings.
+<a name="extractbynname"></a></P>
+<br><a name="SEC26" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
+<P>
+<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
+<b> PCRE2_SPTR <i>name</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_length_byname(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_SIZE *<i>length</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_copy_byname(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR *<i>buffer</i>, PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>int pcre2_substring_get_byname(pcre2_match_data *<i>match_data</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_UCHAR **<i>bufferptr</i>, PCRE2_SIZE *<i>bufflen</i>);</b>
+<br>
+<br>
+<b>void pcre2_substring_free(PCRE2_UCHAR *<i>buffer</i>);</b>
+</P>
+<P>
+To extract a substring by name, you first have to find its associated number.
+For example, for this pattern:
+<pre>
+ (a+)b(?&#60;xxx&#62;\d+)...
+</pre>
+the number of the subpattern called "xxx" is 2. If the name is known to be
+unique (PCRE2_DUPNAMES was not set), you can find the number from the name by
+calling <b>pcre2_substring_number_from_name()</b>. The first argument is the
+compiled pattern, and the second is the name. The yield of the function is the
+subpattern number, or PCRE2_ERROR_NOSUBSTRING if there is no subpattern of that
+name.
+</P>
+<P>
+Given the number, you can extract the substring directly, or use one of the
+functions described in the previous section. For convenience, there are also
+"byname" functions that correspond to the "bynumber" functions, the only
+difference being that the second argument is a name instead of a number.
+However, if PCRE2_DUPNAMES is set and there are duplicate names,
+the behaviour may not be what you want (see the next section).
+</P>
+<P>
+<b>Warning:</b> If the pattern uses the (?| feature to set up multiple
+subpatterns with the same number, as described in the
+<a href="pcre2pattern.html#dupsubpatternnumber">section on duplicate subpattern numbers</a>
+in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+page, you cannot use names to distinguish the different subpatterns, because
+names are not included in the compiled code. The matching process uses only
+numbers. For this reason, the use of different names for subpatterns of the
+same number causes an error at compile time.
+</P>
+<br><a name="SEC27" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
+<P>
+<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
+<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
+</P>
+<P>
+When a pattern is compiled with the PCRE2_DUPNAMES option, names for
+subpatterns are not required to be unique. Duplicate names are always allowed
+for subpatterns with the same number, created by using the (?| feature. Indeed,
+if such subpatterns are named, they are required to use the same names.
+</P>
+<P>
+Normally, patterns with duplicate names are such that in any one match, only
+one of the named subpatterns participates. An example is shown in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation.
+</P>
+<P>
+When duplicates are present, <b>pcre2_substring_copy_byname()</b> and
+<b>pcre2_substring_get_byname()</b> return the first substring corresponding to
+the given name that is set. If none are set, PCRE2_ERROR_NOSUBSTRING is
+returned. The <b>pcre2_substring_number_from_name()</b> function returns one of
+the numbers that are associated with the name, but it is not defined which it
+is.
+</P>
+<P>
+If you want to get full details of all captured substrings for a given name,
+you must use the <b>pcre2_substring_nametable_scan()</b> function. The first
+argument is the compiled pattern, and the second is the name. If the third and
+fourth arguments are NULL, the function returns a group number (it is not
+defined which). Otherwise, the third and fourth arguments must be pointers to
+variables that are updated by the function. After it has run, they point to the
+first and last entries in the name-to-number table for the given name, and the
+function returns the length of each entry. In both cases,
+PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
+</P>
+<P>
+The format of the name table is described in the section entitled
+<i>Information about a pattern</i>
+<a href="#infoaboutpattern">above.</a>
+Given all the relevant entries for the name, you can extract each of their
+numbers, and hence the captured data.
+</P>
+<br><a name="SEC28" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>
+<P>
+The traditional matching function uses a similar algorithm to Perl, which stops
+when it finds the first match, starting at a given point in the subject. If you
+want to find all possible matches, or the longest possible match at a given
+position, consider using the alternative matching function (see below) instead.
+If you cannot use the alternative function, you can kludge it up by making use
+of the callout facility, which is described in the
+<a href="pcre2callout.html"><b>pcre2callout</b></a>
+documentation.
+</P>
+<P>
+What you have to do is to insert a callout right at the end of the pattern.
+When your callout function is called, extract and save the current matched
+substring. Then return 1, which forces <b>pcre2_match()</b> to backtrack and try
+other alternatives. Ultimately, when it runs out of matches,
+<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
+<a name="dfamatch"></a></P>
+<br><a name="SEC29" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
+<P>
+<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
+<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
+<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
+<b> pcre2_match_context *<i>mcontext</i>,</b>
+<b> int *<i>workspace</i>, PCRE2_SIZE <i>wscount</i>);</b>
+</P>
+<P>
+The function <b>pcre2_dfa_match()</b> is called to match a subject string
+against a compiled pattern, using a matching algorithm that scans the subject
+string just once, and does not backtrack. This has different characteristics to
+the normal algorithm, and is not compatible with Perl. Some of the features of
+PCRE2 patterns are not supported. Nevertheless, there are times when this kind
+of matching can be useful. For a discussion of the two matching algorithms, and
+a list of features that <b>pcre2_dfa_match()</b> does not support, see the
+<a href="pcre2matching.html"><b>pcre2matching</b></a>
+documentation.
+</P>
+<P>
+The arguments for the <b>pcre2_dfa_match()</b> function are the same as for
+<b>pcre2_match()</b>, plus two extras. The ovector within the match data block
+is used in a different way, and this is described below. The other common
+arguments are used in the same way as for <b>pcre2_match()</b>, so their
+description is not repeated here.
+</P>
+<P>
+The two additional arguments provide workspace for the function. The workspace
+vector should contain at least 20 elements. It is used for keeping track of
+multiple paths through the pattern tree. More workspace is needed for patterns
+and subjects where there are a lot of potential matches.
+</P>
+<P>
+Here is an example of a simple call to <b>pcre2_dfa_match()</b>:
+<pre>
+ int wspace[20];
+ pcre2_match_data *match_data = pcre2_match_data_create(4, NULL);
+ int rc = pcre2_dfa_match(
+ re, /* result of pcre2_compile() */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ match_data, /* the match data block */
+ NULL, /* a match context; NULL means use defaults */
+ wspace, /* working space vector */
+ 20); /* number of elements (NOT size in bytes) */
+</PRE>
+</P>
+<br><b>
+Option bits for <b>pcre2_dfa_match()</b>
+</b><br>
+<P>
+The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
+be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
+PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
+PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
+PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are
+exactly the same as for <b>pcre2_match()</b>, so their description is not
+repeated here.
+<pre>
+ PCRE2_PARTIAL_HARD
+ PCRE2_PARTIAL_SOFT
+</pre>
+These have the same general effect as they do for <b>pcre2_match()</b>, but the
+details are slightly different. When PCRE2_PARTIAL_HARD is set for
+<b>pcre2_dfa_match()</b>, it returns PCRE2_ERROR_PARTIAL if the end of the
+subject is reached and there is still at least one matching possibility that
+requires additional characters. This happens even if some complete matches have
+already been found. When PCRE2_PARTIAL_SOFT is set, the return code
+PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the
+subject is reached, there have been no complete matches, but there is still at
+least one matching possibility. The portion of the string that was inspected
+when the longest partial match was found is set as the first matching string in
+both cases. There is a more detailed discussion of partial and multi-segment
+matching, with examples, in the
+<a href="pcre2partial.html"><b>pcre2partial</b></a>
+documentation.
+<pre>
+ PCRE2_DFA_SHORTEST
+</pre>
+Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as
+soon as it has found one match. Because of the way the alternative algorithm
+works, this is necessarily the shortest possible match at the first possible
+matching point in the subject string.
+<pre>
+ PCRE2_DFA_RESTART
+</pre>
+When <b>pcre2_dfa_match()</b> returns a partial match, it is possible to call it
+again, with additional subject characters, and have it continue with the same
+match. The PCRE2_DFA_RESTART option requests this action; when it is set, the
+<i>workspace</i> and <i>wscount</i> arguments must reference the same vector as
+before because data about the match so far is left in them after a partial
+match. There is more discussion of this facility in the
+<a href="pcre2partial.html"><b>pcre2partial</b></a>
+documentation.
+</P>
+<br><b>
+Successful returns from <b>pcre2_dfa_match()</b>
+</b><br>
+<P>
+When <b>pcre2_dfa_match()</b> succeeds, it may have matched more than one
+substring in the subject. Note, however, that all the matches from one run of
+the function start at the same point in the subject. The shorter matches are
+all initial substrings of the longer matches. For example, if the pattern
+<pre>
+ &#60;.*&#62;
+</pre>
+is matched against the string
+<pre>
+ This is &#60;something&#62; &#60;something else&#62; &#60;something further&#62; no more
+</pre>
+the three matched strings are
+<pre>
+ &#60;something&#62;
+ &#60;something&#62; &#60;something else&#62;
+ &#60;something&#62; &#60;something else&#62; &#60;something further&#62;
+</pre>
+On success, the yield of the function is a number greater than zero, which is
+the number of matched substrings. The offsets of the substrings are returned in
+the ovector, and can be extracted in the same way as for <b>pcre2_match()</b>.
+They are returned in reverse order of length; that is, the longest
+matching string is given first. If there were too many matches to fit into
+the ovector, the yield of the function is zero, and the vector is filled with
+the longest matches.
+</P>
+<P>
+NOTE: PCRE2's "auto-possessification" optimization usually applies to character
+repeats at the end of a pattern (as well as internally). For example, the
+pattern "a\d+" is compiled as if it were "a\d++" because there is no point in
+backtracking into the repeated digits. For DFA matching, this means that only
+one possible match is found. If you really do want multiple matches in such
+cases, either use an ungreedy repeat ("a\d+?") or set the
+PCRE2_NO_AUTO_POSSESS option when compiling.
+</P>
+<br><b>
+Error returns from <b>pcre2_dfa_match()</b>
+</b><br>
+<P>
+The <b>pcre2_dfa_match()</b> function returns a negative number when it fails.
+Many of the errors are the same as for <b>pcre2_match()</b>, as described
+<a href="#errorlist">above.</a>
+There are in addition the following errors that are specific to
+<b>pcre2_dfa_match()</b>:
+<pre>
+ PCRE2_ERROR_DFA_UITEM
+</pre>
+This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
+pattern that it does not support, for instance, the use of \C or a back
+reference.
+<pre>
+ PCRE2_ERROR_DFA_UCOND
+</pre>
+This return is given if <b>pcre2_dfa_match()</b> encounters a condition item
+that uses a back reference for the condition, or a test for recursion in a
+specific group. These are not supported.
+<pre>
+ PCRE2_ERROR_DFA_WSSIZE
+</pre>
+This return is given if <b>pcre2_dfa_match()</b> runs out of space in the
+<i>workspace</i> vector.
+<pre>
+ PCRE2_ERROR_DFA_RECURSE
+</pre>
+When a recursive subpattern is processed, the matching function calls itself
+recursively, using private memory for the ovector and <i>workspace</i>. This
+error is given if the internal ovector is not large enough. This should be
+extremely rare, as a vector of size 1000 is used.
+<pre>
+ PCRE2_ERROR_DFA_BADRESTART
+</pre>
+When <b>pcre2_dfa_match()</b> is called with the PCRE2_DFA_RESTART option,
+some plausibility checks are made on the contents of the workspace, which
+should contain data about the previous partial match. If any of these checks
+fail, this error is given.
+</P>
+<br><a name="SEC30" href="#TOC1">SEE ALSO</a><br>
+<P>
+<b>pcre2build</b>(3), <b>pcre2libs</b>(3), <b>pcre2callout</b>(3),
+<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
+<b>pcre2demo</b>(3), <b>pcre2sample</b>(3), <b>pcre2stack</b>(3).
+</P>
+<br><a name="SEC31" href="#TOC1">AUTHOR</a><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><a name="SEC32" href="#TOC1">REVISION</a><br>
+<P>
+Last updated: 16 September 2014
+<br>
+Copyright &copy; 1997-2014 University of Cambridge.
+<br>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
diff --git a/doc/html/pcre2callout.html b/doc/html/pcre2callout.html
new file mode 100644
index 0000000..c742f90
--- /dev/null
+++ b/doc/html/pcre2callout.html
@@ -0,0 +1,270 @@
+<html>
+<head>
+<title>pcre2callout specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2callout man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<ul>
+<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
+<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
+<li><a name="TOC3" href="#SEC3">MISSING CALLOUTS</a>
+<li><a name="TOC4" href="#SEC4">THE CALLOUT INTERFACE</a>
+<li><a name="TOC5" href="#SEC5">RETURN VALUES</a>
+<li><a name="TOC6" href="#SEC6">AUTHOR</a>
+<li><a name="TOC7" href="#SEC7">REVISION</a>
+</ul>
+<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
+<P>
+<b>#include &#60;pcre2.h&#62;</b>
+</P>
+<P>
+<b>int (*pcre2_callout)(pcre2_callout_block *);</b>
+</P>
+<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
+<P>
+PCRE2 provides a feature called "callout", which is a means of temporarily
+passing control to the caller of PCRE2 in the middle of pattern matching. The
+caller of PCRE2 provides an external function by putting its entry point in
+a match context (see <b>pcre2_set_callout()</b> in the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+documentation).
+</P>
+<P>
+Within a regular expression, (?C) indicates the points at which the external
+function is to be called. Different callout points can be identified by putting
+a number less than 256 after the letter C. The default value is zero.
+For example, this pattern has two callout points:
+<pre>
+ (?C1)abc(?C2)def
+</pre>
+If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
+automatically inserts callouts, all with number 255, before each item in the
+pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
+<pre>
+ A(\d{2}|--)
+</pre>
+it is processed as if it were
+<br>
+<br>
+(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
+<br>
+<br>
+Notice that there is a callout before and after each parenthesis and
+alternation bar. If the pattern contains a conditional group whose condition is
+an assertion, an automatic callout is inserted immediately before the
+condition. Such a callout may also be inserted explicitly, for example:
+<pre>
+ (?(?C9)(?=a)ab|de)
+</pre>
+This applies only to assertion conditions (because they are themselves
+independent groups).
+</P>
+<P>
+Automatic callouts can be used for tracking the progress of pattern matching.
+The
+<a href="pcre2test.html"><b>pcre2test</b></a>
+program has a pattern qualifier (/auto_callout) that sets automatic callouts;
+when it is used, the output indicates how the pattern is being matched. This is
+useful information when you are trying to optimize the performance of a
+particular pattern.
+</P>
+<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br>
+<P>
+You should be aware that, because of optimizations in the way PCRE2 compiles
+and matches patterns, callouts sometimes do not happen exactly as you might
+expect.
+</P>
+<P>
+At compile time, PCRE2 "auto-possessifies" repeated items when it knows that
+what follows cannot be part of the repeat. For example, a+[bc] is compiled as
+if it were a++[bc]. The <b>pcre2test</b> output when this pattern is anchored
+and then applied with automatic callouts to the string "aaaa" is:
+<pre>
+ ---&#62;aaaa
+ +0 ^ ^
+ +1 ^ a+
+ +3 ^ ^ [bc]
+ No match
+</pre>
+This indicates that when matching [bc] fails, there is no backtracking into a+
+and therefore the callouts that would be taken for the backtracks do not occur.
+You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS
+to <b>pcre2_compile()</b>, or starting the pattern with (*NO_AUTO_POSSESS). If
+this is done in <b>pcre2test</b> (using the /no_auto_possess qualifier), the
+output changes to this:
+<pre>
+ ---&#62;aaaa
+ +0 ^ ^
+ +1 ^ a+
+ +3 ^ ^ [bc]
+ +3 ^ ^ [bc]
+ +3 ^ ^ [bc]
+ +3 ^^ [bc]
+ No match
+</pre>
+This time, when matching [bc] fails, the matcher backtracks into a+ and tries
+again, repeatedly, until a+ itself fails.
+</P>
+<P>
+Other optimizations that provide fast "no match" results also affect callouts.
+For example, if the pattern is
+<pre>
+ ab(?C4)cd
+</pre>
+PCRE2 knows that any matching string must contain the letter "d". If the
+subject string is "abyz", the lack of "d" means that matching doesn't ever
+start, and the callout is never reached. However, with "abyd", though the
+result is still no match, the callout is obeyed.
+</P>
+<P>
+PCRE2 also knows the minimum length of a matching string, and will immediately
+give a "no match" return without actually running a match if the subject is not
+long enough, or, for unanchored patterns, if it has been scanned far enough.
+</P>
+<P>
+You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
+option to the matching function, or by starting the pattern with
+(*NO_START_OPT). This slows down the matching process, but does ensure that
+callouts such as the example above are obeyed.
+</P>
+<br><a name="SEC4" href="#TOC1">THE CALLOUT INTERFACE</a><br>
+<P>
+During matching, when PCRE2 reaches a callout point, the external function that
+is set in the match context is called (if it is set). This applies to both
+normal and DFA matching. The only argument to the callout function is a pointer
+to a <b>pcre2_callout</b> block. This structure contains the following fields:
+<pre>
+ uint32_t <i>version</i>;
+ uint32_t <i>callout_number</i>;
+ uint32_t <i>capture_top</i>;
+ uint32_t <i>capture_last</i>;
+ void *<i>callout_data</i>;
+ PCRE2_SIZE *<i>offset_vector</i>;
+ PCRE2_SPTR <i>mark</i>;
+ PCRE2_SPTR <i>subject</i>;
+ PCRE2_SIZE <i>subject_length</i>;
+ PCRE2_SIZE <i>start_match</i>;
+ PCRE2_SIZE <i>current_position</i>;
+ PCRE2_SIZE <i>pattern_position</i>;
+ PCRE2_SIZE <i>next_item_length</i>;
+</pre>
+The <i>version</i> field contains the version number of the block format. The
+current version is 0. The version number will change in future if additional
+fields are added, but the intention is never to remove any of the existing
+fields.
+</P>
+<P>
+The <i>callout_number</i> field contains the number of the callout, as compiled
+into the pattern (that is, the number after ?C for manual callouts, and 255 for
+automatically generated callouts).
+</P>
+<P>
+The <i>offset_vector</i> field is a pointer to the vector of capturing offsets
+(the "ovector") that was passed to the matching function in the match data
+block. When <b>pcre2_match()</b> is used, the contents can be inspected, in
+order to extract substrings that have been matched so far, in the same way as
+for extracting substrings after a match has completed. For the DFA matching
+function, this field is not useful.
+</P>
+<P>
+The <i>subject</i> and <i>subject_length</i> fields contain copies of the values
+that were passed to the matching function.
+</P>
+<P>
+The <i>start_match</i> field normally contains the offset within the subject at
+which the current match attempt started. However, if the escape sequence \K
+has been encountered, this value is changed to reflect the modified starting
+point. If the pattern is not anchored, the callout function may be called
+several times from the same point in the pattern for different starting points
+in the subject.
+</P>
+<P>
+The <i>current_position</i> field contains the offset within the subject of the
+current match pointer.
+</P>
+<P>
+When <b>pcre2_match()</b> is used, the <i>capture_top</i> field contains one
+more than the number of the highest numbered captured substring so far. If no
+substrings have been captured, the value of <i>capture_top</i> is one. This is
+always the case when the DFA functions are used, because they do not support
+captured substrings.
+</P>
+<P>
+The <i>capture_last</i> field contains the number of the most recently captured
+substring. However, when a recursion exits, the value reverts to what it was
+outside the recursion, as do the values of all captured substrings. If no
+substrings have been captured, the value of <i>capture_last</i> is 0. This is
+always the case for the DFA matching functions.
+</P>
+<P>
+The <i>callout_data</i> field contains a value that is passed to a matching
+function specifically so that it can be passed back in callouts. It is set in
+the match context when the callout is set up by calling
+<b>pcre2_set_callout()</b> (see the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+documentation).
+</P>
+<P>
+The <i>pattern_position</i> field contains the offset to the next item to be
+matched in the pattern string.
+</P>
+<P>
+The <i>next_item_length</i> field contains the length of the next item to be
+matched in the pattern string. When the callout immediately precedes an
+alternation bar, a closing parenthesis, or the end of the pattern, the length
+is zero. When the callout precedes an opening parenthesis, the length is that
+of the entire subpattern.
+</P>
+<P>
+The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
+help in distinguishing between different automatic callouts, which all have the
+same callout number. However, they are set for all callouts.
+</P>
+<P>
+In callouts from <b>pcre2_match()</b> the <i>mark</i> field contains a pointer to
+the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
+(*THEN) item in the match, or NULL if no such items have been passed. Instances
+of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
+callouts from the DFA matching function this field always contains NULL.
+</P>
+<br><a name="SEC5" href="#TOC1">RETURN VALUES</a><br>
+<P>
+The external callout function returns an integer to PCRE2. If the value is
+zero, matching proceeds as normal. If the value is greater than zero, matching
+fails at the current point, but the testing of other matching possibilities
+goes ahead, just as if a lookahead assertion had failed. If the value is less
+than zero, the match is abandoned, and the matching function returns the
+negative value.
+</P>
+<P>
+Negative values should normally be chosen from the set of PCRE2_ERROR_xxx
+values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match"
+failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
+functions; it will never be used by PCRE2 itself.
+</P>
+<br><a name="SEC6" href="#TOC1">AUTHOR</a><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><a name="SEC7" href="#TOC1">REVISION</a><br>
+<P>
+Last updated: 19 October 2014
+<br>
+Copyright &copy; 1997-2014 University of Cambridge.
+<br>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
diff --git a/doc/html/pcre2demo.html b/doc/html/pcre2demo.html
new file mode 100644
index 0000000..2d1d92b
--- /dev/null
+++ b/doc/html/pcre2demo.html
@@ -0,0 +1,443 @@
+<html>
+<head>
+<title>pcre2demo specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2demo man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<ul>
+</ul>
+<PRE>
+/*************************************************
+* PCRE2 DEMONSTRATION PROGRAM *
+*************************************************/
+
+/* This is a demonstration program to illustrate a straightforward way of
+calling the PCRE2 regular expression library from a C program. See the
+pcre2sample documentation for a short discussion ("man pcre2sample" if you have
+the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
+incompatible with the original PCRE API.
+
+There are actually three libraries, each supporting a different code unit
+width. This demonstration program uses the 8-bit library.
+
+In Unix-like environments, if PCRE2 is installed in your standard system
+libraries, you should be able to compile this program using this command:
+
+gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
+
+If PCRE2 is not installed in a standard place, it is likely to be installed
+with support for the pkg-config mechanism. If you have pkg-config, you can
+compile this program using this command:
+
+gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
+
+If you do not have pkg-config, you may have to use this:
+
+gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
+ -R/usr/local/lib -lpcre2-8 -o pcre2demo
+
+Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
+library files for PCRE2 are installed on your system. Only some operating
+systems (Solaris is one) use the -R option.
+
+Building under Windows:
+
+If you want to statically link this program against a non-dll .a file, you must
+define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
+the following line. */
+
+/* #define PCRE2_STATIC */
+
+/* This macro must be defined before including pcre2.h. For a program that uses
+only one code unit width, it makes it possible to use generic function names
+such as pcre2_compile(). */
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include &lt;stdio.h&gt;
+#include &lt;string.h&gt;
+#include &lt;pcre2.h&gt;
+
+
+/**************************************************************************
+* Here is the program. The API includes the concept of "contexts" for *
+* setting up unusual interface requirements for compiling and matching, *
+* such as custom memory managers and non-standard newline definitions. *
+* This program does not do any of this, so it makes no use of contexts, *
+* always passing NULL where a context could be given. *
+**************************************************************************/
+
+int main(int argc, char **argv)
+{
+pcre2_code *re;
+PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
+PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
+PCRE2_SPTR name_table;
+
+int crlf_is_newline;
+int errornumber;
+int find_all;
+int i;
+int namecount;
+int name_entry_size;
+int rc;
+int utf8;
+
+uint32_t option_bits;
+uint32_t newline;
+
+PCRE2_SIZE erroroffset;
+PCRE2_SIZE *ovector;
+
+size_t subject_length;
+pcre2_match_data *match_data;
+
+
+
+/**************************************************************************
+* First, sort out the command line. There is only one possible option at *
+* the moment, "-g" to request repeated matching to find all occurrences, *
+* like Perl's /g option. We set the variable find_all to a non-zero value *
+* if the -g option is present. Apart from that, there must be exactly two *
+* arguments. *
+**************************************************************************/
+
+find_all = 0;
+for (i = 1; i &lt; argc; i++)
+ {
+ if (strcmp(argv[i], "-g") == 0) find_all = 1;
+ else break;
+ }
+
+/* After the options, we require exactly two arguments, which are the pattern,
+and the subject string. */
+
+if (argc - i != 2)
+ {
+ printf("Two arguments required: a regex and a subject string\n");
+ return 1;
+ }
+
+/* As pattern and subject are char arguments, they can be straightforwardly
+cast to PCRE2_SPTR as we are working in 8-bit code units. */
+
+pattern = (PCRE2_SPTR)argv[i];
+subject = (PCRE2_SPTR)argv[i+1];
+subject_length = strlen((char *)subject);
+
+
+/*************************************************************************
+* Now we are going to compile the regular expression pattern, and handle *
+* any errors that are detected. *
+*************************************************************************/
+
+re = pcre2_compile(
+ pattern, /* the pattern */
+ -1, /* indicates pattern is zero-terminated */
+ 0, /* default options */
+ &amp;errornumber, /* for error number */
+ &amp;erroroffset, /* for error offset */
+ NULL); /* use default compile context */
+
+/* Compilation failed: print the error message and exit. */
+
+if (re == NULL)
+ {
+ PCRE2_UCHAR buffer[256];
+ pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
+ printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
+ buffer);
+ return 1;
+ }
+
+
+/*************************************************************************
+* If the compilation succeeded, we call PCRE again, in order to do a *
+* pattern match against the subject string. This does just ONE match. If *
+* further matching is needed, it will be done below. Before running the *
+* match we must set up a match_data block for holding the result. *
+*************************************************************************/
+
+/* Using this function ensures that the block is exactly the right size for
+the number of capturing parentheses in the pattern. */
+
+match_data = pcre2_match_data_create_from_pattern(re, NULL);
+
+rc = pcre2_match(
+ re, /* the compiled pattern */
+ subject, /* the subject string */
+ subject_length, /* the length of the subject */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ match_data, /* block for storing the result */
+ NULL); /* use default match context */
+
+/* Matching failed: handle error cases */
+
+if (rc &lt; 0)
+ {
+ switch(rc)
+ {
+ case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
+ /*
+ Handle other special cases if you like
+ */
+ default: printf("Matching error %d\n", rc); break;
+ }
+ pcre2_match_data_free(match_data); /* Release memory used for the match */
+ pcre2_code_free(re); /* data and the compiled pattern. */
+ return 1;
+ }
+
+/* Match succeeded. Get a pointer to the output vector, where string offsets are
+stored. */
+
+ovector = pcre2_get_ovector_pointer(match_data);
+printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
+
+
+/*************************************************************************
+* We have found the first match within the subject string. If the output *
+* vector wasn't big enough, say so. Then output any substrings that were *
+* captured. *
+*************************************************************************/
+
+/* The output vector wasn't big enough. This should not happen, because we used
+pcre2_match_data_create_from_pattern() above. */
+
+if (rc == 0)
+ printf("ovector was not big enough for all the captured substrings\n");
+
+/* Show substrings stored in the output vector by number. Obviously, in a real
+application you might want to do things other than print them. */
+
+for (i = 0; i &lt; rc; i++)
+ {
+ PCRE2_SPTR substring_start = subject + ovector[2*i];
+ size_t substring_length = ovector[2*i+1] - ovector[2*i];
+ printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
+ }
+
+
+/**************************************************************************
+* That concludes the basic part of this demonstration program. We have *
+* compiled a pattern, and performed a single match. The code that follows *
+* shows first how to access named substrings, and then how to code for *
+* repeated matches on the same subject. *
+**************************************************************************/
+
+/* See if there are any named substrings, and if so, show them by name. First
+we have to extract the count of named parentheses from the pattern. */
+
+(void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
+ &amp;namecount); /* where to put the answer */
+
+if (namecount &lt;= 0) printf("No named substrings\n"); else
+ {
+ PCRE2_SPTR tabptr;
+ printf("Named substrings\n");
+
+ /* Before we can access the substrings, we must extract the table for
+ translating names to numbers, and the size of each entry in the table. */
+
+ (void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMETABLE, /* address of the table */
+ &amp;name_table); /* where to put the answer */
+
+ (void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
+ &amp;name_entry_size); /* where to put the answer */
+
+ /* Now we can scan the table and, for each entry, print the number, the name,
+ and the substring itself. In the 8-bit library the number is held in two
+ bytes, most significant first. */
+
+ tabptr = name_table;
+ for (i = 0; i &lt; namecount; i++)
+ {
+ int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
+ printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
+ (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
+ tabptr += name_entry_size;
+ }
+ }
+
+
+/*************************************************************************
+* If the "-g" option was given on the command line, we want to continue *
+* to search for additional matches in the subject string, in a similar *
+* way to the /g option in Perl. This turns out to be trickier than you *
+* might think because of the possibility of matching an empty string. *
+* What happens is as follows: *
+* *
+* If the previous match was NOT for an empty string, we can just start *
+* the next match at the end of the previous one. *
+* *
+* If the previous match WAS for an empty string, we can't do that, as it *
+* would lead to an infinite loop. Instead, a call of pcre2_match() is *
+* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
+* first of these tells PCRE2 that an empty string at the start of the *
+* subject is not a valid match; other possibilities must be tried. The *
+* second flag restricts PCRE2 to one match attempt at the initial string *
+* position. If this match succeeds, an alternative to the empty string *
+* match has been found, and we can print it and proceed round the loop, *
+* advancing by the length of whatever was found. If this match does not *
+* succeed, we still stay in the loop, advancing by just one character. *
+* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
+* more than one byte. *
+* *
+* However, there is a complication concerned with newlines. When the *
+* newline convention is such that CRLF is a valid newline, we must *
+* advance by two characters rather than one. The newline convention can *
+* be set in the regex by (*CR), etc.; if not, we must find the default. *
+*************************************************************************/
+
+if (!find_all) /* Check for -g */
+ {
+ pcre2_match_data_free(match_data); /* Release the memory that was used */
+ pcre2_code_free(re); /* for the match data and the pattern. */
+ return 0; /* Exit the program. */
+ }
+
+/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
+sequence. First, find the options with which the regex was compiled and extract
+the UTF state. */
+
+(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &amp;option_bits);
+utf8 = (option_bits &amp; PCRE2_UTF) != 0;
+
+/* Now find the newline convention and see whether CRLF is a valid newline
+sequence. */
+
+(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &amp;newline);
+crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
+ newline == PCRE2_NEWLINE_CRLF ||
+ newline == PCRE2_NEWLINE_ANYCRLF;
+
+/* Loop for second and subsequent matches */
+
+for (;;)
+ {
+ uint32_t options = 0; /* Normally no options */
+ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
+
+ /* If the previous match was for an empty string, we are finished if we are
+ at the end of the subject. Otherwise, arrange to run another match at the
+ same point to see if a non-empty match can be found. */
+
+ if (ovector[0] == ovector[1])
+ {
+ if (ovector[0] == subject_length) break;
+ options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
+ }
+
+ /* Run the next matching operation */
+
+ rc = pcre2_match(
+ re, /* the compiled pattern */
+ subject, /* the subject string */
+ subject_length, /* the length of the subject */
+ start_offset, /* starting offset in the subject */
+ options, /* options */
+ match_data, /* block for storing the result */
+ NULL); /* use default match context */
+
+ /* This time, a result of NOMATCH isn't an error. If the value in "options"
+ is zero, it just means we have found all possible matches, so the loop ends.
+ Otherwise, it means we have failed to find a non-empty-string match at a
+ point where there was a previous empty-string match. In this case, we do what
+ Perl does: advance the matching position by one character, and continue. We
+ do this by setting the "end of previous match" offset, because that is picked
+ up at the top of the loop as the point at which to start again.
+
+ There are two complications: (a) When CRLF is a valid newline sequence, and
+ the current position is just before it, advance by an extra byte. (b)
+ Otherwise we must ensure that we skip an entire UTF character if we are in
+ UTF mode. */
+
+ if (rc == PCRE2_ERROR_NOMATCH)
+ {
+ if (options == 0) break; /* All matches found */
+ ovector[1] = start_offset + 1; /* Advance one code unit */
+ if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */
+ start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
+ subject[start_offset] == '\r' &amp;&amp;
+ subject[start_offset + 1] == '\n')
+ ovector[1] += 1; /* Advance by one more. */
+ else if (utf8) /* Otherwise, ensure we */
+ { /* advance a whole UTF-8 */
+ while (ovector[1] &lt; subject_length) /* character. */
+ {
+ if ((subject[ovector[1]] &amp; 0xc0) != 0x80) break;
+ ovector[1] += 1;
+ }
+ }
+ continue; /* Go round the loop again */
+ }
+
+ /* Other matching errors are not recoverable. */
+
+ if (rc &lt; 0)
+ {
+ printf("Matching error %d\n", rc);
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
+ /* Match succeeded */
+
+ printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
+
+ /* The match succeeded, but the output vector wasn't big enough. This
+ should not happen. */
+
+ if (rc == 0)
+ printf("ovector was not big enough for all the captured substrings\n");
+
+ /* As before, show substrings stored in the output vector by number, and then
+ also any named substrings. */
+
+ for (i = 0; i &lt; rc; i++)
+ {
+ PCRE2_SPTR substring_start = subject + ovector[2*i];
+ size_t substring_length = ovector[2*i+1] - ovector[2*i];
+ printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
+ }
+
+ if (namecount &lt;= 0) printf("No named substrings\n"); else
+ {
+ PCRE2_SPTR tabptr = name_table;
+ printf("Named substrings\n");
+ for (i = 0; i &lt; namecount; i++)
+ {
+ int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
+ printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
+ (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
+ tabptr += name_entry_size;
+ }
+ }
+ } /* End of loop to find second and subsequent matches */
+
+printf("\n");
+pcre2_match_data_free(match_data);
+pcre2_code_free(re);
+return 0;
+}
+
+/* End of pcre2demo.c */
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
new file mode 100644
index 0000000..30b527d
--- /dev/null
+++ b/doc/html/pcre2test.html
@@ -0,0 +1,1199 @@
+<html>
+<head>
+<title>pcre2test specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2test man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<ul>
+<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
+<li><a name="TOC2" href="#SEC2">PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
+<li><a name="TOC3" href="#SEC3">INPUT ENCODING</a>
+<li><a name="TOC4" href="#SEC4">COMMAND LINE OPTIONS</a>
+<li><a name="TOC5" href="#SEC5">DESCRIPTION</a>
+<li><a name="TOC6" href="#SEC6">COMMAND LINES</a>
+<li><a name="TOC7" href="#SEC7">MODIFIER SYNTAX</a>
+<li><a name="TOC8" href="#SEC8">PATTERN SYNTAX</a>
+<li><a name="TOC9" href="#SEC9">SUBJECT LINE SYNTAX</a>
+<li><a name="TOC10" href="#SEC10">PATTERN MODIFIERS</a>
+<li><a name="TOC11" href="#SEC11">SUBJECT MODIFIERS</a>
+<li><a name="TOC12" href="#SEC12">THE ALTERNATIVE MATCHING FUNCTION</a>
+<li><a name="TOC13" href="#SEC13">DEFAULT OUTPUT FROM pcre2test</a>
+<li><a name="TOC14" href="#SEC14">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a>
+<li><a name="TOC15" href="#SEC15">RESTARTING AFTER A PARTIAL MATCH</a>
+<li><a name="TOC16" href="#SEC16">CALLOUTS</a>
+<li><a name="TOC17" href="#SEC17">NON-PRINTING CHARACTERS</a>
+<li><a name="TOC18" href="#SEC18">SEE ALSO</a>
+<li><a name="TOC19" href="#SEC19">AUTHOR</a>
+<li><a name="TOC20" href="#SEC20">REVISION</a>
+</ul>
+<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
+<P>
+<b>pcre2test [options] [input file [output file]]</b>
+<br>
+<br>
+<b>pcre2test</b> is a test program for the PCRE2 regular expression libraries,
+but it can also be used for experimenting with regular expressions. This
+document describes the features of the test program; for details of the regular
+expressions themselves, see the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation. For details of the PCRE2 library function calls and their
+options, see the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+documentation.
+</P>
+<P>
+The input for <b>pcre2test</b> is a sequence of regular expression patterns and
+subject strings to be matched. The output shows the result of each match
+attempt. Modifiers on the command line, the patterns, and the subject lines
+specify PCRE2 function options, control how the subject is processed, and what
+output is produced.
+</P>
+<P>
+As the original fairly simple PCRE library evolved, it acquired many different
+features, and as a result, the original <b>pcretest</b> program ended up with a
+lot of options in a messy, arcane syntax, for testing all the features. The
+move to the new PCRE2 API provided an opportunity to re-implement the test
+program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there
+are still many obscure modifiers, some of which are specifically designed for
+use in conjunction with the test script and data files that are distributed as
+part of PCRE2. All the modifiers are documented here, some without much
+justification, but many of them are unlikely to be of use except when testing
+the libraries.
+</P>
+<br><a name="SEC2" href="#TOC1">PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a><br>
+<P>
+Different versions of the PCRE2 library can be built to support character
+strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or
+all three of these libraries may be simultaneously installed. The
+<b>pcre2test</b> program can be used to test all the libraries. However, its own
+input and output are always in 8-bit format. When testing the 16-bit or 32-bit
+libraries, patterns and subject strings are converted to 16- or 32-bit format
+before being passed to the library functions. Results are converted back to
+8-bit code units for output.
+</P>
+<P>
+In the rest of this document, the names of library functions and structures
+are given in generic form, for example, <b>pcre2_compile()</b>. The actual
+names used in the libraries have a suffix _8, _16, or _32, as appropriate.
+</P>
+<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
+<P>
+Input to <b>pcre2test</b> is processed line by line, either by calling the C
+library's <b>fgets()</b> function, or via the <b>libreadline</b> library (see
+below). In Unix-like environments, <b>fgets()</b> treats any bytes other than
+newline as data characters. However, in some Windows environments character 26
+(hex 1A) causes an immediate end of file, and no further data is read. For
+maximum portability, therefore, it is safest to avoid non-printing characters
+in <b>pcre2test</b> input files.
+</P>
+<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
+<P>
+<b>-8</b>
+If the 8-bit library has been built, this option causes it to be used (this is
+the default). If the 8-bit library has not been built, this option causes an
+error.
+</P>
+<P>
+<b>-16</b>
+If the 16-bit library has been built, this option causes it to be used. If only
+the 16-bit library has been built, this is the default. If the 16-bit library
+has not been built, this option causes an error.
+</P>
+<P>
+<b>-32</b>
+If the 32-bit library has been built, this option causes it to be used. If only
+the 32-bit library has been built, this is the default. If the 32-bit library
+has not been built, this option causes an error.
+</P>
+<P>
+<b>-b</b>
+Behave as if each pattern has the <b>/fullbincode</b> modifier; the full
+internal binary form of the pattern is output after compilation.
+</P>
+<P>
+<b>-C</b>
+Output the version number of the PCRE2 library, and all available information
+about the optional features that are included, and then exit with zero exit
+code. All other options are ignored.
+</P>
+<P>
+<b>-C</b> <i>option</i>
+Output information about a specific build-time option, then exit. This
+functionality is intended for use in scripts such as <b>RunTest</b>. The
+following options output the value and set the exit code as indicated:
+<pre>
+ ebcdic-nl the code for LF (= NL) in an EBCDIC environment:
+ 0x15 or 0x25
+ 0 if used in an ASCII environment
+ exit code is always 0
+ linksize the configured internal link size (2, 3, or 4)
+ exit code is set to the link size
+ newline the default newline setting:
+ CR, LF, CRLF, ANYCRLF, or ANY
+ exit code is always 0
+ bsr the default setting for what \R matches:
+ ANYCRLF or ANY
+ exit code is always 0
+</pre>
+The following options output 1 for true or 0 for false, and set the exit code
+to the same value:
+<pre>
+ ebcdic compiled for an EBCDIC environment
+ jit just-in-time support is available
+ pcre16 the 16-bit library was built
+ pcre32 the 32-bit library was built
+ pcre8 the 8-bit library was built
+ unicode Unicode support is available
+</pre>
+If an unknown option is given, an error message is output; the exit code is 0.
+</P>
+<P>
+<b>-d</b>
+Behave as if each pattern has the <b>debug</b> modifier; the internal
+form and information about the compiled pattern is output after compilation;
+<b>-d</b> is equivalent to <b>-b -i</b>.
+</P>
+<P>
+<b>-dfa</b>
+Behave as if each subject line has the <b>dfa</b> modifier; matching is done
+using the <b>pcre2_dfa_match()</b> function instead of the default
+<b>pcre2_match()</b>.
+</P>
+<P>
+<b>-help</b>
+Output a brief summary of these options and then exit.
+</P>
+<P>
+<b>-i</b>
+Behave as if each pattern has the <b>/info</b> modifier; information about the
+compiled pattern is given after compilation.
+</P>
+<P>
+<b>-jit</b>
+Behave as if each pattern line has the <b>jit</b> modifier; after successful
+compilation, each pattern is passed to the just-in-time compiler, if available.
+</P>
+<P>
+<b>-pattern</b> <i>modifier-list</i>
+Behave as if each pattern line contains the given modifiers.
+</P>
+<P>
+<b>-q</b>
+Do not output the version number of <b>pcre2test</b> at the start of execution.
+</P>
+<P>
+<b>-S</b> <i>size</i>
+On Unix-like systems, set the size of the run-time stack to <i>size</i>
+megabytes.
+</P>
+<P>
+<b>-subject</b> <i>modifier-list</i>
+Behave as if each subject line contains the given modifiers.
+</P>
+<P>
+<b>-t</b>
+Run each compile and match many times with a timer, and output the resulting
+times per compile or match. You can control the number of iterations that are
+used for timing by following <b>-t</b> with a number (as a separate item on the
+command line). For example, "-t 1000" iterates 1000 times. The default is to
+iterate 500,000 times.
+</P>
+<P>
+<b>-tm</b>
+This is like <b>-t</b> except that it times only the matching phase, not the
+compile phase.
+</P>
+<P>
+<b>-T</b> <b>-TM</b>
+These behave like <b>-t</b> and <b>-tm</b>, but in addition, at the end of a run,
+the total times for all compiles and matches are output.
+</P>
+<P>
+<b>-version</b>
+Output the PCRE2 version number and then exit.
+</P>
+<br><a name="SEC5" href="#TOC1">DESCRIPTION</a><br>
+<P>
+If <b>pcre2test</b> is given two filename arguments, it reads from the first and
+writes to the second. If it is given only one filename argument, it reads from
+that file and writes to stdout. Otherwise, it reads from stdin and writes to
+stdout, and prompts for each line of input, using "re&#62;" to prompt for regular
+expression patterns, and "data&#62;" to prompt for subject lines.
+</P>
+<P>
+When <b>pcre2test</b> is built, a configuration option can specify that it
+should be linked with the <b>libreadline</b> or <b>libedit</b> library. When this
+is done, if the input is from a terminal, it is read using the <b>readline()</b>
+function. This provides line-editing and history facilities. The output from
+the <b>-help</b> option states whether or not <b>readline()</b> will be used.
+</P>
+<P>
+The program handles any number of tests, each of which consists of a set of
+input lines. Each set starts with a regular expression pattern, followed by any
+number of subject lines to be matched against that pattern. In between sets of
+test data, command lines that begin with a hash (#) character may appear. This
+file format, with some restrictions, can also be processed by the
+<b>perltest.pl</b> script that is distributed with PCRE2 as a means of checking
+that the behaviour of PCRE2 and Perl is the same.
+</P>
+<P>
+Each subject line is matched separately and independently. If you want to do
+multi-line matches, you have to use the \n escape sequence (or \r or \r\n,
+etc., depending on the newline setting) in a single line of input to encode the
+newline sequences. There is no limit on the length of subject lines; the input
+buffer is automatically extended if it is too small. There is a replication
+feature that makes it possible to generate long subject lines without having to
+supply them explicitly.
+</P>
+<P>
+An empty line or the end of the file signals the end of the subject lines for a
+test, at which point a new pattern or command line is expected if there is
+still input to be read.
+</P>
+<br><a name="SEC6" href="#TOC1">COMMAND LINES</a><br>
+<P>
+In between sets of test data, a line that begins with a hash (#) character is
+interpreted as a command line. If the first character is followed by white
+space or an exclamation mark, the line is treated as a comment, and ignored.
+Otherwise, the following commands are recognized:
+<pre>
+ #forbid_utf
+</pre>
+Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP
+options set, which locks out the use of UTF and Unicode property features. This
+is a trigger guard that is used in test files to ensure that UTF/Unicode tests
+are not accidentally added to files that are used when UTF support is not
+included in the library. This effect can also be obtained by the use of
+<b>#pattern</b>; the difference is that <b>#forbid_utf</b> cannot be unset, and
+the automatic options are not displayed in pattern information, to avoid
+cluttering up test output.
+<pre>
+ #pattern &#60;modifier-list&#62;
+</pre>
+This command sets a default modifier list that applies to all subsequent
+patterns. Modifiers on a pattern can change these settings.
+<pre>
+ #perltest
+</pre>
+The appearance of this line causes all subsequent modifier settings to be
+checked for compatibility with the <b>perltest.pl</b> script, which is used to
+confirm that Perl gives the same results as PCRE2. Also, apart from comment
+lines, none of the other command lines are permitted, because they and many
+of the modifiers are specific to <b>pcre2test</b>, and should not be used in
+test files that are also processed by <b>perltest.pl</b>. The <b>#perltest</b>
+command helps detect tests that are accidentally put in the wrong file.
+<pre>
+ #subject &#60;modifier-list&#62;
+</pre>
+This command sets a default modifier list that applies to all subsequent
+subject lines. Modifiers on a subject line can change these settings.
+</P>
+<br><a name="SEC7" href="#TOC1">MODIFIER SYNTAX</a><br>
+<P>
+Modifier lists are used with both pattern and subject lines. Items in a list
+are separated by commas and optional white space. Some modifiers may be given
+for both patterns and subject lines, whereas others are valid for one or the
+other only. Each modifier has a long name, for example "anchored", and some of
+them must be followed by an equals sign and a value, for example, "offset=12".
+Modifiers that do not take values may be preceded by a minus sign to turn off a
+previous default setting.
+</P>
+<P>
+A few of the more common modifiers can also be specified as single letters, for
+example "i" for "caseless". In documentation, following the Perl convention,
+these are written with a slash ("the /i modifier") for clarity. Abbreviated
+modifiers must all be concatenated in the first item of a modifier list. If the
+first item is not recognized as a long modifier name, it is interpreted as a
+sequence of these abbreviations. For example:
+<pre>
+ /abc/ig,newline=cr,jit=3
+</pre>
+This is a pattern line whose modifier list starts with two one-letter modifiers
+(/i and /g). The lower-case abbreviated modifiers are the same as used in Perl.
+</P>
+<br><a name="SEC8" href="#TOC1">PATTERN SYNTAX</a><br>
+<P>
+A pattern line must start with one of the following characters (common symbols,
+excluding pattern meta-characters):
+<pre>
+ / ! " ' ` - = _ : ; , % & @ ~
+</pre>
+This is interpreted as the pattern's delimiter. A regular expression may be
+continued over several input lines, in which case the newline characters are
+included within it. It is possible to include the delimiter within the pattern
+by escaping it with a backslash, for example
+<pre>
+ /abc\/def/
+</pre>
+If you do this, the escape and the delimiter form part of the pattern, but
+since the delimiters are all non-alphanumeric, this does not affect its
+interpretation. If the terminating delimiter is immediately followed by a
+backslash, for example,
+<pre>
+ /abc/\
+</pre>
+then a backslash is added to the end of the pattern. This is done to provide a
+way of testing the error condition that arises if a pattern finishes with a
+backslash, because
+<pre>
+ /abc\/
+</pre>
+is interpreted as the first line of a pattern that starts with "abc/", causing
+pcre2test to read the next line as a continuation of the regular expression.
+</P>
+<P>
+A pattern can be followed by a modifier list (details below).
+</P>
+<br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br>
+<P>
+Before each subject line is passed to <b>pcre2_match()</b> or
+<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the
+line is scanned for backslash escapes. The following provide a means of
+encoding non-printing characters in a visible way:
+<pre>
+ \a alarm (BEL, \x07)
+ \b backspace (\x08)
+ \e escape (\x1b)
+ \f form feed (\x0c)
+ \n newline (\x0a)
+ \r carriage return (\x0d)
+ \t tab (\x09)
+ \v vertical tab (\x0b)
+ \nnn octal character (up to 3 octal digits); always
+ a byte unless &#62; 255 in UTF-8 or 16-bit or 32-bit mode
+ \o{dd...} octal character (any number of octal digits)
+ \xhh hexadecimal byte (up to 2 hex digits)
+ \x{hh...} hexadecimal character (any number of hex digits)
+</pre>
+The use of \x{hh...} is not dependent on the use of the utf modifier on
+the pattern. It is recognized always. There may be any number of hexadecimal
+digits inside the braces; invalid values provoke error messages.
+</P>
+<P>
+Note that \xhh specifies one byte rather than one character in UTF-8 mode;
+this makes it possible to construct invalid UTF-8 sequences for testing
+purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in
+UTF-8 mode, generating more than one byte if the value is greater than 127.
+When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte
+for values less than 256, and causes an error for greater values.
+</P>
+<P>
+In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
+possible to construct invalid UTF-16 sequences for testing purposes.
+</P>
+<P>
+In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it
+possible to construct invalid UTF-32 sequences for testing purposes.
+</P>
+<P>
+There is a special backslash sequence that specifies replication of one or more
+characters:
+<pre>
+ \[&#60;characters&#62;]{&#60;count&#62;}
+</pre>
+This makes it possible to test long strings without having to provide them as
+part of the file. For example:
+<pre>
+ \[abc]{4}
+</pre>
+is converted to "abcabcabcabc". This feature does not support nesting. To
+include a closing square bracket in the characters, code it as \x5D.
+</P>
+<P>
+A backslash followed by an equals sign marks the end of the subject string and
+the start of a modifier list. For example:
+<pre>
+ abc\=notbol,notempty
+</pre>
+A backslash followed by any other non-alphanumeric character just escapes that
+character. A backslash followed by anything else causes an error. However, if
+the very last character in the line is a backslash (and there is no modifier
+list), it is ignored. This gives a way of passing an empty line as data, since
+a real empty line terminates the data input.
+</P>
+<br><a name="SEC10" href="#TOC1">PATTERN MODIFIERS</a><br>
+<P>
+There are three types of modifier that can appear in pattern lines, two of
+which may also be used in a <b>#pattern</b> command. A pattern's modifier list
+can add to or override default modifiers that were set by a previous
+<b>#pattern</b> command.
+</P>
+<br><b>
+Setting compilation options
+</b><br>
+<P>
+The following modifiers set options for <b>pcre2_compile()</b>. The most common
+ones have single-letter abbreviations. See
+<a href="pcre2api.html"><b>pcre2api</b></a>
+for a description of their effects.
+<pre>
+ allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
+ alt_bsux set PCRE2_ALT_BSUX
+ anchored set PCRE2_ANCHORED
+ auto_callout set PCRE2_AUTO_CALLOUT
+ /i caseless set PCRE2_CASELESS
+ dollar_endonly set PCRE2_DOLLAR_ENDONLY
+ /s dotall set PCRE2_DOTALL
+ dupnames set PCRE2_DUPNAMES
+ /x extended set PCRE2_EXTENDED
+ firstline set PCRE2_FIRSTLINE
+ match_unset_backref set PCRE2_MATCH_UNSET_BACKREF
+ /m multiline set PCRE2_MULTILINE
+ never_ucp set PCRE2_NEVER_UCP
+ never_utf set PCRE2_NEVER_UTF
+ no_auto_capture set PCRE2_NO_AUTO_CAPTURE
+ no_auto_possess set PCRE2_NO_AUTO_POSSESS
+ no_start_optimize set PCRE2_NO_START_OPTIMIZE
+ no_utf_check set PCRE2_NO_UTF_CHECK
+ ucp set PCRE2_UCP
+ ungreedy set PCRE2_UNGREEDY
+ utf set PCRE2_UTF
+</pre>
+As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
+non-printing characters in output strings to be printed using the \x{hh...}
+notation. Otherwise, those less than 0x100 are output in hex without the curly
+brackets.
+</P>
+<br><b>
+Setting compilation controls
+</b><br>
+<P>
+The following modifiers affect the compilation process or request information
+about the pattern:
+<pre>
+ bsr=[anycrlf|unicode] specify \R handling
+ /B bincode show binary code without lengths
+ debug same as info,fullbincode
+ fullbincode show binary code with lengths
+ /I info show info about compiled pattern
+ hex pattern is coded in hexadecimal
+ jit[=&#60;number&#62;] use JIT
+ locale=&#60;name&#62; use this locale
+ memory show memory used
+ newline=&#60;type&#62; set newline type
+ parens_nest_limit=&#60;n&#62; set maximum parentheses depth
+ perlcompat lock out non-Perl modifiers
+ posix use the POSIX API
+ stackguard=&#60;number&#62; test the stackguard feature
+ tables=[0|1|2] select internal tables
+ use_length use the pattern's length
+</pre>
+The effects of these modifiers are described in the following sections.
+FIXME: Give more examples.
+</P>
+<br><b>
+Newline and \R handling
+</b><br>
+<P>
+The <b>bsr</b> modifier specifies what \R in a pattern should match. If it is
+set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to "unicode",
+\R matches any Unicode newline sequence. The default is specified when PCRE2
+is built, with the default default being Unicode.
+</P>
+<P>
+The <b>newline</b> modifier specifies which characters are to be interpreted as
+newlines, both in the pattern and (by default) in subject lines. The type must
+be one of CR, LF, CRLF, ANYCRLF, or ANY.
+</P>
+<P>
+Both the \R and newline settings can be changed at match time, but if this is
+done, JIT matching is disabled.
+</P>
+<br><b>
+Information about a pattern
+</b><br>
+<P>
+The <b>debug</b> modifier is a shorthand for <b>info,fullbincode</b>, requesting
+all available information.
+</P>
+<P>
+The <b>bincode</b> modifier causes a representation of the compiled code to be
+output after compilation. This information does not contain length and offset
+values, which ensures that the same output is generated for different internal
+link sizes and different code unit widths. By using <b>bincode</b>, the same
+regression tests can be used in different environments.
+</P>
+<P>
+The <b>fullbincode</b> modifier, by contrast, <i>does</i> include length and
+offset values. This is used in a few special tests and is also useful for
+one-off tests.
+</P>
+<P>
+The <b>info</b> modifier requests information about the compiled pattern
+(whether it is anchored, has a fixed first character, and so on). The
+information is obtained from the <b>pcre2_pattern_info()</b> function.
+</P>
+<br><b>
+Specifying a pattern in hex
+</b><br>
+<P>
+The <b>hex</b> modifier specifies that the characters of the pattern are to be
+interpreted as pairs of hexadecimal digits. White space is permitted between
+pairs. For example:
+<pre>
+ /ab 32 59/hex
+</pre>
+This feature is provided as a way of creating patterns that contain binary zero
+characters. When <b>hex</b> is set, it implies <b>use_length</b>.
+</P>
+<br><b>
+Using the pattern's length
+</b><br>
+<P>
+By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
+<b>pcre2_compile()</b>, giving the length as -1. If <b>use_length</b> is set, the
+length of the pattern is passed. This is implied if <b>hex</b> is set.
+</P>
+<br><b>
+JIT compilation
+</b><br>
+<P>
+The <b>/jit</b> modifier may optionally be followed by a number in the range 0
+to 7:
+<pre>
+ 0 disable JIT
+ 1 normal match only
+ 2 soft partial match only
+ 3 normal match and soft partial match
+ 4 hard partial match only
+ 6 soft and hard partial match
+ 7 all three modes
+</pre>
+If no number is given, 7 is assumed. If JIT compilation is successful, the
+compiled JIT code will automatically be used when <b>pcre2_match()</b> is run,
+except when incompatible run-time options are specified. For more details, see
+the
+<a href="pcre2jit.html"><b>pcre2jit</b></a>
+documentation. See also the <b>jitstack</b> modifier below for a way of
+setting the size of the JIT stack.
+</P>
+<P>
+If the <b>jitverify</b> modifier is specified, the text "(JIT)" is added to the
+first output line after a match or non match when JIT-compiled code was
+actually used. This modifier can also be set on a subject line.
+</P>
+<br><b>
+Setting a locale
+</b><br>
+<P>
+The <b>/locale</b> modifier must specify the name of a locale, for example:
+<pre>
+ /pattern/locale=fr_FR
+</pre>
+The given locale is set, <b>pcre2_maketables()</b> is called to build a set of
+character tables for the locale, and this is then passed to
+<b>pcre2_compile()</b> when compiling the regular expression. The same tables
+are used when matching the following subject lines. The <b>/locale</b> modifier
+applies only to the pattern on which it appears, but can be given in a
+<b>#pattern</b> command if a default is needed. Setting a locale and alternate
+character tables are mutually exclusive.
+</P>
+<br><b>
+Showing pattern memory
+</b><br>
+<P>
+The <b>/memory</b> modifier causes the size in bytes of the memory block used to
+hold the compiled pattern to be output. This does not include the size of the
+<b>pcre2_code</b> block; it is just the actual compiled data. If the pattern is
+subsequently passed to the JIT compiler, the size of the JIT compiled code is
+also output.
+</P>
+<br><b>
+Limiting nested parentheses
+</b><br>
+<P>
+The <b>parens_nest_limit</b> modifier sets a limit on the depth of nested
+parentheses in a pattern. Breaching the limit causes a compilation error.
+</P>
+<br><b>
+Using the POSIX wrapper API
+</b><br>
+<P>
+The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
+wrapper API rather than its native API. This supports only the 8-bit library.
+When the POSIX API is being used, the following pattern modifiers set options
+for the <b>regcomp()</b> function:
+<pre>
+ caseless REG_ICASE
+ multiline REG_NEWLINE
+ no_auto_capture REG_NOSUB
+ dotall REG_DOTALL )
+ ungreedy REG_UNGREEDY ) These options are not part of
+ ucp REG_UCP ) the POSIX standard
+ utf REG_UTF8 )
+</pre>
+The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
+below. All other modifiers cause an error.
+</P>
+<br><b>
+Testing the stack guard feature
+</b><br>
+<P>
+The <b>/stackguard</b> modifier is used to test the use of
+<b>pcre2_set_compile_recursion_guard()</b>, a function that is provided to
+enable stack availability to be checked during compilation (see the
+<a href="pcre2api.html"><b>pcre2api</b></a>
+documentation for details). If the number specified by the modifier is greater
+than zero, <b>pcre2_set_compile_recursion_guard()</b> is called to set up
+a callback from <b>pcre2_compile()</b> to a local function. The argument it is
+passed is the current nesting parenthesis depth; if this is greater than the
+value given by the modifier, non-zero is returned, causing the compilation to
+be aborted.
+</P>
+<br><b>
+Using alternative character tables
+</b><br>
+<P>
+The <b>/tables</b> modifier must be followed by a single digit. It causes a
+specific set of built-in character tables to be passed to
+<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
+different character tables. The digit specifies the tables as follows:
+<pre>
+ 0 do not pass any special character tables
+ 1 the default ASCII tables, as distributed in
+ pcre2_chartables.c.dist
+ 2 a set of tables defining ISO 8859 characters
+</pre>
+In table 2, some characters whose codes are greater than 128 are identified as
+letters, digits, spaces, etc. Setting alternate character tables and a locale
+are mutually exclusive.
+</P>
+<br><b>
+Setting certain match controls
+</b><br>
+<P>
+The following modifiers are really subject modifiers, and are described below.
+However, they may be included in a pattern's modifier list, in which case they
+are applied to every subject line that is processed with that pattern. They do
+not affect the compilation process.
+<pre>
+ aftertext show text after match
+ allaftertext show text after captures
+ allcaptures show all captures
+ allusedtext show all consulted text
+ /g global global matching
+ jitverify verify JIT usage
+ mark show mark values
+</pre>
+These modifiers may not appear in a <b>#pattern</b> command. If you want them as
+defaults, set them in a <b>#subject</b> command.
+</P>
+<br><a name="SEC11" href="#TOC1">SUBJECT MODIFIERS</a><br>
+<P>
+The modifiers that can appear in subject lines and the <b>#subject</b>
+command are of two types.
+</P>
+<br><b>
+Setting match options
+</b><br>
+<P>
+The following modifiers set options for <b>pcre2_match()</b> or
+<b>pcre2_dfa_match()</b>. See
+<a href="pcre2api.html"><b>pcre2api</b></a>
+for a description of their effects.
+<pre>
+ anchored set PCRE2_ANCHORED
+ dfa_restart set PCRE2_DFA_RESTART
+ dfa_shortest set PCRE2_DFA_SHORTEST
+ no_start_optimize set PCRE2_NO_START_OPTIMIZE
+ no_utf_check set PCRE2_NO_UTF_CHECK
+ notbol set PCRE2_NOTBOL
+ notempty set PCRE2_NOTEMPTY
+ notempty_atstart set PCRE2_NOTEMPTY_ATSTART
+ noteol set PCRE2_NOTEOL
+ partial_hard (or ph) set PCRE2_PARTIAL_HARD
+ partial_soft (or ps) set PCRE2_PARTIAL_SOFT
+</pre>
+The partial matching modifiers are provided with abbreviations because they
+appear frequently in tests.
+</P>
+<P>
+If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
+wrapper API to be used, the only option-setting modifiers that have any effect
+are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
+REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
+Any other modifiers cause an error.
+</P>
+<br><b>
+Setting match controls
+</b><br>
+<P>
+The following modifiers affect the matching process or request additional
+information. Some of them may also be specified on a pattern line (see above),
+in which case they apply to every subject line that is matched against that
+pattern.
+<pre>
+ aftertext show text after match
+ allaftertext show text after captures
+ allcaptures show all captures
+ allusedtext show all consulted text
+ altglobal alternative global matching
+ bsr=[anycrlf|unicode] specify \R handling
+ callout_capture show captures at callout time
+ callout_data=&#60;n&#62; set a value to pass via callouts
+ callout_fail=&#60;n&#62;[:&#60;m&#62;] control callout failure
+ callout_none do not supply a callout function
+ copy=&#60;number or name&#62; copy captured substring
+ dfa use <b>pcre2_dfa_match()</b>
+ find_limits find match and recursion limits
+ get=&#60;number or name&#62; extract captured substring
+ getall extract all captured substrings
+ /g global global matching
+ jitstack=&#60;n&#62; set size of JIT stack
+ jitverify verify JIT usage
+ mark show mark values
+ match_limit=&#60;n&#62; set a match limit
+ memory show memory usage
+ newline=&#60;type&#62; set newline type
+ offset=&#60;n&#62; set starting offset
+ ovector=&#60;n&#62; set size of output vector
+ recursion_limit=&#60;n&#62; set a recursion limit
+</pre>
+The effects of these modifiers are described in the following sections.
+FIXME: Give more examples.
+</P>
+<br><b>
+Newline and \R handling
+</b><br>
+<P>
+These modifiers set the newline and \R processing conventions for the subject
+line, overriding any values that were set at compile time (as described above).
+JIT matching is disabled if these settings are changed at match time.
+</P>
+<br><b>
+Showing more text
+</b><br>
+<P>
+The <b>aftertext</b> modifier requests that as well as outputting the substring
+that matched the entire pattern, <b>pcre2test</b> should in addition output the
+remainder of the subject string. This is useful for tests where the subject
+contains multiple copies of the same substring. The <b>allaftertext</b> modifier
+requests the same action for captured substrings as well as the main matched
+substring. In each case the remainder is output on the following line with a
+plus character following the capture number.
+</P>
+<P>
+The <b>allusedtext</b> modifier requests that all the text that was consulted
+during a successful pattern match be shown. This affects the output if there
+is a lookbehind at the start of a match, or a lookahead at the end, or if \K
+is used in the pattern. Characters that precede or follow the start and end of
+the actual match are indicated in the output by '&#60;' or '&#62;' characters
+underneath them. Here is an example:
+<pre>
+ /(?&#60;=pqr)abc(?=xyz)/
+ 123pqrabcxyz456\=allusedtext
+ 0: pqrabcxyz
+ &#60;&#60;&#60; &#62;&#62;&#62;
+</pre>
+This shows that the matched string is "abc", with the preceding and following
+strings "pqr" and "xyz" also consulted during the match.
+</P>
+<br><b>
+Showing the value of all capture groups
+</b><br>
+<P>
+The <b>allcaptures</b> modifier requests that the values of all potential
+captured parentheses be output after a match. By default, only those up to the
+highest one actually used in the match are output (corresponding to the return
+code from <b>pcre2_match()</b>). Groups that did not take part in the match
+are output as "&#60;unset&#62;".
+</P>
+<br><b>
+Testing callouts
+</b><br>
+<P>
+A callout function is supplied when <b>pcre2test</b> calls the library matching
+functions, unless <b>callout_none</b> is specified. If <b>callout_capture</b> is
+set, the current captured groups are output when a callout occurs.
+</P>
+<P>
+The <b>callout_fail</b> modifier can be given one or two numbers. If there is
+only one number, 1 is returned instead of 0 when a callout of that number is
+reached. If two numbers are given, 1 is returned when callout &#60;n&#62; is reached
+for the &#60;m&#62;th time.
+</P>
+<P>
+The <b>callout_data</b> modifier can be given an unsigned or a negative number.
+Any value other than zero is used as a return from <b>pcre2test</b>'s callout
+function.
+</P>
+<br><b>
+Testing substring extraction functions
+</b><br>
+<P>
+The <b>copy</b> and <b>get</b> modifiers can be used to test the
+<b>pcre2_substring_copy_xxx()</b> and <b>pcre2_substring_get_xxx()</b> functions.
+They can be given more than once, and each can specify a group name or number,
+for example:
+<pre>
+ abcd\=copy=1,copy=3,get=G1
+</pre>
+If the <b>#subject</b> command is used to set default copy and get lists, these
+can be unset by specifying a negative number for numbered groups and an empty
+name for named groups.
+</P>
+<P>
+The <b>getall</b> modifier tests <b>pcre2_substring_list_get()</b>, which
+extracts all captured substrings.
+</P>
+<P>
+If the subject line is successfully matched, the substrings extracted by the
+convenience functions are output with C, G, or L after the string number
+instead of a colon. This is in addition to the normal full list. The string
+length (that is, the return from the extraction function) is given in
+parentheses after each substring.
+</P>
+<br><b>
+Finding all matches in a string
+</b><br>
+<P>
+Searching for all possible matches within a subject can be requested by the
+<b>global</b> or <b>altglobal</b> modifier. After finding a match, the matching
+function is called again to search the remainder of the subject. The difference
+between <b>global</b> and <b>altglobal</b> is that the former uses the
+<i>start_offset</i> argument to <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>
+to start searching at a new point within the entire string (which is what Perl
+does), whereas the latter passes over a shortened substring. This makes a
+difference to the matching process if the pattern begins with a lookbehind
+assertion (including \b or \B).
+</P>
+<P>
+If an empty string is matched, the next match is done with the
+PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for
+another, non-empty, match at the same point in the subject. If this match
+fails, the start offset is advanced, and the normal match is retried. This
+imitates the way Perl handles such cases when using the <b>/g</b> modifier or
+the <b>split()</b> function. Normally, the start offset is advanced by one
+character, but if the newline convention recognizes CRLF as a newline, and the
+current character is CR followed by LF, an advance of two is used.
+</P>
+<br><b>
+Setting the JIT stack size
+</b><br>
+<P>
+The <b>jitstack</b> modifier provides a way of setting the maximum stack size
+that is used by the just-in-time optimization code. It is ignored if JIT
+optimization is not being used. Providing a stack that is larger than the
+default 32K is necessary only for very complicated patterns.
+</P>
+<br><b>
+Setting match and recursion limits
+</b><br>
+<P>
+The <b>match_limit</b> and <b>recursion_limit</b> modifiers set the appropriate
+limits in the match context. These values are ignored when the
+<b>find_limits</b> modifier is specified.
+</P>
+<br><b>
+Finding minimum limits
+</b><br>
+<P>
+If the <b>find_limits</b> modifier is present, <b>pcre2test</b> calls
+<b>pcre2_match()</b> several times, setting different values in the match
+context via <b>pcre2_set_match_limit()</b> and <b>pcre2_set_recursion_limit()</b>
+until it finds the minimum values for each parameter that allow
+<b>pcre2_match()</b> to complete without error.
+</P>
+<P>
+The <i>match_limit</i> number is a measure of the amount of backtracking
+that takes place, and learning the minimum value can be instructive. For most
+simple matches, the number is quite small, but for patterns with very large
+numbers of matching possibilities, it can become large very quickly with
+increasing length of subject string. The <i>recursion_limit</i> number is
+a measure of how much stack (or, if PCRE2 is compiled with NO_RECURSE, how much
+heap) memory is needed to complete the match attempt.
+</P>
+<br><b>
+Showing MARK names
+</b><br>
+<P>
+The <b>mark</b> modifier causes the names from backtracking control verbs that
+are returned from calls to <b>pcre2_match()</b> to be displayed. If a mark is
+returned for a match, non-match, or partial match, <b>pcre2test</b> shows it.
+For a match, it is on a line by itself, tagged with "MK:". Otherwise, it
+is added to the non-match message.
+</P>
+<br><b>
+Showing memory usage
+</b><br>
+<P>
+The <b>memory</b> modifier causes <b>pcre2test</b> to log all memory allocation
+and freeing calls that occur during a match operation.
+</P>
+<br><b>
+Setting a starting offset
+</b><br>
+<P>
+The <b>offset</b> modifier sets an offset in the subject string at which
+matching starts. Its value is a number of code units, not characters.
+</P>
+<br><b>
+Setting the size of the output vector
+</b><br>
+<P>
+The <b>ovector</b> modifier applies only to the subject line in which it
+appears, though of course it can also be used to set a default in a
+<b>#subject</b> command. It specifies the number of pairs of offsets that are
+available for storing matching information. The default is 15.
+</P>
+<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
+<P>
+By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
+<b>pcre2_match()</b> to match each subject line. PCRE2 also supports an
+alternative matching function, <b>pcre2_dfa_match()</b>, which operates in a
+different way, and has some restrictions. The differences between the two
+functions are described in the
+<a href="pcre2matching.html"><b>pcre2matching</b></a>
+documentation.
+</P>
+<P>
+If the <b>dfa</b> modifier is set, the alternative matching function is used.
+This function finds all possible matches at a given point in the subject. If,
+however, the <b>dfa_shortest</b> modifier is set, processing stops after the
+first match is found. This is always the shortest possible match.
+</P>
+<br><a name="SEC13" href="#TOC1">DEFAULT OUTPUT FROM pcre2test</a><br>
+<P>
+This section describes the output when the normal matching function,
+<b>pcre2_match()</b>, is being used.
+</P>
+<P>
+When a match succeeds, <b>pcre2test</b> outputs the list of captured substrings,
+starting with number 0 for the string that matched the whole pattern.
+Otherwise, it outputs "No match" when the return is PCRE2_ERROR_NOMATCH, or
+"Partial match:" followed by the partially matching substring when the
+return is PCRE2_ERROR_PARTIAL. (Note that this is the
+entire substring that was inspected during the partial match; it may include
+characters before the actual match start if a lookbehind assertion, \K, \b,
+or \B was involved.)
+</P>
+<P>
+For any other return, <b>pcre2test</b> outputs the PCRE2
+negative error number and a short descriptive phrase. If the error is a failed
+UTF string check, the offset of the start of the failing character and the
+reason code are also output. Here is an example of an interactive
+<b>pcre2test</b> run.
+<pre>
+ $ pcre2test
+ PCRE2 version 9.00 2014-05-10
+
+ re&#62; /^abc(\d+)/
+ data&#62; abc123
+ 0: abc123
+ 1: 123
+ data&#62; xyz
+ No match
+</pre>
+Unset capturing substrings that are not followed by one that is set are not
+returned by <b>pcre2_match()</b>, and are not shown by <b>pcre2test</b>. In the
+following example, there are two capturing substrings, but when the first data
+line is matched, the second, unset substring is not shown. An "internal" unset
+substring is shown as "&#60;unset&#62;", as for the second data line.
+<pre>
+ re&#62; /(a)|(b)/
+ data&#62; a
+ 0: a
+ 1: a
+ data&#62; b
+ 0: b
+ 1: &#60;unset&#62;
+ 2: b
+</pre>
+If the strings contain any non-printing characters, they are output as \xhh
+escapes if the value is less than 256 and UTF mode is not set. Otherwise they
+are output as \x{hh...} escapes. See below for the definition of non-printing
+characters. If the <b>/aftertext</b> modifier is set, the output for substring
+0 is followed by the rest of the subject string, identified by "0+" like
+this:
+<pre>
+ re&#62; /cat/aftertext
+ data&#62; cataract
+ 0: cat
+ 0+ aract
+</pre>
+If global matching is requested, the results of successive matching attempts
+are output in sequence, like this:
+<pre>
+ re&#62; /\Bi(\w\w)/g
+ data&#62; Mississippi
+ 0: iss
+ 1: ss
+ 0: iss
+ 1: ss
+ 0: ipp
+ 1: pp
+</pre>
+"No match" is output only if the first match attempt fails. Here is an example
+of a failure message (the offset 4 that is specified by the <b>offset</b>
+modifier is past the end of the subject string):
+<pre>
+ re&#62; /xyz/
+ data&#62; xyz\=offset=4
+ Error -24 (bad offset value)
+</PRE>
+</P>
+<P>
+Note that whereas patterns can be continued over several lines (a plain "&#62;"
+prompt is used for continuations), subject lines may not. However newlines can
+be included in a subject by means of the \n escape (or \r, \r\n, etc.,
+depending on the newline sequence setting).
+</P>
+<br><a name="SEC14" href="#TOC1">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a><br>
+<P>
+When the alternative matching function, <b>pcre2_dfa_match()</b>, is used, the
+output consists of a list of all the matches that start at the first point in
+the subject where there is at least one match. For example:
+<pre>
+ re&#62; /(tang|tangerine|tan)/
+ data&#62; yellow tangerine\=dfa
+ 0: tangerine
+ 1: tang
+ 2: tan
+</pre>
+(Using the normal matching function on this data finds only "tang".) The
+longest matching string is always given first (and numbered zero). After a
+PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
+partially matching substring. (Note that this is the entire substring that was
+inspected during the partial match; it may include characters before the actual
+match start if a lookbehind assertion, \K, \b, or \B was involved.)
+</P>
+<P>
+If global matching is requested, the search for further matches resumes
+at the end of the longest match. For example:
+<pre>
+ re&#62; /(tang|tangerine|tan)/g
+ data&#62; yellow tangerine and tangy sultana\=dfa
+ 0: tangerine
+ 1: tang
+ 2: tan
+ 0: tang
+ 1: tan
+ 0: tan
+</pre>
+The alternative matching function does not support substring capture, so the
+modifiers that are concerned with captured substrings are not relevant.
+</P>
+<br><a name="SEC15" href="#TOC1">RESTARTING AFTER A PARTIAL MATCH</a><br>
+<P>
+When the alternative matching function has given the PCRE2_ERROR_PARTIAL
+return, indicating that the subject partially matched the pattern, you can
+restart the match with additional subject data by means of the
+<b>dfa_restart</b> modifier. For example:
+<pre>
+ re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data&#62; 23ja\=P,dfa
+ Partial match: 23ja
+ data&#62; n05\=dfa,dfa_restart
+ 0: n05
+</pre>
+For further information about partial matching, see the
+<a href="pcre2partial.html"><b>pcre2partial</b></a>
+documentation.
+</P>
+<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
+<P>
+If the pattern contains any callout requests, <b>pcre2test</b>'s callout function
+is called during matching. This works with both matching functions. By default,
+the called function displays the callout number, the start and current
+positions in the text at the callout time, and the next pattern item to be
+tested. For example:
+<pre>
+ ---&#62;pqrabcdef
+ 0 ^ ^ \d
+</pre>
+This output indicates that callout number 0 occurred for a match attempt
+starting at the fourth character of the subject string, when the pointer was at
+the seventh character, and when the next pattern item was \d. Just
+one circumflex is output if the start and current positions are the same.
+</P>
+<P>
+Callouts numbered 255 are assumed to be automatic callouts, inserted as a
+result of the <b>/auto_callout</b> pattern modifier. In this case, instead of
+showing the callout number, the offset in the pattern, preceded by a plus, is
+output. For example:
+<pre>
+ re&#62; /\d?[A-E]\*/auto_callout
+ data&#62; E*
+ ---&#62;E*
+ +0 ^ \d?
+ +3 ^ [A-E]
+ +8 ^^ \*
+ +10 ^ ^
+ 0: E*
+</pre>
+If a pattern contains (*MARK) items, an additional line is output whenever
+a change of latest mark is passed to the callout function. For example:
+<pre>
+ re&#62; /a(*MARK:X)bc/auto_callout
+ data&#62; abc
+ ---&#62;abc
+ +0 ^ a
+ +1 ^^ (*MARK:X)
+ +10 ^^ b
+ Latest Mark: X
+ +11 ^ ^ c
+ +12 ^ ^
+ 0: abc
+</pre>
+The mark changes between matching "a" and "b", but stays the same for the rest
+of the match, so nothing more is output. If, as a result of backtracking, the
+mark reverts to being unset, the text "&#60;unset&#62;" is output.
+</P>
+<P>
+The callout function in <b>pcre2test</b> returns zero (carry on matching) by
+default, but you can use a <b>callout_fail</b> modifier in a subject line (as
+described above) to change this and other parameters of the callout.
+</P>
+<P>
+Inserting callouts can be helpful when using <b>pcre2test</b> to check
+complicated regular expressions. For further information about callouts, see
+the
+<a href="pcre2callout.html"><b>pcre2callout</b></a>
+documentation.
+</P>
+<br><a name="SEC17" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
+<P>
+When <b>pcre2test</b> is outputting text in the compiled version of a pattern,
+bytes other than 32-126 are always treated as non-printing characters and are
+therefore shown as hex escapes.
+</P>
+<P>
+When <b>pcre2test</b> is outputting text that is a matched part of a subject
+string, it behaves in the same way, unless a different locale has been set for
+the pattern (using the <b>/locale</b> modifier). In this case, the
+<b>isprint()</b> function is used to distinguish printing and non-printing
+characters.
+</P>
+<br><a name="SEC18" href="#TOC1">SEE ALSO</a><br>
+<P>
+<b>pcre2</b>(3), <b>pcre16</b>(3), <b>pcre32</b>(3), <b>pcre2api</b>(3),
+<b>pcre2callout</b>(3),
+<b>pcre2jit</b>(3), <b>pcre2matching</b>(3), <b>pcre2partial</b>(3),
+<b>pcre2pattern</b>(3), <b>pcre2precompile</b>(3).
+</P>
+<br><a name="SEC19" href="#TOC1">AUTHOR</a><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><a name="SEC20" href="#TOC1">REVISION</a><br>
+<P>
+Last updated: 19 August 2014
+<br>
+Copyright &copy; 1997-2014 University of Cambridge.
+<br>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
diff --git a/doc/html/pcre2unicode.html b/doc/html/pcre2unicode.html
new file mode 100644
index 0000000..bbefd02
--- /dev/null
+++ b/doc/html/pcre2unicode.html
@@ -0,0 +1,270 @@
+<html>
+<head>
+<title>pcre2unicode specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre2unicode man page</h1>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
+<p>
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+<br>
+<br><b>
+UNICODE AND UTF SUPPORT
+</b><br>
+<P>
+When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
+character properties and can process text strings in UTF-8, UTF-16, or UTF-32
+format (depending on the code unit width). By default, PCRE2 assumes that one
+code unit is one character. To process a pattern as a UTF string, where a
+character may require more than one code unit, you must call
+<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
+with the PCRE2_UTF option flag, or the pattern must start with the sequence
+(*UTF). When either of these is the case, both the pattern and any subject
+strings that are matched against it are treated as UTF strings instead of
+strings of individual one-code-unit characters.
+</P>
+<P>
+If you build PCRE2 with Unicode support, the library will be bigger, but the
+additional run time overhead is limited to testing the PCRE2_UTF flag
+occasionally, so should not be very much.
+</P>
+<br><b>
+UNICODE PROPERTY SUPPORT
+</b><br>
+<P>
+When PCRE2 is built with Unicode support, the escape sequences \p{..},
+\P{..}, and \X can be used. The Unicode properties that can be tested are
+limited to the general category properties such as Lu for an upper case letter
+or Nd for a decimal number, the Unicode script names such as Arabic or Han, and
+the derived properties Any and L&. Full lists are given in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+and
+<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
+documentation. Only the short names for properties are supported. For example,
+\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
+Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
+compatibility with Perl 5.6. PCRE2 does not support this.
+</P>
+<br><b>
+WIDE CHARACTERS AND UTF MODES
+</b><br>
+<P>
+Codepoints less than 256 can be specified in patterns by either braced or
+unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). Larger
+values have to use braced sequences. Unbraced octal code points up to \777 are
+also recognized; larger ones can be coded using \o{...}.
+</P>
+<P>
+In UTF modes, repeat quantifiers apply to complete UTF characters, not to
+individual code units.
+</P>
+<P>
+In UTF modes, the dot metacharacter matches one UTF character instead of a
+single code unit.
+</P>
+<P>
+The escape sequence \C can be used to match a single code unit, in a UTF mode,
+but its use can lead to some strange effects because it breaks up multi-unit
+characters (see the description of \C in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation). The use of \C is not supported in the alternative matching
+function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT
+optimization. If JIT optimization is requested for a UTF pattern that contains
+\C, it will not succeed, and so the matching will be carried out by the normal
+interpretive function.
+</P>
+<P>
+The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
+characters of any code value, but, by default, the characters that PCRE2
+recognizes as digits, spaces, or word characters remain the same set as in
+non-UTF mode, all with code points less than 256. This remains true even when
+PCRE2 is built to include Unicode support, because to do otherwise would slow
+down matching in many common cases. Note that this also applies to \b
+and \B, because they are defined in terms of \w and \W. If you want
+to test for a wider sense of, say, "digit", you can use explicit Unicode
+property tests such as \p{Nd}. Alternatively, if you set the PCRE2_UCP option,
+the way that the character escapes work is changed so that Unicode properties
+are used to determine which characters match. There are more details in the
+section on
+<a href="pcre2pattern.html#genericchartypes">generic character types</a>
+in the
+<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
+documentation.
+</P>
+<P>
+Similarly, characters that match the POSIX named character classes are all
+low-valued characters, unless the PCRE2_UCP option is set.
+</P>
+<P>
+However, the special horizontal and vertical white space matching escapes (\h,
+\H, \v, and \V) do match all the appropriate Unicode characters, whether or
+not PCRE2_UCP is set.
+</P>
+<P>
+Case-insensitive matching in UTF mode makes use of Unicode properties. A few
+Unicode characters such as Greek sigma have more than two codepoints that are
+case-equivalent, and these are treated as such.
+</P>
+<br><b>
+VALIDITY OF UTF STRINGS
+</b><br>
+<P>
+When the PCRE2_UTF option is set, the strings passed as patterns and subjects
+are (by default) checked for validity on entry to the relevant functions.
+If an invalid UTF string is passed, an error return is given.
+</P>
+<P>
+UTF-16 and UTF-32 strings can indicate their endianness by a special code known
+as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
+strings to be in host byte order.
+</P>
+<P>
+The entire string is checked before any other processing takes place. In
+addition to checking the format of the string, there is a check to ensure that
+all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
+The so-called "non-character" code points are not excluded because Unicode
+corrigendum #9 makes it clear that they should not be.
+</P>
+<P>
+Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
+where they are used in pairs to encode code points with values greater than
+0xFFFF. The code points that are encoded by UTF-16 pairs are available
+independently in the UTF-8 and UTF-32 encodings. (In other words, the whole
+surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and
+UTF-32.)
+</P>
+<P>
+In some situations, you may already know that your strings are valid, and
+therefore want to skip these checks in order to improve performance, for
+example in the case of a long subject string that is being scanned repeatedly.
+If you set the PCRE2_NO_UTF_CHECK flag at compile time or at run time, PCRE2
+assumes that the pattern or subject it is given (respectively) contains only
+valid UTF code unit sequences.
+</P>
+<P>
+Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the check for
+the pattern; it does not also apply to subject strings. If you want to disable
+the check for a subject string you must pass this option to <b>pcre2_match()</b>
+or <b>pcre2_dfa_match()</b>.
+</P>
+<P>
+If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
+is undefined and your program may crash or loop indefinitely.
+<a name="utf8strings"></a></P>
+<br><b>
+Errors in UTF-8 strings
+</b><br>
+<P>
+The following negative error codes are given for invalid UTF-8 strings:
+<pre>
+ PCRE2_ERROR_UTF8_ERR1
+ PCRE2_ERROR_UTF8_ERR2
+ PCRE2_ERROR_UTF8_ERR3
+ PCRE2_ERROR_UTF8_ERR4
+ PCRE2_ERROR_UTF8_ERR5
+</pre>
+The string ends with a truncated UTF-8 character; the code specifies how many
+bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be
+no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279)
+allows for up to 6 bytes, and this is checked first; hence the possibility of
+4 or 5 missing bytes.
+<pre>
+ PCRE2_ERROR_UTF8_ERR6
+ PCRE2_ERROR_UTF8_ERR7
+ PCRE2_ERROR_UTF8_ERR8
+ PCRE2_ERROR_UTF8_ERR9
+ PCRE2_ERROR_UTF8_ERR10
+</pre>
+The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
+character do not have the binary value 0b10 (that is, either the most
+significant bit is 0, or the next bit is 1).
+<pre>
+ PCRE2_ERROR_UTF8_ERR11
+ PCRE2_ERROR_UTF8_ERR12
+</pre>
+A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
+these code points are excluded by RFC 3629.
+<pre>
+ PCRE2_ERROR_UTF8_ERR13
+</pre>
+A 4-byte character has a value greater than 0x10ffff; these code points are
+excluded by RFC 3629.
+<pre>
+ PCRE2_ERROR_UTF8_ERR14
+</pre>
+A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of
+code points is reserved by RFC 3629 for use with UTF-16, and so is excluded
+from UTF-8.
+<pre>
+ PCRE2_ERROR_UTF8_ERR15
+ PCRE2_ERROR_UTF8_ERR16
+ PCRE2_ERROR_UTF8_ERR17
+ PCRE2_ERROR_UTF8_ERR18
+ PCRE2_ERROR_UTF8_ERR19
+</pre>
+A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
+value that can be represented by fewer bytes, which is invalid. For example,
+the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just
+one byte.
+<pre>
+ PCRE2_ERROR_UTF8_ERR20
+</pre>
+The two most significant bits of the first byte of a character have the binary
+value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
+byte can only validly occur as the second or subsequent byte of a multi-byte
+character.
+<pre>
+ PCRE2_ERROR_UTF8_ERR21
+</pre>
+The first byte of a character has the value 0xfe or 0xff. These values can
+never occur in a valid UTF-8 string.
+<a name="utf16strings"></a></P>
+<br><b>
+Errors in UTF-16 strings
+</b><br>
+<P>
+The following negative error codes are given for invalid UTF-16 strings:
+<pre>
+ PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
+ PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate
+ PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
+
+<a name="utf32strings"></a></PRE>
+</P>
+<br><b>
+Errors in UTF-32 strings
+</b><br>
+<P>
+The following negative error codes are given for invalid UTF-32 strings:
+<pre>
+ PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
+ PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff
+
+</PRE>
+</P>
+<br><b>
+AUTHOR
+</b><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><b>
+REVISION
+</b><br>
+<P>
+Last updated: 16 September 2014
+<br>
+Copyright &copy; 1997-2014 University of Cambridge.
+<br>
+<p>
+Return to the <a href="index.html">PCRE2 index page</a>.
+</p>
diff --git a/doc/index.html.src b/doc/index.html.src
new file mode 100644
index 0000000..4e264ec
--- /dev/null
+++ b/doc/index.html.src
@@ -0,0 +1,177 @@
+<html>
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE2 documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
+ created by the 132html script.
+-->
+<head>
+<title>PCRE2 specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>Perl-compatible Regular Expressions (revised API: PCRE2)</h1>
+<p>
+The HTML documentation for PCRE2 consists of a number of pages that are listed
+below in alphabetical order. If you are new to PCRE2, please read the first one
+first.
+</p>
+
+<table>
+<tr><td><a href="pcre2.html">pcre2</a></td>
+ <td>&nbsp;&nbsp;Introductory page</td></tr>
+
+<tr><td><a href="pcre2-config.html">pcre2-config</a></td>
+ <td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
+
+<tr><td><a href="pcre2api.html">pcre2api</a></td>
+ <td>&nbsp;&nbsp;PCRE2's native API</td></tr>
+
+<tr><td><a href="pcre2build.html">pcre2build</a></td>
+ <td>&nbsp;&nbsp;Building PCRE2</td></tr>
+
+<tr><td><a href="pcre2callout.html">pcre2callout</a></td>
+ <td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
+
+<tr><td><a href="pcre2compat.html">pcre2compat</a></td>
+ <td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
+
+<tr><td><a href="pcre2demo.html">pcre2demo</a></td>
+ <td>&nbsp;&nbsp;A demonstration C program that uses the PCRE2 library</td></tr>
+
+<tr><td><a href="pcre2grep.html">pcre2grep</a></td>
+ <td>&nbsp;&nbsp;The <b>pcre2grep</b> command</td></tr>
+
+<tr><td><a href="pcre2jit.html">pcre2jit</a></td>
+ <td>&nbsp;&nbsp;Discussion of the just-in-time optimization support</td></tr>
+
+<tr><td><a href="pcre2limits.html">pcre2limits</a></td>
+ <td>&nbsp;&nbsp;Details of size and other limits</td></tr>
+
+<tr><td><a href="pcre2matching.html">pcre2matching</a></td>
+ <td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
+
+<tr><td><a href="pcre2partial.html">pcre2partial</a></td>
+ <td>&nbsp;&nbsp;Using PCRE2 for partial matching</td></tr>
+
+<tr><td><a href="pcre2pattern.html">pcre2pattern</a></td>
+ <td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE2</td></tr>
+
+<tr><td><a href="pcre2perform.html">pcre2perform</a></td>
+ <td>&nbsp;&nbsp;Some comments on performance</td></tr>
+
+<tr><td><a href="pcre2posix.html">pcre2posix</a></td>
+ <td>&nbsp;&nbsp;The POSIX API to the PCRE2 8-bit library</td></tr>
+
+<tr><td><a href="pcre2precompile.html">pcre2precompile</a></td>
+ <td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
+
+<tr><td><a href="pcre2sample.html">pcre2sample</a></td>
+ <td>&nbsp;&nbsp;Discussion of the pcre2demo program</td></tr>
+
+<tr><td><a href="pcre2stack.html">pcre2stack</a></td>
+ <td>&nbsp;&nbsp;Discussion of PCRE2's stack usage</td></tr>
+
+<tr><td><a href="pcre2syntax.html">pcre2syntax</a></td>
+ <td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
+
+<tr><td><a href="pcre2test.html">pcre2test</a></td>
+ <td>&nbsp;&nbsp;The <b>pcre2test</b> command for testing PCRE2</td></tr>
+
+<tr><td><a href="pcre2unicode.html">pcre2unicode</a></td>
+ <td>&nbsp;&nbsp;Discussion of Unicode and UTF-8/UTF-16/UTF-32 support</td></tr>
+</table>
+
+<p>
+There are also individual pages that summarize the interface for each function
+in the library. There is a single page for each triple of 8-bit/16-bit/32-bit
+functions.
+</p>
+
+<table>
+
+<tr><td><a href="pcre2_assign_jit_stack.html">pcre2_assign_jit_stack</a></td>
+ <td>&nbsp;&nbsp;Assign stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre2_compile.html">pcre2_compile</a></td>
+ <td>&nbsp;&nbsp;Compile a regular expression</td></tr>
+
+<tr><td><a href="pcre2_compile2.html">pcre2_compile2</a></td>
+ <td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
+
+<tr><td><a href="pcre2_config.html">pcre2_config</a></td>
+ <td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
+
+<tr><td><a href="pcre2_copy_named_substring.html">pcre2_copy_named_substring</a></td>
+ <td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
+
+<tr><td><a href="pcre2_copy_substring.html">pcre2_copy_substring</a></td>
+ <td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
+
+<tr><td><a href="pcre2_dfa_exec.html">pcre2_dfa_exec</a></td>
+ <td>&nbsp;&nbsp;Match a compiled pattern to a subject string
+ (DFA algorithm; <i>not</i> Perl compatible)</td></tr>
+
+<tr><td><a href="pcre2_exec.html">pcre2_exec</a></td>
+ <td>&nbsp;&nbsp;Match a compiled pattern to a subject string
+ (Perl compatible)</td></tr>
+
+<tr><td><a href="pcre2_free_study.html">pcre2_free_study</a></td>
+ <td>&nbsp;&nbsp;Free study data</td></tr>
+
+<tr><td><a href="pcre2_free_substring.html">pcre2_free_substring</a></td>
+ <td>&nbsp;&nbsp;Free extracted substring</td></tr>
+
+<tr><td><a href="pcre2_free_substring_list.html">pcre2_free_substring_list</a></td>
+ <td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
+
+<tr><td><a href="pcre2_fullinfo.html">pcre2_fullinfo</a></td>
+ <td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
+
+<tr><td><a href="pcre2_get_named_substring.html">pcre2_get_named_substring</a></td>
+ <td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
+
+<tr><td><a href="pcre2_get_stringnumber.html">pcre2_get_stringnumber</a></td>
+ <td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
+
+<tr><td><a href="pcre2_get_stringtable_entries.html">pcre2_get_stringtable_entries</a></td>
+ <td>&nbsp;&nbsp;Find table entries for given string name</td></tr>
+
+<tr><td><a href="pcre2_get_substring.html">pcre2_get_substring</a></td>
+ <td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
+
+<tr><td><a href="pcre2_get_substring_list.html">pcre2_get_substring_list</a></td>
+ <td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
+
+<tr><td><a href="pcre2_jit_exec.html">pcre2_jit_exec</a></td>
+ <td>&nbsp;&nbsp;Fast path interface to JIT matching</td></tr>
+
+<tr><td><a href="pcre2_jit_stack_alloc.html">pcre2_jit_stack_alloc</a></td>
+ <td>&nbsp;&nbsp;Create a stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre2_jit_stack_free.html">pcre2_jit_stack_free</a></td>
+ <td>&nbsp;&nbsp;Free a JIT matching stack</td></tr>
+
+<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
+ <td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
+
+<tr><td><a href="pcre2_pattern_to_host_byte_order.html">pcre2_pattern_to_host_byte_order</a></td>
+ <td>&nbsp;&nbsp;Convert compiled pattern to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_refcount.html">pcre2_refcount</a></td>
+ <td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
+
+<tr><td><a href="pcre2_study.html">pcre2_study</a></td>
+ <td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
+
+<tr><td><a href="pcre2_utf16_to_host_byte_order.html">pcre2_utf16_to_host_byte_order</a></td>
+ <td>&nbsp;&nbsp;Convert UTF-16 string to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_utf32_to_host_byte_order.html">pcre2_utf32_to_host_byte_order</a></td>
+ <td>&nbsp;&nbsp;Convert UTF-32 string to host byte order if necessary</td></tr>
+
+<tr><td><a href="pcre2_version.html">pcre2_version</a></td>
+ <td>&nbsp;&nbsp;Return PCRE2 version and release date</td></tr>
+</table>
+
+</html>
+
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
new file mode 100644
index 0000000..52b7406
--- /dev/null
+++ b/doc/pcre2.txt
@@ -0,0 +1,2903 @@
+-----------------------------------------------------------------------------
+This file contains a concatenation of the PCRE2 man pages, converted to plain
+text format for ease of searching with a text editor, or for use on systems
+that do not have a man page processor. The small individual files that give
+synopses of each function in the library have not been included. Neither has
+the pcre2demo program. There are separate text files for the pcre2grep and
+pcre2test commands.
+-----------------------------------------------------------------------------
+
+
+PCRE2API(3) Library Functions Manual PCRE2API(3)
+
+
+
+NAME
+ PCRE2 - Perl-compatible regular expressions (revised API)
+
+ #include <pcre2.h>
+
+ PCRE2 is a new API for PCRE. This document contains a description of
+ all its functions. See the pcre2 document for an overview of all the
+ PCRE2 documentation.
+
+
+PCRE2 NATIVE API BASIC FUNCTIONS
+
+ pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
+ uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
+ pcre2_compile_context *ccontext);
+
+ void pcre2_code_free(pcre2_code *code);
+
+ pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
+ pcre2_general_context *gcontext);
+
+ pcre2_match_data *pcre2_match_data_create_from_pattern(
+ pcre2_code *code, pcre2_general_context *gcontext);
+
+ int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext);
+
+ int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext,
+ int *workspace, PCRE2_SIZE wscount);
+
+ void pcre2_match_data_free(pcre2_match_data *match_data);
+
+
+PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS
+
+ PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *match_data);
+
+ PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
+
+ uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
+
+ PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
+
+ PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *match_data);
+
+ PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
+
+
+PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS
+
+ pcre2_general_context *pcre2_general_context_create(
+ void *(*private_malloc)(PCRE2_SIZE, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+ pcre2_general_context *pcre2_general_context_copy(
+ pcre2_general_context *gcontext);
+
+ void pcre2_general_context_free(pcre2_general_context *gcontext);
+
+
+PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS
+
+ pcre2_compile_context *pcre2_compile_context_create(
+ pcre2_general_context *gcontext);
+
+ pcre2_compile_context *pcre2_compile_context_copy(
+ pcre2_compile_context *ccontext);
+
+ void pcre2_compile_context_free(pcre2_compile_context *ccontext);
+
+ int pcre2_set_bsr_compile(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+ int pcre2_set_character_tables(pcre2_compile_context *ccontext,
+ const unsigned char *tables);
+
+ int pcre2_set_newline_compile(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+ int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+ int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
+ int (*guard_function)(uint32_t));
+
+
+PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
+
+ pcre2_match_context *pcre2_match_context_create(
+ pcre2_general_context *gcontext);
+
+ pcre2_match_context *pcre2_match_context_copy(
+ pcre2_match_context *mcontext);
+
+ void pcre2_match_context_free(pcre2_match_context *mcontext);
+
+ int pcre2_set_bsr_match(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ int pcre2_set_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_callout_block *),
+ void *callout_data);
+
+ int pcre2_set_match_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ int pcre2_set_newline_match(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ int pcre2_set_recursion_memory_management(
+ pcre2_match_context *mcontext,
+ void *(*private_malloc)(PCRE2_SIZE, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+
+PCRE2 NATIVE API STRING EXTRACTION FUNCTIONS
+
+ int pcre2_substring_copy_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
+
+ int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
+ unsigned int number, PCRE2_UCHAR *buffer,
+ PCRE2_SIZE *bufflen);
+
+ void pcre2_substring_free(PCRE2_UCHAR *buffer);
+
+ int pcre2_substring_get_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
+
+ int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
+ unsigned int number, PCRE2_UCHAR **bufferptr,
+ PCRE2_SIZE *bufflen);
+
+ int pcre2_substring_length_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_SIZE *length);
+
+ int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
+ unsigned int number, PCRE2_SIZE *length);
+
+ int pcre2_substring_nametable_scan(const pcre2_code *code,
+ PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
+
+ int pcre2_substring_number_from_name(const pcre2_code *code,
+ PCRE2_SPTR name);
+
+ void pcre2_substring_list_free(PCRE2_SPTR *list);
+
+ int pcre2_substring_list_get(pcre2_match_data *match_data,
+ PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
+
+
+PCRE2 NATIVE API JIT FUNCTIONS
+
+ int pcre2_jit_compile(pcre2_code *code, uint32_t options);
+
+ int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext, pcre2_jit_stack *jit_stack);
+
+ void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
+
+ pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *gcontext,
+ PCRE2_SIZE startsize, PCRE2_SIZE maxsize);
+
+ void pcre2_jit_stack_assign(const pcre2_code *code,
+ pcre2_jit_callback callback_function, void *callback_data);
+
+ void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
+
+
+PCRE2 NATIVE API AUXILIARY FUNCTIONS
+
+ int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
+ PCRE2_SIZE bufflen);
+
+ const unsigned char *pcre2_maketables(pcre2_general_context *gcontext);
+
+ int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
+ void *where);
+
+ int pcre2_config(uint32_t what, void *where, PCRE2_SIZE length);
+
+
+PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES
+
+ There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit
+ code units, respectively. However, there is just one header file,
+ pcre2.h. This contains the function prototypes and other definitions
+ for all three libraries. One, two, or all three can be installed simul-
+ taneously. On Unix-like systems the libraries are called libpcre2-8,
+ libpcre2-16, and libpcre2-32, and they can also co-exist with the orig-
+ inal PCRE libraries.
+
+ Character strings are passed to and from a PCRE2 library as a sequence
+ of unsigned integers in code units of the appropriate width. Every
+ PCRE2 function comes in three different forms, one for each library,
+ for example:
+
+ pcre2_compile_8()
+ pcre2_compile_16()
+ pcre2_compile_32()
+
+ There are also three different sets of data types:
+
+ PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32
+ PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32
+
+ The UCHAR types define unsigned code units of the appropriate widths.
+ For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. The SPTR
+ types are constant pointers to the equivalent UCHAR types, that is,
+ they are pointers to vectors of unsigned code units.
+
+ Many applications use only one code unit width. For their convenience,
+ macros are defined whose names are the generic forms such as pcre2_com-
+ pile() and PCRE2_SPTR. These macros use the value of the macro
+ PCRE2_CODE_UNIT_WIDTH to generate the appropriate width-specific func-
+ tion and macro names. PCRE2_CODE_UNIT_WIDTH is not defined by default.
+ An application must define it to be 8, 16, or 32 before including
+ pcre2.h in order to make use of the generic names.
+
+ Applications that use more than one code unit width can be linked with
+ more than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to
+ be 0 before including pcre2.h, and then use the real function names.
+ Any code that is to be included in an environment where the value of
+ PCRE2_CODE_UNIT_WIDTH is unknown should also use the real function
+ names. (Unfortunately, it is not possible in C code to save and restore
+ the value of a macro.)
+
+ If PCRE2_CODE_UNIT_WIDTH is not defined before including pcre2.h, a
+ compiler error occurs.
+
+ When using multiple libraries in an application, you must take care
+ when processing any particular pattern to use only functions from a
+ single library. For example, if you want to run a match using a pat-
+ tern that was compiled with pcre2_compile_16(), you must do so with
+ pcre2_match_16(), not pcre2_match_8().
+
+ In the function summaries above, and in the rest of this document and
+ other PCRE2 documents, functions and data types are described using
+ their generic names, without the 8, 16, or 32 suffix.
+
+
+PCRE2 API OVERVIEW
+
+ PCRE2 has its own native API, which is described in this document.
+ There are also some wrapper functions for the 8-bit library that corre-
+ spond to the POSIX regular expression API, but they do not give access
+ to all the functionality. They are described in the pcre2posix documen-
+ tation. Both these APIs define a set of C function calls.
+
+ The native API C data types, function prototypes, option values, and
+ error codes are defined in the header file pcre2.h, which contains def-
+ initions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
+ numbers for the library. Applications can use these to include support
+ for different releases of PCRE2.
+
+ In a Windows environment, if you want to statically link an application
+ program against a non-dll PCRE2 library, you must define PCRE2_STATIC
+ before including pcre2.h.
+
+ The functions pcre2_compile() and pcre2_match() are used for compiling
+ and matching regular expressions in a Perl-compatible manner. A sample
+ program that demonstrates the simplest way of using them is provided in
+ the file called pcre2demo.c in the PCRE2 source distribution. A listing
+ of this program is given in the pcre2demo documentation, and the
+ pcre2sample documentation describes how to compile and run it.
+
+ Just-in-time compiler support is an optional feature of PCRE2 that can
+ be built in appropriate hardware environments. It greatly speeds up the
+ matching performance of many patterns. Programs can request that it be
+ used if available, by calling pcre2_jit_compile() after a pattern has
+ been successfully compiled by pcre2_compile(). This does nothing if JIT
+ support is not available.
+
+ More complicated programs might need to make use of the specialist
+ functions pcre2_jit_stack_alloc(), pcre2_jit_stack_free(), and
+ pcre2_jit_stack_assign() in order to control the JIT code's memory
+ usage.
+
+ JIT matching is automatically used by pcre2_match() if it is available.
+ There is also a direct interface for JIT matching, which gives improved
+ performance. The JIT-specific functions are discussed in the pcre2jit
+ documentation.
+
+ A second matching function, pcre2_dfa_match(), which is not Perl-compat-
+ ible, is also provided. This uses a different algorithm for the match-
+ ing. The alternative algorithm finds all possible matches (at a given
+ point in the subject), and scans the subject just once (unless there
+ are lookbehind assertions). However, this algorithm does not return
+ captured substrings. A description of the two matching algorithms and
+ their advantages and disadvantages is given in the pcre2matching docu-
+ mentation. There is no JIT support for pcre2_dfa_match().
+
+ In addition to the main compiling and matching functions, there are
+ convenience functions for extracting captured substrings from a subject
+ string that is matched by pcre2_match(). They are:
+
+ pcre2_substring_copy_byname()
+ pcre2_substring_copy_bynumber()
+ pcre2_substring_get_byname()
+ pcre2_substring_get_bynumber()
+ pcre2_substring_list_get()
+ pcre2_substring_length_byname()
+ pcre2_substring_length_bynumber()
+ pcre2_substring_nametable_scan()
+ pcre2_substring_number_from_name()
+
+ pcre2_substring_free() and pcre2_substring_list_free() are also pro-
+ vided, to free the memory used for extracted strings.
+
+ There are functions for finding out information about a compiled pat-
+ tern (pcre2_pattern_info()) and about the configuration with which
+ PCRE2 was built (pcre2_config()).
+
+
+NEWLINES
+
+ PCRE2 supports five different conventions for indicating line breaks in
+ strings: a single CR (carriage return) character, a single LF (line-
+ feed) character, the two-character sequence CRLF, any of the three pre-
+ ceding, or any Unicode newline sequence. The Unicode newline sequences
+ are the three just mentioned, plus the single characters VT (vertical
+ tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line
+ separator, U+2028), and PS (paragraph separator, U+2029).
+
+ Each of the first three conventions is used by at least one operating
+ system as its standard newline sequence. When PCRE2 is built, a default
+ can be specified. The default default is LF, which is the Unix stan-
+ dard. When PCRE2 is run, the default can be overridden, either when a
+ pattern is compiled, or when it is matched.
+
+ The newline convention can be changed when calling pcre2_compile(), or
+ it can be specified by special text at the start of the pattern itself;
+ this overrides any other settings. See the pcre2pattern page for
+ details of the special character sequences.
+
+ In the PCRE2 documentation the word "newline" is used to mean "the
+ character or pair of characters that indicate a line break". The choice
+ of newline convention affects the handling of the dot, circumflex, and
+ dollar metacharacters, the handling of #-comments in /x mode, and, when
+ CRLF is a recognized line ending sequence, the match position advance-
+ ment for a non-anchored pattern. There is more detail about this in the
+ section on pcre2_match() options below.
+
+ The choice of newline convention does not affect the interpretation of
+ the \n or \r escape sequences, nor does it affect what \R matches,
+ which has its own separate control.
+
+
+MULTITHREADING
+
+ In a multithreaded application it is important to keep thread-specific
+ data separate from data that can be shared between threads. The PCRE2
+ library code itself is thread-safe: it contains no static or global
+ variables. The API is designed to be fairly simple for non-threaded
+ applications while at the same time ensuring that multithreaded appli-
+ cations can use it.
+
+ There are several different blocks of data that are used to pass infor-
+ mation between the application and the PCRE2 libraries.
+
+ (1) A pointer to the compiled form of a pattern is returned to the user
+ when pcre2_compile() is successful. The data in the compiled pattern is
+ fixed, and does not change when the pattern is matched. Therefore, it
+ is thread-safe, that is, the same compiled pattern can be used by more
+ than one thread simultaneously. An application can compile all its pat-
+ terns at the start, before forking off multiple threads that use them.
+ However, if the just-in-time optimization feature is being used, it
+ needs separate memory stack areas for each thread. See the pcre2jit
+ documentation for more details.
+
+ (2) The next section below introduces the idea of "contexts" in which
+ PCRE2 functions are called. A context is nothing more than a collection
+ of parameters that control the way PCRE2 operates. Grouping a number of
+ parameters together in a context is a convenient way of passing them to
+ a PCRE2 function without using lots of arguments. The parameters that
+ are stored in contexts are in some sense "advanced features" of the
+ API. Many straightforward applications will not need to use contexts.
+
+ In a multithreaded application, if the parameters in a context are val-
+ ues that are never changed, the same context can be used by all the
+ threads. However, if any thread needs to change any value in a context,
+ it must make its own thread-specific copy.
+
+ (3) The matching functions need a block of memory for working space and
+ for storing the results of a match. This includes details of what was
+ matched, as well as additional information such as the name of a
+ (*MARK) setting. Each thread must provide its own version of this mem-
+ ory.
+
+
+PCRE2 CONTEXTS
+
+ Some PCRE2 functions have a lot of parameters, many of which are used
+ only by specialist applications, for example, those that use custom
+ memory management or non-standard character tables. To keep function
+ argument lists at a reasonable size, and at the same time to keep the
+ API extensible, "uncommon" parameters are passed to certain functions
+ in a context instead of directly. A context is just a block of memory
+ that holds the parameter values. Applications that do not need to
+ adjust any of the context parameters can pass NULL when a context
+ pointer is required.
+
+ There are three different types of context: a general context that is
+ relevant for several PCRE2 operations, a compile-time context, and a
+ match-time context.
+
+ The general context
+
+ At present, this context just contains pointers to (and data for)
+ external memory management functions that are called from several
+ places in the PCRE2 library. The context is named `general' rather than
+ specifically `memory' because in future other fields may be added. If
+ you do not want to supply your own custom memory management functions,
+ you do not need to bother with a general context. A general context is
+ created by:
+
+ pcre2_general_context *pcre2_general_context_create(
+ void *(*private_malloc)(PCRE2_SIZE, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+ The two function pointers specify custom memory management functions,
+ whose prototypes are:
+
+ void *private_malloc(PCRE2_SIZE, void *);
+ void private_free(void *, void *);
+
+ Whenever code in PCRE2 calls these functions, the final argument is the
+ value of memory_data. Either of the first two arguments of the creation
+ function may be NULL, in which case the system memory management func-
+ tions malloc() and free() are used. (This is not currently useful, as
+ there are no other fields in a general context, but in future there
+ might be.) The private_malloc() function is used (if supplied) to
+ obtain memory for storing the context, and all three values are saved
+ as part of the context.
+
+ Whenever PCRE2 creates a data block of any kind, the block contains a
+ pointer to the free() function that matches the malloc() function that
+ was used. When the time comes to free the block, this function is
+ called.
+
+ A general context can be copied by calling:
+
+ pcre2_general_context *pcre2_general_context_copy(
+ pcre2_general_context *gcontext);
+
+ The memory used for a general context should be freed by calling:
+
+ void pcre2_general_context_free(pcre2_general_context *gcontext);
+
+
+ The compile context
+
+ A compile context is required if you want to change the default values
+ of any of the following compile-time parameters:
+
+ What \R matches (Unicode newlines or CR, LF, CRLF only);
+ PCRE2's character tables;
+ The newline character sequence;
+ The compile time nested parentheses limit;
+ An external function for stack checking.
+
+ A compile context is also required if you are using custom memory man-
+ agement. If none of these apply, just pass NULL as the context argu-
+ ment of pcre2_compile().
+
+ A compile context is created, copied, and freed by the following func-
+ tions:
+
+ pcre2_compile_context *pcre2_compile_context_create(
+ pcre2_general_context *gcontext);
+
+ pcre2_compile_context *pcre2_compile_context_copy(
+ pcre2_compile_context *ccontext);
+
+ void pcre2_compile_context_free(pcre2_compile_context *ccontext);
+
+ A compile context is created with default values for its parameters.
+ These can be changed by calling the following functions, which return 0
+ on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
+
+ int pcre2_set_bsr_compile(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+ The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only
+ CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any
+ Unicode line ending sequence. The value of this parameter does not
+ affect what is compiled; it is just saved with the compiled pattern.
+ The value is used by the JIT compiler and by the two interpreted match-
+ ing functions, pcre2_match() and pcre2_dfa_match(). You can change the
+ value when calling these functions, but doing so disables the use of
+ JIT.
+
+ int pcre2_set_character_tables(pcre2_compile_context *ccontext,
+ const unsigned char *tables);
+
+ The value must be the result of a call to pcre2_maketables(), whose
+ only argument is a general context. This function builds a set of char-
+ acter tables in the current locale.
+
+ int pcre2_set_newline_compile(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+ This specifies which characters or character sequences are to be recog-
+ nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
+ return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
+ two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
+ of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence).
+
+ When a pattern is compiled with the PCRE2_EXTENDED option, the value of
+ this parameter affects the recognition of white space and the end of
+ internal comments starting with #. The value is saved with the compiled
+ pattern for subsequent use by the JIT compiler and by the two inter-
+ preted matching functions, pcre2_match() and pcre2_dfa_match(). You can
+ change the value when calling these functions, but doing so disables
+ the use of JIT.
+
+ int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+ This parameter adjusts the limit, set when PCRE2 is built (default 250),
+ on the depth of parenthesis nesting in a pattern. This limit stops
+ rogue patterns using up too much system stack when being compiled.
+
+ int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
+ int (*guard_function)(uint32_t));
+
+ There is at least one application that runs PCRE2 in threads with very
+ limited system stack, where running out of stack is to be avoided at
+ all costs. The parenthesis limit above cannot take account of how much
+ stack is actually available. For a finer control, you can supply a
+ function that is called whenever pcre2_compile() starts to compile a
+ parenthesized part of a pattern. The argument to the function gives the
+ current depth of nesting. The function should return zero if all is
+ well, or non-zero to force an error.
+
+ The match context
+
+ A match context is required if you want to change the default values of
+ any of the following match-time parameters:
+
+ What \R matches (Unicode newlines or CR, LF, CRLF only);
+ A callout function;
+ The limit for calling match();
+ The limit for calling match() recursively;
+ The newline character sequence;
+
+ A match context is also required if you are using custom memory manage-
+ ment. If none of these apply, just pass NULL as the context argument
+ of pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match(). Changing
+ the newline value or what \R matches at match time disables the use of
+ JIT via pcre2_match().
+
+ A match context is created, copied, and freed by the following func-
+ tions:
+
+ pcre2_match_context *pcre2_match_context_create(
+ pcre2_general_context *gcontext);
+
+ pcre2_match_context *pcre2_match_context_copy(
+ pcre2_match_context *mcontext);
+
+ void pcre2_match_context_free(pcre2_match_context *mcontext);
+
+ A match context is created with default values for its parameters.
+ These can be changed by calling the following functions, which return 0
+ on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
+
+ int pcre2_set_bsr_match(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only
+ CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any
+ Unicode line ending sequence. If you want to make use of JIT matching,
+ you should not use this function, but instead set the value in a com-
+ pile context.
+
+ int pcre2_set_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_callout_block *),
+ void *callout_data);
+
+ This sets up a "callout" function, which PCRE2 will call at specified
+ points during a matching operation. Details are given in the pcre2call-
+ out documentation.
+
+ int pcre2_set_match_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ The match_limit parameter provides a means of preventing PCRE2 from
+ using up too many resources when processing patterns that are not going
+ to match, but which have a very large number of possibilities in their
+ search trees. The classic example is a pattern that uses nested unlim-
+ ited repeats.
+
+ Internally, pcre2_match() uses a function called match(), which it
+ calls repeatedly (sometimes recursively). The limit set by match_limit
+ is imposed on the number of times this function is called during a
+ match, which has the effect of limiting the amount of backtracking that
+ can take place. For patterns that are not anchored, the count restarts
+ from zero for each position in the subject string. This limit is not
+ relevant to pcre2_dfa_match(), which ignores it.
+
+ When pcre2_match() is called with a pattern that was successfully pro-
+ cessed by pcre2_jit_compile(), the way that the matching is executed is
+ entirely different. However, there is still the possibility of runaway
+ matching that goes on for a very long time, and so the match_limit
+ value is also used in this case (but in a different way) to limit how
+ long the matching can continue.
+
+ The default value for the limit can be set when PCRE2 is built; the
+ default default is 10 million, which handles all but the most extreme
+ cases. If the limit is exceeded, pcre2_match() returns
+ PCRE2_ERROR_MATCHLIMIT. A value for the match limit may also be sup-
+ plied by an item at the start of a pattern of the form
+
+ (*LIMIT_MATCH=ddd)
+
+ where ddd is a decimal number. However, such a setting is ignored
+ unless ddd is less than the limit set by the caller of pcre2_match()
+ or, if no such limit is set, less than the default.
+
+ int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ The recursion_limit parameter is similar to match_limit, but instead of
+ limiting the total number of times that match() is called, it limits
+ the depth of recursion. The recursion depth is a smaller number than
+ the total number of calls, because not all calls to match() are recur-
+ sive. This limit is of use only if it is set smaller than match_limit.
+
+ Limiting the recursion depth limits the amount of system stack that can
+ be used, or, when PCRE2 has been compiled to use memory on the heap
+ instead of the stack, the amount of heap memory that can be used. This
+ limit is not relevant, and is ignored, when matching is done using JIT
+ compiled code or by the pcre2_dfa_match() function.
+
+ The default value for recursion_limit can be set when PCRE2 is built;
+ the default default is the same value as the default for match_limit.
+ If the limit is exceeded, pcre2_match() returns PCRE2_ERROR_RECURSION-
+ LIMIT. A value for the recursion limit may also be supplied by an item
+ at the start of a pattern of the form
+
+ (*LIMIT_RECURSION=ddd)
+
+ where ddd is a decimal number. However, such a setting is ignored
+ unless ddd is less than the limit set by the caller of pcre2_match()
+ or, if no such limit is set, less than the default.
+
+ int pcre2_set_newline_match(pcre2_match_context *mcontext,
+ uint32_t value);
+
+ This specifies which characters or character sequences are to be recog-
+ nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
+ return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
+ two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
+ of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence). If
+ you want to make use of JIT matching, you should not use this function,
+ but instead set the value in a compile context.
+
+ int pcre2_set_recursion_memory_management(
+ pcre2_match_context *mcontext,
+ void *(*private_malloc)(PCRE2_SIZE, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+ This function sets up two additional custom memory management functions
+ for use by pcre2_match() when PCRE2 is compiled to use the heap for
+ remembering backtracking data, instead of recursive function calls that
+ use the system stack. There is a discussion about PCRE2's stack usage
+ in the pcre2stack documentation. See the pcre2build documentation for
+ details of how to build PCRE2. Using the heap for recursion is a non-
+ standard way of building PCRE2, for use in environments that have lim-
+ ited stacks. Because of the greater use of memory management,
+ pcre2_match() runs more slowly. Functions that are different to the
+ general custom memory functions are provided so that special-purpose
+ external code can be used for this case, because the memory blocks are
+ all the same size. The blocks are retained by pcre2_match() until it is
+ about to exit so that they can be re-used when possible during the
+ match. In the absence of these functions, the normal custom memory man-
+ agement functions are used, if supplied, otherwise the system func-
+ tions.
+
+
+CHECKING BUILD-TIME OPTIONS
+
+ int pcre2_config(uint32_t what, void *where, PCRE2_SIZE length);
+
+ The function pcre2_config() makes it possible for a PCRE2 client to
+ discover which optional features have been compiled into the PCRE2
+ library. The pcre2build documentation has more details about these
+ optional features.
+
+ The first argument for pcre2_config() specifies which information is
+ required. The second argument is a pointer to memory into which the
+ information is placed, with the final argument giving the length of
+ this memory in bytes. For calls that return numerical values, where
+ should point to appropriately aligned memory, with length set to at
+ least the "sizeof" the data type.
+
+ The returned value from pcre2_config() is zero on success, or the nega-
+ tive error code PCRE2_ERROR_BADOPTION if the value in the first argu-
+ ment is not recognized. The following information is available:
+
+ PCRE2_CONFIG_BSR
+
+ The output is an integer whose value indicates what character sequences
+ the \R escape sequence matches by default. A value of 0 means that \R
+ matches any Unicode line ending sequence; a value of 1 means that \R
+ matches only CR, LF, or CRLF. The default can be overridden when a pat-
+ tern is compiled or matched.
+
+ PCRE2_CONFIG_JIT
+
+ The output is an integer that is set to one if support for just-in-time
+ compiling is available; otherwise it is set to zero.
+
+ PCRE2_CONFIG_JITTARGET
+
+ FIXME: this needs sorting out once JIT is implemented. If JIT support
+ is available, the string contains the name of the architecture for
+ which the JIT compiler is configured, for example "x86 32bit (little
+ endian + unaligned)". If JIT support is not available, FIXME.
+
+ PCRE2_CONFIG_LINKSIZE
+
+ The output is an integer that contains the number of bytes used for
+ internal linkage in compiled regular expressions. When PCRE2 is config-
+ ured, the value can be set to 2, 3, or 4, with the default being 2.
+ This is the value that is returned by pcre2_config(). However, when the
+ 16-bit library is compiled, a value of 3 is rounded up to 4, and when
+ the 32-bit library is compiled, internal linkages always use 4 bytes,
+ so the configured value is not relevant.
+
+ The default value of 2 for the 8-bit and 16-bit libraries is sufficient
+ for all but the most massive patterns, since it allows the size of the
+ compiled pattern to be up to 64K code units. Larger values allow larger
+ regular expressions to be compiled by those two libraries, but at the
+ expense of slower matching.
+
+ PCRE2_CONFIG_MATCHLIMIT
+
+ The output is an unsigned long integer that gives the default limit for
+ the number of internal matching function calls in a pcre2_match() exe-
+ cution. Further details are given with pcre2_match() below.
+
+ PCRE2_CONFIG_NEWLINE
+
+ The output is an integer whose value specifies the default character
+ sequence that is recognized as meaning "newline". The values are:
+
+ 1 Carriage return (CR)
+ 2 Linefeed (LF)
+ 3 Carriage return, linefeed (CRLF)
+ 4 Any Unicode line ending
+ 5 Any of CR, LF, or CRLF
+
+ The default should normally correspond to the standard sequence for
+ your operating system.
+
+ PCRE2_CONFIG_PARENSLIMIT
+
+ The output is an unsigned long integer that gives the maximum depth of
+ nesting of parentheses (of any kind) in a pattern. This limit is
+ imposed to cap the amount of system stack used when a pattern is com-
+ piled. It is specified when PCRE2 is built; the default is 250. This
+ limit does not take into account the stack that may already be used by
+ the calling application. For finer control over compilation stack
+ usage, see pcre2_set_compile_recursion_guard().
+
+ PCRE2_CONFIG_RECURSIONLIMIT
+
+ The output is an unsigned long integer that gives the default limit for
+ the depth of recursion when calling the internal matching function in a
+ pcre2_match() execution. Further details are given with pcre2_match()
+ below.
+
+ PCRE2_CONFIG_STACKRECURSE
+
+ The output is an integer that is set to one if internal recursion when
+ running pcre2_match() is implemented by recursive function calls that
+ use the system stack to remember their state. This is the usual way
+ that PCRE2 is compiled. The output is zero if PCRE2 was compiled to use
+ blocks of data on the heap instead of recursive function calls.
+
+ PCRE2_CONFIG_UNICODE_VERSION
+
+ The where argument should point to a buffer that is at least 24 code
+ units long. If PCRE2 has been compiled without Unicode support, this is
+ filled with the text "Unicode not supported". Otherwise, the Unicode
+ version string (for example, "7.0.0") is returned. The string is zero-
+ terminated.
+
+ PCRE2_CONFIG_UNICODE
+
+ The output is an integer that is set to one if Unicode support is
+ available; otherwise it is set to zero. Unicode support implies UTF
+ support.
+
+ PCRE2_CONFIG_VERSION
+
+ The where argument should point to a buffer that is at least 12 code
+ units long. It is filled with the PCRE2 version string, zero-termi-
+ nated.
+
+
+COMPILING A PATTERN
+
+ pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
+ uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
+ pcre2_compile_context *ccontext);
+
+ void pcre2_code_free(pcre2_code *code);
+
+ This function compiles a pattern, defined by a pointer to a string of
+ code units and a length, into an internal form. If the pattern is zero-
+ terminated, the length should be specified as PCRE2_ZERO_TERMINATED.
+ The function returns a pointer to a block of memory that contains the
+ compiled pattern and related data. The caller must free the memory by
+ calling pcre2_code_free() when it is no longer needed.
+
+ If the compile context argument ccontext is NULL, the memory is
+ obtained by calling malloc(). Otherwise, it is obtained from the same
+ memory function that was used for the compile context.
+
+ The options argument contains various bit settings that affect the com-
+ pilation. It should be zero if no options are required. The available
+ options are described below. Some of them (in particular, those that
+ are compatible with Perl, but some others as well) can also be set and
+ unset from within the pattern (see the detailed description in the
+ pcre2pattern documentation).
+
+ For those options that can be different in different parts of the pat-
+ tern, the contents of the options argument specifies their settings at
+ the start of compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and
+ PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as
+ well as at compile time.
+
+ Other, less frequently required compile-time parameters (for example,
+ the newline setting) can be provided in a compile context (as described
+ above).
+
+ If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme-
+ diately. Otherwise, if compilation of a pattern fails, pcre2_compile()
+ returns NULL, having set these variables to an error code and an offset
+ (number of code units) within the pattern, respectively. The
+ pcre2_get_error_message() function provides a textual message for each
+ error code. Compilation errors are positive numbers, but UTF formatting
+ errors are negative numbers. For an invalid UTF-8 or UTF-16 string, the
+ offset is that of the first code unit of the failing character.
+
+ Some errors are not detected until the whole pattern has been scanned;
+ in these cases, the offset passed back is the length of the pattern.
+ Note that the offset is in code units, not characters, even in a UTF
+ mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char-
+ acter.
+
+ This code fragment shows a typical straightforward call to pcre2_com-
+ pile():
+
+ pcre2_code *re;
+ PCRE2_SIZE erroffset;
+ int errorcode;
+ re = pcre2_compile(
+ "^A.*Z", /* the pattern */
+ PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */
+ 0, /* default options */
+ &errorcode, /* for error code */
+ &erroffset, /* for error offset */
+ NULL); /* no compile context */
+
+ The following names for option bits are defined in the pcre2.h header
+ file:
+
+ PCRE2_ANCHORED
+
+ If this bit is set, the pattern is forced to be "anchored", that is, it
+ is constrained to match only at the first matching point in the string
+ that is being searched (the "subject string"). This effect can also be
+ achieved by appropriate constructs in the pattern itself, which is the
+ only way to do it in Perl.
+
+ PCRE2_ALLOW_EMPTY_CLASS
+
+ By default, for compatibility with Perl, a closing square bracket that
+ immediately follows an opening one is treated as a data character for
+ the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the
+ class, which therefore contains no characters and so can never match.
+
+ PCRE2_ALT_BSUX
+
+ This option requests alternative handling of three escape sequences,
+ which makes PCRE2's behaviour more like ECMAscript (aka JavaScript).
+ When it is set:
+
+ (1) \U matches an upper case "U" character; by default \U causes a com-
+ pile time error (Perl uses \U to upper case subsequent characters).
+
+ (2) \u matches a lower case "u" character unless it is followed by four
+ hexadecimal digits, in which case the hexadecimal number defines the
+ code point to match. By default, \u causes a compile time error (Perl
+ uses it to upper case the following character).
+
+ (3) \x matches a lower case "x" character unless it is followed by two
+ hexadecimal digits, in which case the hexadecimal number defines the
+ code point to match. By default, as in Perl, a hexadecimal number is
+ always expected after \x, but it may have zero, one, or two digits (so,
+ for example, \xz matches a binary zero character followed by z).
+
+ PCRE2_AUTO_CALLOUT
+
+ If this bit is set, pcre2_compile() automatically inserts callout
+ items, all with number 255, before each pattern item. For discussion of
+ the callout facility, see the pcre2callout documentation.
+
+ PCRE2_CASELESS
+
+ If this bit is set, letters in the pattern match both upper and lower
+ case letters in the subject. It is equivalent to Perl's /i option, and
+ it can be changed within a pattern by a (?i) option setting.
+
+ PCRE2_DOLLAR_ENDONLY
+
+ If this bit is set, a dollar metacharacter in the pattern matches only
+ at the end of the subject string. Without this option, a dollar also
+ matches immediately before a newline at the end of the string (but not
+ before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored
+ if PCRE2_MULTILINE is set. There is no equivalent to this option in
+ Perl, and no way to set it within a pattern.
+
+ PCRE2_DOTALL
+
+ If this bit is set, a dot metacharacter in the pattern matches any
+ character, including one that indicates a newline. However, it only
+ ever matches one character, even if newlines are coded as CRLF. Without
+ this option, a dot does not match when the current position in the sub-
+ ject is at a newline. This option is equivalent to Perl's /s option,
+ and it can be changed within a pattern by a (?s) option setting. A neg-
+ ative class such as [^a] always matches newline characters, independent
+ of the setting of this option.
+
+ PCRE2_DUPNAMES
+
+ If this bit is set, names used to identify capturing subpatterns need
+ not be unique. This can be helpful for certain types of pattern when it
+ is known that only one instance of the named subpattern can ever be
+ matched. There are more details of named subpatterns below; see also
+ the pcre2pattern documentation.
+
+ PCRE2_EXTENDED
+
+ If this bit is set, most white space characters in the pattern are
+ totally ignored except when escaped or inside a character class. How-
+ ever, white space is not allowed within sequences such as (?> that
+ introduce various parenthesized subpatterns, nor within numerical quan-
+ tifiers such as {1,3}. Ignorable white space is permitted between an
+ item and a following quantifier and between a quantifier and a follow-
+ ing + that indicates possessiveness.
+
+ PCRE2_EXTENDED also causes characters between an unescaped # outside a
+ character class and the next newline, inclusive, to be ignored, which
+ makes it possible to include comments inside complicated patterns. Note
+ that the end of this type of comment is a literal newline sequence in
+ the pattern; escape sequences that happen to represent a newline do not
+ count. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be
+ changed within a pattern by a (?x) option setting.
+
+ Which characters are interpreted as newlines can be specified by a set-
+ ting in the compile context that is passed to pcre2_compile() or by a
+ special sequence at the start of the pattern, as described in the sec-
+ tion entitled "Newline conventions" in the pcre2pattern documentation.
+ A default is defined when PCRE2 is built.
+
+ PCRE2_FIRSTLINE
+
+ If this option is set, an unanchored pattern is required to match
+ before or at the first newline in the subject string, though the
+ matched text may continue over the newline.
+
+ PCRE2_MATCH_UNSET_BACKREF
+
+ If this option is set, a back reference to an unset subpattern group
+ matches an empty string (by default this causes the current matching
+ alternative to fail). A pattern such as (\1)(a) succeeds when this
+ option is set (assuming it can find an "a" in the subject), whereas it
+ fails by default, for Perl compatibility. Setting this option makes
+ PCRE2 behave more like ECMAscript (aka JavaScript).
+
+ PCRE2_MULTILINE
+
+ By default, for the purposes of matching "start of line" and "end of
+ line", PCRE2 treats the subject string as consisting of a single line
+ of characters, even if it actually contains newlines. The "start of
+ line" metacharacter (^) matches only at the start of the string, and
+ the "end of line" metacharacter ($) matches only at the end of the
+ string, or before a terminating newline (except when PCRE2_DOL-
+ LAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set,
+ the "any character" metacharacter (.) does not match at a newline. This
+ behaviour (for ^, $, and dot) is the same as Perl.
+
+ When PCRE2_MULTILINE is set, the "start of line" and "end of line"
+ constructs match immediately following or immediately before internal
+ newlines in the subject string, respectively, as well as at the very
+ start and end. This is equivalent to Perl's /m option, and it can be
+ changed within a pattern by a (?m) option setting. If there are no new-
+ lines in a subject string, or no occurrences of ^ or $ in a pattern,
+ setting PCRE2_MULTILINE has no effect.
+
+ PCRE2_NEVER_UCP
+
+ This option locks out the use of Unicode properties for handling \B,
+ \b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
+ described for the PCRE2_UCP option below. In particular, it prevents
+ the creator of the pattern from enabling this facility by starting the
+ pattern with (*UCP). This may be useful in applications that process
+ patterns from external sources. The option combination PCRE2_UCP and
+ PCRE2_NEVER_UCP causes an error.
+
+ PCRE2_NEVER_UTF
+
+ This option locks out interpretation of the pattern as UTF-8, UTF-16,
+ or UTF-32, depending on which library is in use. In particular, it pre-
+ vents the creator of the pattern from switching to UTF interpretation
+ by starting the pattern with (*UTF). This may be useful in applications
+ that process patterns from external sources. The combination of
+ PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
+
+ PCRE2_NO_AUTO_CAPTURE
+
+ If this option is set, it disables the use of numbered capturing paren-
+ theses in the pattern. Any opening parenthesis that is not followed by
+ ? behaves as if it were followed by ?: but named parentheses can still
+ be used for capturing (and they acquire numbers in the usual way).
+ There is no equivalent of this option in Perl.
+
+ PCRE2_NO_AUTO_POSSESS
+
+ If this option is set, it disables "auto-possessification", which is an
+ optimization that, for example, turns a+b into a++b in order to avoid
+ backtracks into a+ that can never be successful. However, if callouts
+ are in use, auto-possessification means that some callouts are never
+ taken. You can set this option if you want the matching functions to do
+ a full unoptimized search and run all the callouts, but it is mainly
+ provided for testing purposes.
+
+ PCRE2_NO_START_OPTIMIZE
+
+ This is an option that acts at matching time; that is, it is really an
+ option for pcre2_match() or pcre2_dfa_match(). If it is set at compile
+ time, it is remembered with the compiled pattern and assumed at match-
+ ing time. This is necessary if you want to use JIT execution, because
+ the JIT compiler needs to know whether or not this option is set. For
+ details, see the discussion of PCRE2_NO_START_OPTIMIZE in the section
+ on pcre2_match() options below.
+
+ PCRE2_NO_UTF_CHECK
+
+ When PCRE2_UTF is set, the validity of the pattern as a UTF string is
+ automatically checked. There are discussions about the validity of
+ UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode
+ document. If an invalid UTF sequence is found, pcre2_compile() returns
+ a negative error code.
+
+ If you know that your pattern is valid, and you want to skip this check
+ for performance reasons, you can set the PCRE2_NO_UTF_CHECK option.
+ When it is set, the effect of passing an invalid UTF string as a pat-
+ tern is undefined. It may cause your program to crash or loop. Note
+ that this option can also be passed to pcre2_match() and
+ pcre2_dfa_match(), to suppress validity checking of the subject string.
+
+ PCRE2_UCP
+
+ This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
+ \w, and some of the POSIX character classes. By default, only ASCII
+ characters are recognized, but if PCRE2_UCP is set, Unicode properties
+ are used instead to classify characters. More details are given in the
+ section on generic character types in the pcre2pattern page. If you set
+ PCRE2_UCP, matching one of the items it affects takes much longer. The
+ option is available only if PCRE2 has been compiled with UTF support.
+
+ PCRE2_UNGREEDY
+
+ This option inverts the "greediness" of the quantifiers so that they
+ are not greedy by default, but become greedy if followed by "?". It is
+ not compatible with Perl. It can also be set by a (?U) option setting
+ within the pattern.
+
+ PCRE2_UTF
+
+ This option causes PCRE2 to regard both the pattern and the subject
+ strings that are subsequently processed as strings of UTF characters
+ instead of single-code-unit strings. However, it is available only when
+ PCRE2 is built to include UTF support. If not, the use of this option
+ provokes an error. Details of how this option changes the behaviour of
+ PCRE2 are given in the pcre2unicode page.
+
+
+COMPILATION ERROR CODES
+
+ There are over 80 positive error codes that pcre2_compile() may return
+ if it finds an error in the pattern. There are also some negative error
+ codes that are used for invalid UTF strings. These are the same as
+ given by pcre2_match() and pcre2_dfa_match(), and are described in the
+ pcre2unicode page. The pcre2_get_error_message() function can be called
+ to obtain a textual error message from any error code.
+
+
+JUST-IN-TIME (JIT) COMPILATION
+
+ int pcre2_jit_compile(pcre2_code *code, uint32_t options);
+
+ int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext, pcre2_jit_stack *jit_stack);
+
+ void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
+
+ pcre2_jit_stack *pcre2_jit_stack_alloc(pcre2_general_context *gcontext,
+ PCRE2_SIZE startsize, PCRE2_SIZE maxsize);
+
+ void pcre2_jit_stack_assign(const pcre2_code *code,
+ pcre2_jit_callback callback_function, void *callback_data);
+
+ void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
+
+ These functions provide support for JIT compilation, which, if the
+ just-in-time compiler is available, further processes a compiled pat-
+ tern into machine code that executes much faster than the pcre2_match()
+ interpretive matching function. Full details are given in the pcre2jit
+ documentation.
+
+ JIT compilation is a heavyweight optimization. It can take some time
+ for patterns to be analyzed, and for one-off matches and simple pat-
+ terns the benefit of faster execution might be offset by a much slower
+ compilation time. Most, but not all patterns can be optimized by the
+ JIT compiler.
+
+
+LOCALE SUPPORT
+
+ PCRE2 handles caseless matching, and determines whether characters are
+ letters, digits, or whatever, by reference to a set of tables, indexed
+ by character code point. When running in UTF-8 mode, or using the
+ 16-bit or 32-bit libraries, this applies only to characters with code
+ points less than 256. By default, higher-valued code points never match
+ escapes such as \w or \d. However, if PCRE2 is built with UTF support,
+ all characters can be tested with \p and \P, or, alternatively, the
+ PCRE2_UCP option can be set when a pattern is compiled; this causes \w
+ and friends to use Unicode property support instead of the built-in
+ tables.
+
+ The use of locales with Unicode is discouraged. If you are handling
+ characters with code points greater than 128, you should either use
+ Unicode support, or use locales, but not try to mix the two.
+
+ PCRE2 contains an internal set of character tables that are used by
+ default. These are sufficient for many applications. Normally, the
+ internal tables recognize only ASCII characters. However, when PCRE2 is
+ built, it is possible to cause the internal tables to be rebuilt in the
+ default "C" locale of the local system, which may cause them to be dif-
+ ferent.
+
+ The internal tables can be overridden by tables supplied by the appli-
+ cation that calls PCRE2. These may be created in a different locale
+ from the default. As more and more applications change to using Uni-
+ code, the need for this locale support is expected to die away.
+
+ External tables are built by calling the pcre2_maketables() function,
+ in the relevant locale. The result can be passed to pcre2_compile() as
+ often as necessary, by creating a compile context and calling
+ pcre2_set_character_tables() to set the tables pointer therein. For
+ example, to build and use tables that are appropriate for the French
+ locale (where accented characters with values greater than 128 are
+ treated as letters), the following code could be used:
+
+ setlocale(LC_CTYPE, "fr_FR");
+ tables = pcre2_maketables(NULL);
+ ccontext = pcre2_compile_context_create(NULL);
+ pcre2_set_character_tables(ccontext, tables);
+ re = pcre2_compile(..., ccontext);
+
+ The locale name "fr_FR" is used on Linux and other Unix-like systems;
+ if you are using Windows, the name for the French locale is "french".
+ It is the caller's responsibility to ensure that the memory containing
+ the tables remains available for as long as it is needed.
+
+ The pointer that is passed (via the compile context) to pcre2_compile()
+ is saved with the compiled pattern, and the same tables are used by
+ pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com-
+ pilation and matching both happen in the same locale, but different
+ patterns can be processed in different locales.
+
+
+INFORMATION ABOUT A COMPILED PATTERN
+
+ int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);
+
+ The pcre2_pattern_info() function returns information about a compiled
+ pattern. The first argument is a pointer to the compiled pattern. The
+ second argument specifies which piece of information is required, and
+ the third argument is a pointer to a variable to receive the data. The
+ yield of the function is zero for success, or one of the following neg-
+ ative numbers:
+
+ PCRE2_ERROR_NULL the argument code was NULL
+ the argument where was NULL
+ PCRE2_ERROR_BADMAGIC the "magic number" was not found
+ PCRE2_ERROR_BADOPTION the value of what was invalid
+ PCRE2_ERROR_UNSET the requested field is not set
+
+ The "magic number" is placed at the start of each compiled pattern as
+ a simple check against passing an arbitrary memory pointer. Here is a
+ typical call of pcre2_pattern_info(), to obtain the length of the com-
+ piled pattern:
+
+ int rc;
+ size_t length;
+ rc = pcre2_pattern_info(
+ re, /* result of pcre2_compile() */
+ PCRE2_INFO_SIZE, /* what is required */
+ &length); /* where to put the data */
+
+ The possible values for the second argument are defined in pcre2.h, and
+ are as follows:
+
+ PCRE2_INFO_ALLOPTIONS
+ PCRE2_INFO_ARGOPTIONS
+
+ Return a copy of the pattern's options. The third argument should point
+ to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the
+ options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP-
+ TIONS returns the compile options as modified by any top-level option
+ settings at the start of the pattern itself. In other words, they are
+ the options that will be in force when matching starts. For example, if
+ the pattern /(?im)abc(?-i)d/ is compiled with the PCRE2_EXTENDED
+ option, the result is PCRE2_CASELESS, PCRE2_MULTILINE, and
+ PCRE2_EXTENDED.
+
+ A pattern is automatically anchored by PCRE2 if all of its top-level
+ alternatives begin with one of the following:
+
+ ^ unless PCRE2_MULTILINE is set
+ \A always
+ \G always
+ .* if PCRE2_DOTALL is set and there are no back
+ references to the subpattern in which .* appears
+
+ For such patterns, the PCRE2_ANCHORED bit is set in the options
+ returned for PCRE2_INFO_ALLOPTIONS.
+
+ PCRE2_INFO_BACKREFMAX
+
+ Return the number of the highest back reference in the pattern. The
+ third argument should point to a uint32_t variable. Zero is returned
+ if there are no back references.
+
+ PCRE2_INFO_BSR
+
+ The output is a uint32_t whose value indicates what character sequences
+ the \R escape sequence matches by default. A value of 0 means that \R
+ matches any Unicode line ending sequence; a value of 1 means that \R
+ matches only CR, LF, or CRLF. The default can be overridden when a pat-
+ tern is matched.
+
+ PCRE2_INFO_CAPTURECOUNT
+
+ Return the number of capturing subpatterns in the pattern. The third
+ argument should point to a uint32_t variable.
+
+ PCRE2_INFO_FIRSTCODETYPE
+
+ Return information about the first code unit of any matched string, for
+ a non-anchored pattern. The third argument should point to a uint32_t
+ variable.
+
+ If there is a fixed first value, for example, the letter "c" from a
+ pattern such as (cat|cow|coyote), 1 is returned, and the character
+ value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no
+ fixed first value, and if either
+
+ (a) the pattern was compiled with the PCRE2_MULTILINE option, and every
+ branch starts with "^", or
+
+ (b) every branch of the pattern starts with ".*" and PCRE2_DOTALL is
+ not set (if it were set, the pattern would be anchored),
+
+ 2 is returned, indicating that the pattern matches only at the start of
+ a subject string or after any newline within the string. Otherwise 0 is
+ returned. For anchored patterns, 0 is returned.
+
+ PCRE2_INFO_FIRSTCODEUNIT
+
+ Return the value of the first code unit of any matched string in the
+ situation where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
+ The third argument should point to a uint32_t variable. In the 8-bit
+ library, the value is always less than 256. In the 16-bit library the
+ value can be up to 0xffff. In the 32-bit library in UTF-32 mode the
+ value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
+ mode.
+
+ PCRE2_INFO_FIRSTBITMAP
+
+ In the absence of a single first code unit for a non-anchored pattern,
+ pcre2_compile() may construct a 256-bit table that defines a fixed set
+ of values for the first code unit in any match. For example, a pattern
+ that starts with [abc] results in a table with three bits set. When
+ code unit values greater than 255 are supported, the flag bit for 255
+ means "any code unit of value 255 or above". If such a table was con-
+ structed, a pointer to it is returned. Otherwise NULL is returned. The
+ third argument should point to a const uint8_t * variable.
+
+ PCRE2_INFO_HASCRORLF
+
+ Return 1 if the pattern contains any explicit matches for CR or LF
+ characters, otherwise 0. The third argument should point to a uint32_t
+ variable. An explicit match is either a literal CR or LF character, or
+ \r or \n.
+
+ PCRE2_INFO_JCHANGED
+
+ Return 1 if the (?J) or (?-J) option setting is used in the pattern,
+ otherwise 0. The third argument should point to a uint32_t variable.
+ (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respec-
+ tively.
+
+ PCRE2_INFO_JITSIZE
+
+ If the compiled pattern was successfully processed by pcre2_jit_com-
+ pile(), return the size of the JIT compiled code, otherwise return
+ zero. The third argument should point to a size_t variable.
+
+ PCRE2_INFO_LASTCODETYPE
+
+ Return 1 if there is a rightmost literal code unit that must exist in
+ any matched string, other than at its start. The third argument should
+ point to a uint32_t variable. If there is no such value, 0 is
+ returned. When 1 is returned, the code unit value itself can be
+ retrieved using PCRE2_INFO_LASTCODEUNIT.
+
+ For anchored patterns, a last literal value is recorded only if it fol-
+ lows something of variable length. For example, for the pattern
+ /^a\d+z\d+/ the returned value is 1 (with "z" returned from
+ PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
+
+ PCRE2_INFO_LASTCODEUNIT
+
+ Return the value of the rightmost literal code unit that must exist in
+ any matched string, other than at its start, if such a value has been
+ recorded. The third argument should point to a uint32_t variable. If
+ there is no such value, 0 is returned.
+
+ PCRE2_INFO_MATCHEMPTY
+
+ Return 1 if the pattern can match an empty string, otherwise 0. The
+ third argument should point to a uint32_t variable.
+
+ PCRE2_INFO_MATCHLIMIT
+
+ If the pattern set a match limit by including an item of the form
+ (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third
+ argument should point to an unsigned 32-bit integer. If no such value
+ has been set, the call to pcre2_pattern_info() returns the error
+ PCRE2_ERROR_UNSET.
+
+ PCRE2_INFO_MAXLOOKBEHIND
+
+ Return the number of characters (not code units) in the longest lookbe-
+ hind assertion in the pattern. The third argument should point to an
+ unsigned 32-bit integer. This information is useful when doing multi-
+ segment matching using the partial matching facilities. Note that the
+ simple assertions \b and \B require a one-character lookbehind. \A also
+ registers a one-character lookbehind, though it does not actually
+ inspect the previous character. This is to ensure that at least one
+ character from the old segment is retained when a new segment is pro-
+ cessed. Otherwise, if there are no lookbehinds in the pattern, \A might
+ match incorrectly at the start of a new segment.
+
+ PCRE2_INFO_MINLENGTH
+
+ If a minimum length for matching subject strings was computed, its
+ value is returned. Otherwise the returned value is 0. The value is a
+ number of characters, which in UTF mode may be different from the num-
+ ber of code units. The third argument should point to a uint32_t
+ variable. The value is a lower bound to the length of any matching
+ string. There may not be any strings of that length that do actually
+ match, but every string that does match is at least that long.
+
+ PCRE2_INFO_NAMECOUNT
+ PCRE2_INFO_NAMEENTRYSIZE
+ PCRE2_INFO_NAMETABLE
+
+ PCRE2 supports the use of named as well as numbered capturing parenthe-
+ ses. The names are just an additional way of identifying the parenthe-
+ ses, which still acquire numbers. Several convenience functions such as
+ pcre2_substring_get_byname() are provided for extracting captured sub-
+ strings by name. It is also possible to extract the data directly, by
+ first converting the name to a number in order to access the correct
+ pointers in the output vector (described with pcre2_match() below). To
+ do the conversion, you need to use the name-to-number map, which is
+ described by these three values.
+
+ The map consists of a number of fixed-size entries. PCRE2_INFO_NAME-
+ COUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
+ the size of each entry; both of these return a uint32_t value. The
+ entry size depends on the length of the longest name.
+ PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
+ This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit
+ library, the first two bytes of each entry are the number of the cap-
+ turing parenthesis, most significant byte first. In the 16-bit library,
+ the pointer points to 16-bit data units, the first of which contains
+ the parenthesis number. In the 32-bit library, the pointer points to
+ 32-bit data units, the first of which contains the parenthesis number.
+ The rest of the entry is the corresponding name, zero terminated.
+
+ The names are in alphabetical order. If (?| is used to create multiple
+ groups with the same number, as described in the section on duplicate
+ subpattern numbers in the pcre2pattern page, the groups may be given
+ the same name, but there is only one entry in the table. Different
+ names for groups of the same number are not permitted.
+
+ Duplicate names for subpatterns with different numbers are permitted,
+ but only if PCRE2_DUPNAMES is set. They appear in the table in the
+ order in which they were found in the pattern. In the absence of (?|
+ this is the order of increasing number; when (?| is used this is not
+ necessarily the case because later subpatterns may have lower numbers.
+
+ As a simple example of the name/number table, consider the following
+ pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED
+ is set, so white space - including newlines - is ignored):
+
+ (?<date> (?<year>(\d\d)?\d\d) -
+ (?<month>\d\d) - (?<day>\d\d) )
+
+ There are four named subpatterns, so the table has four entries, and
+ each entry in the table is eight bytes long. The table is as follows,
+ with non-printing bytes shown in hexadecimal, and undefined bytes shown
+ as ??:
+
+ 00 01 d a t e 00 ??
+ 00 05 d a y 00 ?? ??
+ 00 04 m o n t h 00
+ 00 02 y e a r 00 ??
+
+ When writing code to extract data from named subpatterns using the
+ name-to-number map, remember that the length of the entries is likely
+ to be different for each compiled pattern.
+
+ PCRE2_INFO_NEWLINE
+
+ The output is a uint32_t whose value specifies the default character
+ sequence that will be recognized as meaning "newline" while matching.
+ The values are:
+
+ 1 Carriage return (CR)
+ 2 Linefeed (LF)
+ 3 Carriage return, linefeed (CRLF)
+ 4 Any Unicode line ending
+ 5 Any of CR, LF, or CRLF
+
+ The default can be overridden when a pattern is matched.
+
+ PCRE2_INFO_RECURSIONLIMIT
+
+ If the pattern set a recursion limit by including an item of the form
+ (*LIMIT_RECURSION=nnnn) at the start, the value is returned. The third
+ argument should point to an unsigned 32-bit integer. If no such value
+ has been set, the call to pcre2_pattern_info() returns the error
+ PCRE2_ERROR_UNSET.
+
+ PCRE2_INFO_SIZE
+
+ Return the size of the compiled pattern in bytes (for all three
+ libraries). The third argument should point to a size_t variable. This
+ value does not include the size of the pcre2_code structure that is
+ returned by pcre2_compile(). The value that is used when pcre2_compile()
+ is getting memory in which to place the compiled data is the value
+ returned by this option plus the size of the pcre2_code structure. Pro-
+ cessing a pattern with the JIT compiler does not alter the value
+ returned by this option.
+
+
+THE MATCH DATA BLOCK
+
+ pcre2_match_data_create(uint32_t ovecsize,
+ pcre2_general_context *gcontext);
+
+ pcre2_match_data_create_from_pattern(pcre2_code *code,
+ pcre2_general_context *gcontext);
+
+ void pcre2_match_data_free(pcre2_match_data *match_data);
+
+ Information about successful and unsuccessful matches is placed in a
+ match data block, which is an opaque structure that is accessed by
+ function calls. In particular, the match data block contains a vector
+ of offsets into the subject string that define the matched part of the
+ subject and any substrings that were captured. This is known as the ovec-
+ tor.
+
+ Before calling pcre2_match() or pcre2_dfa_match() you must create a
+ match data block by calling one of the creation functions above. For
+ pcre2_match_data_create(), the first argument is the number of pairs of
+ offsets in the ovector. One pair of offsets is required to identify the
+ string that matched the whole pattern, with another pair for each cap-
+ tured substring. For example, a value of 4 creates enough space to
+ record the matched portion of the subject plus three captured sub-
+ strings.
+
+ For pcre2_match_data_create_from_pattern(), the first argument is a
+ pointer to a compiled pattern. In this case the ovector is created to
+ be exactly the right size to hold all the substrings a pattern might
+ capture.
+
+ The second argument of both these functions is a pointer to a general
+ context, which can specify custom memory management for obtaining the
+ memory for the match data block. If you are not using custom memory
+ management, pass NULL.
+
+ A match data block can be used many times, with the same or different
+ compiled patterns. When it is no longer needed, it should be freed by
+ calling pcre2_match_data_free(). How to extract information from a
+ match data block after a match operation is described in the sections
+ on matched strings and other match data below.
+
+
+MATCHING A PATTERN: THE TRADITIONAL FUNCTION
+
+ int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext);
+
+ The function pcre2_match() is called to match a subject string against
+ a compiled pattern, which is passed in the code argument. You can call
+ pcre2_match() with the same code argument as many times as you like, in
+ order to find multiple matches in the subject string or to match dif-
+ ferent subject strings with the same pattern.
+
+ This function is the main matching facility of the library, and it
+ operates in a Perl-like manner. For specialist use there is also an
+ alternative matching function, which is described below in the section
+ about the pcre2_dfa_match() function.
+
+ Here is an example of a simple call to pcre2_match():
+
+ pcre2_match_data *md = pcre2_match_data_create(4, NULL);
+ int rc = pcre2_match(
+ re, /* result of pcre2_compile() */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ md, /* the match data block */
+ NULL); /* a match context; NULL means use defaults */
+
+ If the subject string is zero-terminated, the length can be given as
+ PCRE2_ZERO_TERMINATED. A match context must be provided if certain less
+ common matching parameters are to be changed. For details, see the sec-
+ tion on the match context above.
+
+ The string to be matched by pcre2_match()
+
+ The subject string is passed to pcre2_match() as a pointer in subject,
+ a length in length, and a starting offset in startoffset. The length
+ and offset are in code units, not characters. That is, they are in
+ bytes for the 8-bit library, 16-bit code units for the 16-bit library,
+ and 32-bit code units for the 32-bit library, whether or not UTF pro-
+ cessing is enabled.
+
+ If startoffset is greater than the length of the subject, pcre2_match()
+ returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the
+ search for a match starts at the beginning of the subject, and this is
+ by far the most common case. In UTF-8 or UTF-16 mode, the starting off-
+ set must point to the start of a character, or to the end of the sub-
+ ject (in UTF-32 mode, one code unit equals one character, so all off-
+ sets are valid). Like the pattern string, the subject may contain
+ binary zeroes.
+
+ A non-zero starting offset is useful when searching for another match
+ in the same subject by calling pcre2_match() again after a previous
+ success. Setting startoffset differs from passing over a shortened
+ string and setting PCRE2_NOTBOL in the case of a pattern that begins
+ with any kind of lookbehind. For example, consider the pattern
+
+ \Biss\B
+
+ which finds occurrences of "iss" in the middle of words. (\B matches
+ only if the current position in the subject is not a word boundary.)
+ When applied to the string "Mississippi" the first call to pcre2_match()
+ finds the first occurrence. If pcre2_match() is called again with just
+ the remainder of the subject, namely "issippi", it does not match,
+ because \B is always false at the start of the subject, which is deemed
+ to be a word boundary. However, if pcre2_match() is passed the entire
+ string again, but with startoffset set to 4, it finds the second occur-
+ rence of "iss" because it is able to look behind the starting point to
+ discover that it is preceded by a letter.
+
+ Finding all the matches in a subject is tricky when the pattern can
+ match an empty string. It is possible to emulate Perl's /g behaviour by
+ first trying the match again at the same offset, with the
+ PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED options, and then if that
+ fails, advancing the starting offset and trying an ordinary match
+ again. There is some code that demonstrates how to do this in the
+ pcre2demo sample program. In the most general case, you have to check
+ to see if the newline convention recognizes CRLF as a newline, and if
+ so, and the current character is CR followed by LF, advance the start-
+ ing offset by two characters instead of one.
+
+ If a non-zero starting offset is passed when the pattern is anchored,
+ one attempt to match at the given offset is made. This can only succeed
+ if the pattern does not require the match to be at the start of the
+ subject.
+
+ Option bits for pcre2_match()
+
+ The unused bits of the options argument for pcre2_match() must be zero.
+ The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
+ PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
+ PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and
+ PCRE2_PARTIAL_SOFT. Their action is described below.
+
+ If the pattern was successfully processed by the just-in-time (JIT)
+ compiler, the only supported options for matching using the JIT code
+ are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
+ PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. If an
+ unsupported option is used, JIT matching is disabled and the normal
+ interpretive code in pcre2_match() is run.
+
+ PCRE2_ANCHORED
+
+ The PCRE2_ANCHORED option limits pcre2_match() to matching at the first
+ matching position. If a pattern was compiled with PCRE2_ANCHORED, or
+ turned out to be anchored by virtue of its contents, it cannot be made
+ unanchored at matching time. Note that setting the option at match time
+ disables JIT matching.
+
+ PCRE2_NOTBOL
+
+ This option specifies that the first character of the subject string is not
+ the beginning of a line, so the circumflex metacharacter should not
+ match before it. Setting this without PCRE2_MULTILINE (at compile time)
+ causes circumflex never to match. This option affects only the behav-
+ iour of the circumflex metacharacter. It does not affect \A.
+
+ PCRE2_NOTEOL
+
+ This option specifies that the end of the subject string is not the end
+ of a line, so the dollar metacharacter should not match it nor (except
+ in multiline mode) a newline immediately before it. Setting this with-
+ out PCRE2_MULTILINE (at compile time) causes dollar never to match.
+ This option affects only the behaviour of the dollar metacharacter. It
+ does not affect \Z or \z.
+
+ PCRE2_NOTEMPTY
+
+ An empty string is not considered to be a valid match if this option is
+ set. If there are alternatives in the pattern, they are tried. If all
+ the alternatives match the empty string, the entire match fails. For
+ example, if the pattern
+
+ a?b?
+
+ is applied to a string not beginning with "a" or "b", it matches an
+ empty string at the start of the subject. With PCRE2_NOTEMPTY set, this
+ match is not valid, so PCRE2 searches further into the string for
+ occurrences of "a" or "b".
+
+ PCRE2_NOTEMPTY_ATSTART
+
+ This is like PCRE2_NOTEMPTY, except that an empty string match that is
+ not at the start of the subject is permitted. If the pattern is
+ anchored, such a match can occur only if the pattern contains \K.
+
+ PCRE2_NO_START_OPTIMIZE
+
+ There are a number of optimizations that pcre2_match() uses at the
+ start of a match, in order to speed up the process. For example, if it
+ is known that an unanchored match must start with a specific character,
+ it searches the subject for that character, and fails immediately if it
+ cannot find it, without actually running the main matching function.
+ This means that a special item such as (*COMMIT) at the start of a pat-
+ tern is not considered until after a suitable starting point for the
+ match has been found. Also, when callouts or (*MARK) items are in use,
+ these "start-up" optimizations can cause them to be skipped if the pat-
+ tern is never actually used. The start-up optimizations are in effect a
+ pre-scan of the subject that takes place before the pattern is run.
+
+ The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
+ possibly causing performance to suffer, but ensuring that in cases
+ where the result is "no match", the callouts do occur, and that items
+ such as (*COMMIT) and (*MARK) are considered at every possible starting
+ position in the subject string. If PCRE2_NO_START_OPTIMIZE is set at
+ compile time, it cannot be unset at matching time. The use of
+ PCRE2_NO_START_OPTIMIZE at matching time (that is, passing it to
+ pcre2_match()) disables JIT execution; in this situation, matching is
+ always done interpretively.
+
+ Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching
+ operation. Consider the pattern
+
+ (*COMMIT)ABC
+
+ When this is compiled, PCRE2 records the fact that a match must start
+ with the character "A". Suppose the subject string is "DEFABC". The
+ start-up optimization scans along the subject, finds "A" and runs the
+ first match attempt from there. The (*COMMIT) item means that the pat-
+ tern must match the current starting position, which in this case, it
+ does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE
+ set, the initial scan along the subject string does not happen. The
+ first match attempt is run starting from "D" and when this fails,
+ (*COMMIT) prevents any further matches being tried, so the overall
+ result is "no match". There are also other start-up optimizations. For
+ example, a minimum length for the subject may be recorded. Consider the
+ pattern
+
+ (*MARK:A)(X|Y)
+
+ The minimum length for a match is one character. If the subject is
+ "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
+ to match an empty string at the end of the subject does not take place,
+ because PCRE2 knows that the subject is now too short, and so the
+ (*MARK) is never encountered. In this case, the optimization does not
+ affect the overall match result, which is still "no match", but it does
+ affect the auxiliary information that is returned.
+
+ PCRE2_NO_UTF_CHECK
+
+ When PCRE2_UTF is set at compile time, the validity of the subject as a
+ UTF string is checked by default when pcre2_match() is subsequently
+ called. The entire string is checked before any other processing takes
+ place, and a negative error code is returned if the check fails. There
+ are several UTF error codes for each code unit width, corresponding to
+ different problems with the code unit sequence. The value of startoff-
+ set is also checked, to ensure that it points to the start of a charac-
+ ter or to the end of the subject. There are discussions about the
+ validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the
+ pcre2unicode page.
+
+ If you know that your subject is valid, and you want to skip these
+ checks for performance reasons, you can set the PCRE2_NO_UTF_CHECK
+ option when calling pcre2_match(). You might want to do this for the
+ second and subsequent calls to pcre2_match() if you are making repeated
+ calls to find all the matches in a single subject string.
+
+ NOTE: When PCRE2_NO_UTF_CHECK is set, the effect of passing an invalid
+ string as a subject, or an invalid value of startoffset, is undefined.
+ Your program may crash or loop indefinitely.
+
+ PCRE2_PARTIAL_HARD
+ PCRE2_PARTIAL_SOFT
+
+ These options turn on the partial matching feature. A partial match
+ occurs if the end of the subject string is reached successfully, but
+ there are not enough subject characters to complete the match. If this
+ happens when PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) is set,
+ matching continues by testing any remaining alternatives. Only if no
+ complete match can be found is PCRE2_ERROR_PARTIAL returned instead of
+ PCRE2_ERROR_NOMATCH. In other words, PCRE2_PARTIAL_SOFT says that the
+ caller is prepared to handle a partial match, but only if no complete
+ match can be found.
+
+ If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this
+ case, if a partial match is found, pcre2_match() immediately returns
+ PCRE2_ERROR_PARTIAL, without considering any other alternatives. In
+ other words, when PCRE2_PARTIAL_HARD is set, a partial match is consid-
+ ered to be more important than an alternative complete match.
+
+ There is a more detailed discussion of partial and multi-segment match-
+ ing, with examples, in the pcre2partial documentation.
+
+
+NEWLINE HANDLING WHEN MATCHING
+
+ When PCRE2 is built, a default newline convention is set; this is usu-
+ ally the standard convention for the operating system. The default can
+ be overridden in either a compile context or a match context. However,
+ changing the newline convention at match time disables JIT matching.
+ During matching, the newline choice affects the behaviour of the dot,
+ circumflex, and dollar metacharacters. It may also alter the way the
+ match position is advanced after a match failure for an unanchored pat-
+ tern.
+
+ When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is
+ set, and a match attempt for an unanchored pattern fails when the cur-
+ rent position is at a CRLF sequence, and the pattern contains no
+ explicit matches for CR or LF characters, the match position is
+ advanced by two characters instead of one, in other words, to after the
+ CRLF.
+
+ The above rule is a compromise that makes the most common cases work as
+ expected. For example, if the pattern is .+A (and the PCRE2_DOTALL
+ option is not set), it does not match the string "\r\nA" because, after
+ failing at the start, it skips both the CR and the LF before retrying.
+ However, the pattern [\r\n]A does match that string, because it con-
+ tains an explicit CR or LF reference, and so advances only by one char-
+ acter after the first failure.
+
+ An explicit match for CR or LF is either a literal appearance of one of
+ those characters in the pattern, or one of the \r or \n escape
+ sequences. Implicit matches such as [^X] do not count, nor does \s
+ (which includes CR and LF in the characters that it matches).
+
+ Notwithstanding the above, anomalous effects may still occur when CRLF
+ is a valid newline sequence and explicit \r or \n escapes appear in the
+ pattern.
+
+
+HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
+
+ uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
+
+ PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
+
+ In general, a pattern matches a certain portion of the subject, and in
+ addition, further substrings from the subject may be picked out by
+ parenthesized parts of the pattern. Following the usage in Jeffrey
+ Friedl's book, this is called "capturing" in what follows, and the
+ phrase "capturing subpattern" is used for a fragment of a pattern that
+ picks out a substring. PCRE2 supports several other kinds of parenthe-
+ sized subpattern that do not cause substrings to be captured. The
+ pcre2_pattern_info() function can be used to find out how many captur-
+ ing subpatterns there are in a compiled pattern.
+
+ The overall matched string and any captured substrings are returned to
+ the caller via a vector of PCRE2_SIZE values, called the ovector. This
+ is contained within the match data block. You can obtain direct access
+ to the ovector by calling pcre2_get_ovector_pointer() to find its
+ address, and pcre2_get_ovector_count() to find the number of pairs of
+ values it contains. Alternatively, you can use the auxiliary functions
+ for accessing captured substrings by number or by name (see below).
+
+ Within the ovector, the first in each pair of values is set to the off-
+ set of the first code unit of a substring, and the second is set to the
+ offset of the first code unit after the end of a substring. These val-
+ ues are always code unit offsets, not character offsets. That is, they
+ are byte offsets in the 8-bit library, 16-bit offsets in the 16-bit
+ library, and 32-bit offsets in the 32-bit library.
+
+ The first pair of offsets (that is, ovector[0] and ovector[1]) identi-
+ fies the portion of the subject string that was matched by the entire
+ pattern. The next pair is used for the first capturing subpattern, and
+ so on. The value returned by pcre2_match() is one more than the high-
+ est numbered pair that has been set. For example, if two substrings
+ have been captured, the returned value is 3. If there are no capturing
+ subpatterns, the return value from a successful match is 1, indicating
+ that just the first pair of offsets has been set.
+
+ If a capturing subpattern is matched repeatedly within a single match
+ operation, it is the last portion of the string that it matched that is
+ returned.
+
+ If the ovector is too small to hold all the captured substring offsets,
+ as much as possible is filled in, and the function returns a value of
+ zero. If neither the actual string matched nor any captured substrings
+ are of interest, pcre2_match() may be called with a match data block
+ whose ovector is of zero length. However, if the pattern contains back
+ references and the ovector is not big enough to remember the related
+ substrings, PCRE2 has to get additional memory for use during matching.
+ Thus it is usually advisable to set up a match data block containing an
+ ovector of reasonable size.
+
+ It is possible for capturing subpattern number n+1 to match some part
+ of the subject when subpattern n has not been used at all. For example,
+ if the string "abc" is matched against the pattern (a|(z))(bc) the
+ return from the function is 4, and subpatterns 1 and 3 are matched, but
+ 2 is not. When this happens, both values in the offset pairs corre-
+ sponding to unused subpatterns are set to PCRE2_UNSET.
+
+ Offset values that correspond to unused subpatterns at the end of the
+ expression are also set to PCRE2_UNSET. For example, if the string
+ "abc" is matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3
+ are not matched. The return from the function is 2, because the high-
+ est used capturing subpattern number is 1. The offsets for the sec-
+ ond and third capturing subpatterns (assuming the vector is large
+ enough, of course) are set to PCRE2_UNSET.
+
+ Elements in the ovector that do not correspond to capturing parentheses
+ in the pattern are never changed. That is, if a pattern contains n cap-
+ turing parentheses, no more than ovector[0] to ovector[2n+1] are set by
+ pcre2_match(). The other elements retain whatever values they previ-
+ ously had.
+
+ Other information about the match
+
+ PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
+
+ PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *match_data);
+
+ PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *match_data);
+
+ PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
+
+ In addition to the offsets in the ovector, other information about a
+ match is retained in the match data block and can be retrieved by the
+ above functions.
+
+ When a (*MARK) name is to be passed back, pcre2_get_mark() returns a
+ pointer to the zero-terminated name, which is within the compiled pat-
+ tern. Otherwise NULL is returned. A (*MARK) name may be available
+ after a failed match or a partial match, as well as after a successful
+ one.
+
+ The other three functions yield values that give information about the
+ part of the subject string that was inspected during a successful match
+ or a partial match. Their results are undefined after a failed match.
+ They return the following values, respectively:
+
+ (1) The offset of the leftmost character that was inspected during the
+ match. This can be earlier than the point at which the match started
+ if the pattern contains lookbehind assertions or \b or \B at the start.
+
+ (2) The offset of the character that follows the rightmost character
+ that was inspected during the match. This can be after the end of the
+ match if the pattern contains lookahead assertions.
+
+ (3) The offset of the character at which the successful or partial
+ match started. This can be different to the value of ovector[0] if the
+ pattern contains the \K escape sequence.
+
+ For example, if the pattern (?<=abc)xx\Kyy(?=def) is matched against
+ the string "123abcxxyydef123", the resulting offsets are:
+
+ ovector[0] 8
+ ovector[1] 10
+ leftchar 3
+ rightchar 13
+ startchar 6
+
+ The allusedtext modifier in pcre2test can be used to display a longer
+ string that shows the leftmost and rightmost characters in a match
+ instead of just the matched string.
+
+ Error return values from pcre2_match()
+
+ If pcre2_match() fails, it returns a negative number. This can be con-
+ verted to a text string by calling pcre2_get_error_message(). Negative
+ error codes are also returned by other functions, and are documented
+ with them. The codes are given names in the header file. If UTF check-
+ ing is in force and an invalid UTF subject string is detected, one of a
+ number of UTF-specific negative error codes is returned. Details are
+ given in the pcre2unicode page. The following are the other errors that
+ may be returned by pcre2_match():
+
+ PCRE2_ERROR_NOMATCH
+
+ The subject string did not match the pattern.
+
+ PCRE2_ERROR_PARTIAL
+
+ The subject string did not match, but it did match partially. See the
+ pcre2partial documentation for details of partial matching.
+
+ PCRE2_ERROR_BADMAGIC
+
+ PCRE2 stores a 4-byte "magic number" at the start of the compiled code,
+ to catch the case when it is passed a junk pointer. This is the error
+ that is returned when the magic number is not present.
+
+ PCRE2_ERROR_BADMODE
+
+ This error is given when a pattern that was compiled by the 8-bit
+ library is passed to a 16-bit or 32-bit library function, or vice
+ versa.
+
+ PCRE2_ERROR_BADOFFSET
+
+ The value of startoffset was greater than the length of the subject.
+
+ PCRE2_ERROR_BADOPTION
+
+ An unrecognized bit was set in the options argument.
+
+ PCRE2_ERROR_BADUTFOFFSET
+
+ The UTF code unit sequence that was passed as a subject was checked and
+ found to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the
+ value of startoffset did not point to the beginning of a UTF character
+ or the end of the subject.
+
+ PCRE2_ERROR_CALLOUT
+
+ This error is never generated by pcre2_match() itself. It is provided
+ for use by callout functions that want to cause pcre2_match() to return
+ a distinctive error code. See the pcre2callout documentation for
+ details.
+
+ PCRE2_ERROR_INTERNAL
+
+ An unexpected internal error has occurred. This error could be caused
+ by a bug in PCRE2 or by overwriting of the compiled pattern.
+
+ PCRE2_ERROR_JIT_BADOPTION
+
+ This error is returned when a pattern that was successfully studied
+ using JIT is being matched, but the matching mode (partial or complete
+ match) does not correspond to any JIT compilation mode. When the JIT
+ fast path function is used, this error may be also given for invalid
+ options. See the pcre2jit documentation for more details.
+
+ PCRE2_ERROR_JIT_STACKLIMIT
+
+ This error is returned when a pattern that was successfully studied
+ using JIT is being matched, but the memory available for the just-in-
+ time processing stack is not large enough. See the pcre2jit documenta-
+ tion for more details.
+
+ PCRE2_ERROR_MATCHLIMIT
+
+ The backtracking limit was reached.
+
+ PCRE2_ERROR_NOMEMORY
+
+ If a pattern contains back references, but the ovector is not big
+ enough to remember the referenced substrings, PCRE2 gets a block of
+ memory at the start of matching to use for this purpose. There are some
+ other special cases where extra memory is needed during matching. This
+ error is given when memory cannot be obtained.
+
+ PCRE2_ERROR_NULL
+
+ Either the code, subject, or match_data argument was passed as NULL.
+
+ PCRE2_ERROR_RECURSELOOP
+
+ This error is returned when pcre2_match() detects a recursion loop
+ within the pattern. Specifically, it means that either the whole pat-
+ tern or a subpattern has been called recursively for the second time at
+ the same position in the subject string. Some simple patterns that
+ might do this are detected and faulted at compile time, but more com-
+ plicated cases, in particular mutual recursions between two different
+ subpatterns, cannot be detected until run time.
+
+ PCRE2_ERROR_RECURSIONLIMIT
+
+ The internal recursion limit was reached.
+
+
+EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
+
+ int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
+ unsigned int number, PCRE2_SIZE *length);
+
+ int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
+ unsigned int number, PCRE2_UCHAR *buffer,
+ PCRE2_SIZE *bufflen);
+
+ int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
+ unsigned int number, PCRE2_UCHAR **bufferptr,
+ PCRE2_SIZE *bufflen);
+
+ void pcre2_substring_free(PCRE2_UCHAR *buffer);
+
+ Captured substrings can be accessed directly by using the ovector as
+ described above. For convenience, auxiliary functions are provided for
+ extracting captured substrings as new, separate, zero-terminated
+ strings. The functions in this section identify substrings by number.
+ The next section describes similar functions for extracting substrings
+ by name. A substring that contains a binary zero is correctly extracted
+ and has a further zero added on the end, but the result is not, of
+ course, a C string.
+
+ You can find the length in code units of a captured substring without
+ extracting it by calling pcre2_substring_length_bynumber(). The first
+ argument is a pointer to the match data block, the second is the group
+ number, and the third is a pointer to a variable into which the length
+ is placed.
+
+ The pcre2_substring_copy_bynumber() function copies one string into a
+ supplied buffer, whereas pcre2_substring_get_bynumber() copies it into
+ new memory, obtained using the same memory allocation function that was
+ used for the match data block. The first two arguments of these func-
+ tions are a pointer to the match data block and a capturing group num-
+ ber. A group number of zero extracts the substring that matched the
+ entire pattern, and higher values extract the captured substrings.
+
+ The final arguments of pcre2_substring_copy_bynumber() are a pointer to
+ the buffer and a pointer to a variable that contains its length in code
+ units. This is updated to contain the actual number of code units
+ used, excluding the terminating zero.
+
+ For pcre2_substring_get_bynumber() the third and fourth arguments point
+ to variables that are updated with a pointer to the new memory and the
+ number of code units that comprise the substring, again excluding the
+ terminating zero. When the substring is no longer needed, the memory
+ should be freed by calling pcre2_substring_free().
+
+ The return value from these functions is zero for success, or one of
+ these error codes:
+
+ PCRE2_ERROR_NOMEMORY
+
+ The buffer was too small for pcre2_substring_copy_bynumber(), or the
+ attempt to get memory failed for pcre2_substring_get_bynumber().
+
+ PCRE2_ERROR_NOSUBSTRING
+
+ No substring with the given number was captured. This could be because
+ there is no capturing group of that number in the pattern, or because
+ the group with that number did not participate in the match, or because
+ the ovector was too small to capture that group.
+
+
+EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS
+
+ int pcre2_substring_list_get(pcre2_match_data *match_data,
+ PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
+
+ void pcre2_substring_list_free(PCRE2_SPTR *list);
+
+ The pcre2_substring_list_get() function extracts all available sub-
+ strings and builds a list of pointers to them, and a second list that
+ contains their lengths (in code units), excluding a terminating zero
+ that is added to each of them. All this is done in a single block of
+ memory that is obtained using the same memory allocation function that
+ was used to get the match data block.
+
+ The address of the memory block is returned via listptr, which is also
+ the start of the list of string pointers. The end of the list is marked
+ by a NULL pointer. The address of the list of lengths is returned via
+ lengthsptr. If your strings do not contain binary zeros and you do not
+ therefore need the lengths, you may supply NULL as the lengthsptr argu-
+ ment to disable the creation of a list of lengths. The yield of the
+ function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the mem-
+ ory block could not be obtained. When the list is no longer needed, it
+ should be freed by calling pcre2_substring_list_free().
+
+ If this function encounters a substring that is unset, which can happen
+ when capturing subpattern number n+1 matches some part of the subject,
+ but subpattern n has not been used at all, it returns an empty string.
+ This can be distinguished from a genuine zero-length substring by
+ inspecting the appropriate offset in the ovector, which contains
+ PCRE2_UNSET for unset substrings.
+
+
+EXTRACTING CAPTURED SUBSTRINGS BY NAME
+
+ int pcre2_substring_number_from_name(const pcre2_code *code,
+ PCRE2_SPTR name);
+
+ int pcre2_substring_length_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_SIZE *length);
+
+ int pcre2_substring_copy_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
+
+ int pcre2_substring_get_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
+
+ void pcre2_substring_free(PCRE2_UCHAR *buffer);
+
+ To extract a substring by name, you first have to find the associated
+ number. For example, for this pattern:
+
+ (a+)b(?<xxx>\d+)...
+
+ the number of the subpattern called "xxx" is 2. If the name is known to
+ be unique (PCRE2_DUPNAMES was not set), you can find the number from
+ the name by calling pcre2_substring_number_from_name(). The first argu-
+ ment is the compiled pattern, and the second is the name. The yield of
+ the function is the subpattern number, or PCRE2_ERROR_NOSUBSTRING if
+ there is no subpattern of that name.
+
+ Given the number, you can extract the substring directly, or use one of
+ the functions described in the previous section. For convenience, there
+ are also "byname" functions that correspond to the "bynumber" func-
+ tions, the only difference being that the second argument is a name
+ instead of a number. However, if PCRE2_DUPNAMES is set and there are
+ duplicate names, the behaviour may not be what you want (see the next
+ section).
+
+ Warning: If the pattern uses the (?| feature to set up multiple subpat-
+ terns with the same number, as described in the section on duplicate
+ subpattern numbers in the pcre2pattern page, you cannot use names to
+ distinguish the different subpatterns, because names are not included
+ in the compiled code. The matching process uses only numbers. For this
+ reason, the use of different names for subpatterns of the same number
+ causes an error at compile time.
+
+
+DUPLICATE SUBPATTERN NAMES
+
+ int pcre2_substring_nametable_scan(const pcre2_code *code,
+ PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
+
+ When a pattern is compiled with the PCRE2_DUPNAMES option, names for
+ subpatterns are not required to be unique. Duplicate names are always
+ allowed for subpatterns with the same number, created by using the (?|
+ feature. Indeed, if such subpatterns are named, they are required to
+ use the same names.
+
+ Normally, patterns with duplicate names are such that in any one match,
+ only one of the named subpatterns participates. An example is shown in
+ the pcre2pattern documentation.
+
+ When duplicates are present, pcre2_substring_copy_byname() and
+ pcre2_substring_get_byname() return the first substring corresponding
+ to the given name that is set. If none are set, PCRE2_ERROR_NOSUBSTRING
+ is returned. The pcre2_substring_number_from_name() function returns
+ one of the numbers that are associated with the name, but it is not
+ defined which it is.
+
+ If you want to get full details of all captured substrings for a given
+ name, you must use the pcre2_substring_nametable_scan() function. The
+ first argument is the compiled pattern, and the second is the name. If
+ the third and fourth arguments are NULL, the function returns a group
+ number (it is not defined which). Otherwise, the third and fourth argu-
+ ments must be pointers to variables that are updated by the function.
+ After it has run, they point to the first and last entries in the name-
+ to-number table for the given name, and the function returns the length
+ of each entry. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if
+ there are no entries for the given name.
+
+ The format of the name table is described above in the section entitled
+ Information about a pattern. Given all the relevant entries for
+ the name, you can extract each of their numbers, and hence the captured
+ data.
+
+
+FINDING ALL POSSIBLE MATCHES
+
+ The traditional matching function uses a similar algorithm to Perl,
+ which stops when it finds the first match, starting at a given point in
+ the subject. If you want to find all possible matches, or the longest
+ possible match at a given position, consider using the alternative
+ matching function (see below) instead. If you cannot use the alterna-
+ tive function, you can kludge it up by making use of the callout facil-
+ ity, which is described in the pcre2callout documentation.
+
+ What you have to do is to insert a callout right at the end of the pat-
+ tern. When your callout function is called, extract and save the cur-
+ rent matched substring. Then return 1, which forces pcre2_match() to
+ backtrack and try other alternatives. Ultimately, when it runs out of
+ matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
+
+
+MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
+
+ int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext,
+ int *workspace, PCRE2_SIZE wscount);
+
+ The function pcre2_dfa_match() is called to match a subject string
+ against a compiled pattern, using a matching algorithm that scans the
+ subject string just once, and does not backtrack. This has different
+ characteristics to the normal algorithm, and is not compatible with
+ Perl. Some of the features of PCRE2 patterns are not supported. Never-
+ theless, there are times when this kind of matching can be useful. For
+ a discussion of the two matching algorithms, and a list of features
+ that pcre2_dfa_match() does not support, see the pcre2matching documen-
+ tation.
+
+ The arguments for the pcre2_dfa_match() function are the same as for
+ pcre2_match(), plus two extras. The ovector within the match data block
+ is used in a different way, and this is described below. The other com-
+ mon arguments are used in the same way as for pcre2_match(), so their
+ description is not repeated here.
+
+ The two additional arguments provide workspace for the function. The
+ workspace vector should contain at least 20 elements. It is used for
+ keeping track of multiple paths through the pattern tree. More
+ workspace is needed for patterns and subjects where there are a lot of
+ potential matches.
+
+ Here is an example of a simple call to pcre2_dfa_match():
+
+ int wspace[20];
+ pcre2_match_data *md = pcre2_match_data_create(4, NULL);
+ int rc = pcre2_dfa_match(
+ re, /* result of pcre2_compile() */
+ "some string", /* the subject string */
+ 11, /* the length of the subject string */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ md, /* the match data block */
+ NULL, /* a match context; NULL means use defaults */
+ wspace, /* working space vector */
+ 20); /* number of elements (NOT size in bytes) */
+
+ Option bits for pcre2_dfa_match()
+
+ The unused bits of the options argument for pcre2_dfa_match() must be
+ zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
+ PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
+ PCRE2_NO_UTF_CHECK, PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD,
+ PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
+ the last four of these are exactly the same as for pcre2_match(), so
+ their description is not repeated here.
+
+ PCRE2_PARTIAL_HARD
+ PCRE2_PARTIAL_SOFT
+
+ These have the same general effect as they do for pcre2_match(), but
+ the details are slightly different. When PCRE2_PARTIAL_HARD is set for
+ pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
+ subject is reached and there is still at least one matching possibility
+ that requires additional characters. This happens even if some complete
+ matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
+ return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
+ if the end of the subject is reached, there have been no complete
+ matches, but there is still at least one matching possibility. The por-
+ tion of the string that was inspected when the longest partial match
+ was found is set as the first matching string in both cases. There is a
+ more detailed discussion of partial and multi-segment matching, with
+ examples, in the pcre2partial documentation.
+
+ PCRE2_DFA_SHORTEST
+
+ Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
+ stop as soon as it has found one match. Because of the way the alterna-
+ tive algorithm works, this is necessarily the shortest possible match
+ at the first possible matching point in the subject string.
+
+ PCRE2_DFA_RESTART
+
+ When pcre2_dfa_match() returns a partial match, it is possible to call
+ it again, with additional subject characters, and have it continue with
+ the same match. The PCRE2_DFA_RESTART option requests this action; when
+ it is set, the workspace and wscount arguments must reference the same
+ vector as before because data about the match so far is left in them
+ after a partial match. There is more discussion of this facility in the
+ pcre2partial documentation.
+
+ Successful returns from pcre2_dfa_match()
+
+ When pcre2_dfa_match() succeeds, it may have matched more than one sub-
+ string in the subject. Note, however, that all the matches from one run
+ of the function start at the same point in the subject. The shorter
+ matches are all initial substrings of the longer matches. For example,
+ if the pattern
+
+ <.*>
+
+ is matched against the string
+
+ This is <something> <something else> <something further> no more
+
+ the three matched strings are
+
+ <something>
+ <something> <something else>
+ <something> <something else> <something further>
+
+ On success, the yield of the function is a number greater than zero,
+ which is the number of matched substrings. The offsets of the sub-
+ strings are returned in the ovector, and can be extracted in the same
+ way as for pcre2_match(). They are returned in reverse order of
+ length; that is, the longest matching string is given first. If there
+ were too many matches to fit into the ovector, the yield of the func-
+ tion is zero, and the vector is filled with the longest matches.
+
+ NOTE: PCRE2's "auto-possessification" optimization usually applies to
+ character repeats at the end of a pattern (as well as internally). For
+ example, the pattern "a\d+" is compiled as if it were "a\d++" because
+ there is no point in backtracking into the repeated digits. For DFA
+ matching, this means that only one possible match is found. If you
+ really do want multiple matches in such cases, either use an ungreedy
+ repeat ("a\d+?") or set the PCRE2_NO_AUTO_POSSESS option when compil-
+ ing.
+
+ Error returns from pcre2_dfa_match()
+
+ The pcre2_dfa_match() function returns a negative number when it fails.
+ Many of the errors are the same as for pcre2_match(), as described
+ above. There are in addition the following errors that are specific to
+ pcre2_dfa_match():
+
+ PCRE2_ERROR_DFA_UITEM
+
+ This return is given if pcre2_dfa_match() encounters an item in the
+ pattern that it does not support, for instance, the use of \C or a back
+ reference.
+
+ PCRE2_ERROR_DFA_UCOND
+
+ This return is given if pcre2_dfa_match() encounters a condition item
+ that uses a back reference for the condition, or a test for recursion
+ in a specific group. These are not supported.
+
+ PCRE2_ERROR_DFA_WSSIZE
+
+ This return is given if pcre2_dfa_match() runs out of space in the
+ workspace vector.
+
+ PCRE2_ERROR_DFA_RECURSE
+
+ When a recursive subpattern is processed, the matching function calls
+ itself recursively, using private memory for the ovector and workspace.
+ This error is given if the internal ovector is not large enough. This
+ should be extremely rare, as a vector of size 1000 is used.
+
+ PCRE2_ERROR_DFA_BADRESTART
+
+ When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
+ some plausibility checks are made on the contents of the workspace,
+ which should contain data about the previous partial match. If any of
+ these checks fail, this error is given.
+
+
+SEE ALSO
+
+ pcre2build(3), pcre2libs(3), pcre2callout(3), pcre2matching(3),
+ pcre2partial(3), pcre2posix(3), pcre2demo(3), pcre2sample(3),
+ pcre2stack(3).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 16 September 2014
+ Copyright (c) 1997-2014 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3)
+
+
+
+NAME
+ PCRE2 - Perl-compatible regular expressions (revised API)
+
+SYNOPSIS
+
+ #include <pcre2.h>
+
+ int (*pcre2_callout)(pcre2_callout_block *);
+
+
+DESCRIPTION
+
+ PCRE2 provides a feature called "callout", which is a means of tempo-
+ rarily passing control to the caller of PCRE2 in the middle of pattern
+ matching. The caller of PCRE2 provides an external function by putting
+ its entry point in a match context (see pcre2_set_callout() in the
+ pcre2api documentation).
+
+ Within a regular expression, (?C) indicates the points at which the
+ external function is to be called. Different callout points can be
+ identified by putting a number less than 256 after the letter C. The
+ default value is zero. For example, this pattern has two callout
+ points:
+
+ (?C1)abc(?C2)def
+
+ If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
+ PCRE2 automatically inserts callouts, all with number 255, before each
+ item in the pattern. For example, if PCRE2_AUTO_CALLOUT is used with
+ the pattern
+
+ A(\d{2}|--)
+
+ it is processed as if it were
+
+ (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
+
+ Notice that there is a callout before and after each parenthesis and
+ alternation bar. If the pattern contains a conditional group whose con-
+ dition is an assertion, an automatic callout is inserted immediately
+ before the condition. Such a callout may also be inserted explicitly,
+ for example:
+
+ (?(?C9)(?=a)ab|de)
+
+ This applies only to assertion conditions (because they are themselves
+ independent groups).
+
+ Automatic callouts can be used for tracking the progress of pattern
+ matching. The pcre2test program has a pattern qualifier (/auto_call-
+ out) that sets automatic callouts; when it is used, the output indi-
+ cates how the pattern is being matched. This is useful information when
+ you are trying to optimize the performance of a particular pattern.
+
+
+MISSING CALLOUTS
+
+ You should be aware that, because of optimizations in the way PCRE2
+ compiles and matches patterns, callouts sometimes do not happen exactly
+ as you might expect.
+
+ At compile time, PCRE2 "auto-possessifies" repeated items when it knows
+ that what follows cannot be part of the repeat. For example, a+[bc] is
+ compiled as if it were a++[bc]. The pcre2test output when this pattern
+ is anchored and then applied with automatic callouts to the string
+ "aaaa" is:
+
+ --->aaaa
+ +0 ^ ^
+ +1 ^ a+
+ +3 ^ ^ [bc]
+ No match
+
+ This indicates that when matching [bc] fails, there is no backtracking
+ into a+ and therefore the callouts that would be taken for the back-
+ tracks do not occur. You can disable the auto-possessify feature by
+ passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat-
+ tern with (*NO_AUTO_POSSESS). If this is done in pcre2test (using the
+ /no_auto_possess qualifier), the output changes to this:
+
+ --->aaaa
+ +0 ^ ^
+ +1 ^ a+
+ +3 ^ ^ [bc]
+ +3 ^ ^ [bc]
+ +3 ^ ^ [bc]
+ +3 ^^ [bc]
+ No match
+
+ This time, when matching [bc] fails, the matcher backtracks into a+ and
+ tries again, repeatedly, until a+ itself fails.
+
+ Other optimizations that provide fast "no match" results also affect
+ callouts. For example, if the pattern is
+
+ ab(?C4)cd
+
+ PCRE2 knows that any matching string must contain the letter "d". If
+ the subject string is "abyz", the lack of "d" means that matching
+ doesn't ever start, and the callout is never reached. However, with
+ "abyd", though the result is still no match, the callout is obeyed.
+
+ PCRE2 also knows the minimum length of a matching string, and will
+ immediately give a "no match" return without actually running a match
+ if the subject is not long enough, or, for unanchored patterns, if it
+ has been scanned far enough.
+
+ You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
+ MIZE option to the matching function, or by starting the pattern with
+ (*NO_START_OPT). This slows down the matching process, but does ensure
+ that callouts such as the example above are obeyed.
+
+
+THE CALLOUT INTERFACE
+
+ During matching, when PCRE2 reaches a callout point, the external func-
+ tion that is set in the match context is called (if it is set). This
+ applies to both normal and DFA matching. The only argument to the call-
+ out function is a pointer to a pcre2_callout_block. This structure con-
+ tains the following fields:
+
+ uint32_t version;
+ uint32_t callout_number;
+ uint32_t capture_top;
+ uint32_t capture_last;
+ void *callout_data;
+ PCRE2_SIZE *offset_vector;
+ PCRE2_SPTR mark;
+ PCRE2_SPTR subject;
+ PCRE2_SIZE subject_length;
+ PCRE2_SIZE start_match;
+ PCRE2_SIZE current_position;
+ PCRE2_SIZE pattern_position;
+ PCRE2_SIZE next_item_length;
+
+ The version field contains the version number of the block format. The
+ current version is 0. The version number will change in future if addi-
+ tional fields are added, but the intention is never to remove any of
+ the existing fields.
+
+ The callout_number field contains the number of the callout, as com-
+ piled into the pattern (that is, the number after ?C for manual call-
+ outs, and 255 for automatically generated callouts).
+
+ The offset_vector field is a pointer to the vector of capturing offsets
+ (the "ovector") that was passed to the matching function in the match
+ data block. When pcre2_match() is used, the contents can be inspected,
+ in order to extract substrings that have been matched so far, in the
+ same way as for extracting substrings after a match has completed. For
+ the DFA matching function, this field is not useful.
+
+ The subject and subject_length fields contain copies of the values that
+ were passed to the matching function.
+
+ The start_match field normally contains the offset within the subject
+ at which the current match attempt started. However, if the escape
+ sequence \K has been encountered, this value is changed to reflect the
+ modified starting point. If the pattern is not anchored, the callout
+ function may be called several times from the same point in the pattern
+ for different starting points in the subject.
+
+ The current_position field contains the offset within the subject of
+ the current match pointer.
+
+ When pcre2_match() is used, the capture_top field contains one more
+ than the number of the highest numbered captured substring so far. If
+ no substrings have been captured, the value of capture_top is one. This
+ is always the case when the DFA functions are used, because they do not
+ support captured substrings.
+
+ The capture_last field contains the number of the most recently cap-
+ tured substring. However, when a recursion exits, the value reverts to
+ what it was outside the recursion, as do the values of all captured
+ substrings. If no substrings have been captured, the value of cap-
+ ture_last is 0. This is always the case for the DFA matching functions.
+
+ The callout_data field contains a value that is passed to a matching
+ function specifically so that it can be passed back in callouts. It is
+ set in the match context when the callout is set up by calling
+ pcre2_set_callout() (see the pcre2api documentation).
+
+ The pattern_position field contains the offset to the next item to be
+ matched in the pattern string.
+
+ The next_item_length field contains the length of the next item to be
+ matched in the pattern string. When the callout immediately precedes an
+ alternation bar, a closing parenthesis, or the end of the pattern, the
+ length is zero. When the callout precedes an opening parenthesis, the
+ length is that of the entire subpattern.
+
+ The pattern_position and next_item_length fields are intended to help
+ in distinguishing between different automatic callouts, which all have
+ the same callout number. However, they are set for all callouts.
+
+ In callouts from pcre2_match() the mark field contains a pointer to the
+ zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
+ (*THEN) item in the match, or NULL if no such items have been passed.
+ Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
+ previous (*MARK). In callouts from the DFA matching function this field
+ always contains NULL.
+
+
+RETURN VALUES
+
+ The external callout function returns an integer to PCRE2. If the value
+ is zero, matching proceeds as normal. If the value is greater than
+ zero, matching fails at the current point, but the testing of other
+ matching possibilities goes ahead, just as if a lookahead assertion had
+ failed. If the value is less than zero, the match is abandoned, and the
+ matching function returns the negative value.
+
+ Negative values should normally be chosen from the set of
+ PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
+ standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
+ reserved for use by callout functions; it will never be used by PCRE2
+ itself.
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 19 October 2014
+ Copyright (c) 1997-2014 University of Cambridge.
+------------------------------------------------------------------------------
+
+
+PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3)
+
+
+
+NAME
+ PCRE2 - Perl-compatible regular expressions (revised API)
+
+UNICODE AND UTF SUPPORT
+
+ When PCRE2 is built with Unicode support, it acquires knowledge of Uni-
+ code character properties and can process text strings in UTF-8,
+ UTF-16, or UTF-32 format (depending on the code unit width). By
+ default, PCRE2 assumes that one code unit is one character. To process
+ a pattern as a UTF string, where a character may require more than one
+ code unit, you must call pcre2_compile() with the PCRE2_UTF option
+ flag, or the pattern must start with the sequence (*UTF). When either
+ of these is the case, both the pattern and any subject strings that are
+ matched against it are treated as UTF strings instead of strings of
+ individual one-code-unit characters.
+
+ If you build PCRE2 with Unicode support, the library will be bigger,
+ but the additional run time overhead is limited to testing the
+ PCRE2_UTF flag occasionally, so should not be very much.
+
+
+UNICODE PROPERTY SUPPORT
+
+ When PCRE2 is built with Unicode support, the escape sequences \p{..},
+ \P{..}, and \X can be used. The Unicode properties that can be tested
+ are limited to the general category properties such as Lu for an upper
+ case letter or Nd for a decimal number, the Unicode script names such
+ as Arabic or Han, and the derived properties Any and L&. Full lists are
+ given in the pcre2pattern and pcre2syntax documentation. Only the short
+ names for properties are supported. For example, \p{L} matches a let-
+ ter. Its Perl synonym, \p{Letter}, is not supported. Furthermore, in
+ Perl, many properties may optionally be prefixed by "Is", for compati-
+ bility with Perl 5.6. PCRE2 does not support this.
+
+
+WIDE CHARACTERS AND UTF MODES
+
+ Codepoints less than 256 can be specified in patterns by either braced
+ or unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3).
+ Larger values have to use braced sequences. Unbraced octal code points
+ up to \777 are also recognized; larger ones can be coded using \o{...}.
+
+ In UTF modes, repeat quantifiers apply to complete UTF characters, not
+ to individual code units.
+
+ In UTF modes, the dot metacharacter matches one UTF character instead
+ of a single code unit.
+
+ The escape sequence \C can be used to match a single code unit, in a
+ UTF mode, but its use can lead to some strange effects because it
+ breaks up multi-unit characters (see the description of \C in the
+ pcre2pattern documentation). The use of \C is not supported in the
+ alternative matching function pcre2_dfa_match(), nor is it supported in
+ UTF mode by the JIT optimization. If JIT optimization is requested for
+ a UTF pattern that contains \C, it will not succeed, and so the match-
+ ing will be carried out by the normal interpretive function.
+
+ The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
+ characters of any code value, but, by default, the characters that
+ PCRE2 recognizes as digits, spaces, or word characters remain the same
+ set as in non-UTF mode, all with code points less than 256. This
+ remains true even when PCRE2 is built to include Unicode support,
+ because to do otherwise would slow down matching in many common cases.
+ Note that this also applies to \b and \B, because they are defined in
+ terms of \w and \W. If you want to test for a wider sense of, say,
+ "digit", you can use explicit Unicode property tests such as \p{Nd}.
+ Alternatively, if you set the PCRE2_UCP option, the way that the char-
+ acter escapes work is changed so that Unicode properties are used to
+ determine which characters match. There are more details in the section
+ on generic character types in the pcre2pattern documentation.
+
+ Similarly, characters that match the POSIX named character classes are
+ all low-valued characters, unless the PCRE2_UCP option is set.
+
+ However, the special horizontal and vertical white space matching
+ escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
+ acters, whether or not PCRE2_UCP is set.
+
+ Case-insensitive matching in UTF mode makes use of Unicode properties.
+ A few Unicode characters such as Greek sigma have more than two code-
+ points that are case-equivalent, and these are treated as such.
+
+
+VALIDITY OF UTF STRINGS
+
+ When the PCRE2_UTF option is set, the strings passed as patterns and
+ subjects are (by default) checked for validity on entry to the relevant
+ functions. If an invalid UTF string is passed, an error return is
+ given.
+
+ UTF-16 and UTF-32 strings can indicate their endianness by a special code
+ known as a byte-order mark (BOM). The PCRE2 functions do not handle
+ this, expecting strings to be in host byte order.
+
+ The entire string is checked before any other processing takes place.
+ In addition to checking the format of the string, there is a check to
+ ensure that all code points lie in the range U+0 to U+10FFFF, excluding
+ the surrogate area. The so-called "non-character" code points are not
+ excluded because Unicode corrigendum #9 makes it clear that they should
+ not be.
+
+ Characters in the "Surrogate Area" of Unicode are reserved for use by
+ UTF-16, where they are used in pairs to encode code points with values
+ greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
+ are available independently in the UTF-8 and UTF-32 encodings. (In
+ other words, the whole surrogate thing is a fudge for UTF-16 which
+ unfortunately messes up UTF-8 and UTF-32.)
+
+ In some situations, you may already know that your strings are valid,
+ and therefore want to skip these checks in order to improve perfor-
+ mance, for example in the case of a long subject string that is being
+ scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK flag at compile
+ time or at run time, PCRE2 assumes that the pattern or subject it is
+ given (respectively) contains only valid UTF code unit sequences.
+
+ Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
+ for the pattern; it does not also apply to subject strings. If you want
+ to disable the check for a subject string you must pass this option to
+ pcre2_match() or pcre2_dfa_match().
+
+ If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
+ result is undefined and your program may crash or loop indefinitely.
+
+ Errors in UTF-8 strings
+
+ The following negative error codes are given for invalid UTF-8 strings:
+
+ PCRE2_ERROR_UTF8_ERR1
+ PCRE2_ERROR_UTF8_ERR2
+ PCRE2_ERROR_UTF8_ERR3
+ PCRE2_ERROR_UTF8_ERR4
+ PCRE2_ERROR_UTF8_ERR5
+
+ The string ends with a truncated UTF-8 character; the code specifies
+ how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
+ characters to be no longer than 4 bytes, the encoding scheme (origi-
+ nally defined by RFC 2279) allows for up to 6 bytes, and this is
+ checked first; hence the possibility of 4 or 5 missing bytes.
+
+ PCRE2_ERROR_UTF8_ERR6
+ PCRE2_ERROR_UTF8_ERR7
+ PCRE2_ERROR_UTF8_ERR8
+ PCRE2_ERROR_UTF8_ERR9
+ PCRE2_ERROR_UTF8_ERR10
+
+ The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
+ the character do not have the binary value 0b10 (that is, either the
+ most significant bit is 0, or the next bit is 1).
+
+ PCRE2_ERROR_UTF8_ERR11
+ PCRE2_ERROR_UTF8_ERR12
+
+ A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
+ long; these code points are excluded by RFC 3629.
+
+ PCRE2_ERROR_UTF8_ERR13
+
+ A 4-byte character has a value greater than 0x10ffff; these code points
+ are excluded by RFC 3629.
+
+ PCRE2_ERROR_UTF8_ERR14
+
+ A 3-byte character has a value in the range 0xd800 to 0xdfff; code
+ points in this range are reserved by RFC 3629 for use with UTF-16, and
+ so are excluded from UTF-8.
+
+ PCRE2_ERROR_UTF8_ERR15
+ PCRE2_ERROR_UTF8_ERR16
+ PCRE2_ERROR_UTF8_ERR17
+ PCRE2_ERROR_UTF8_ERR18
+ PCRE2_ERROR_UTF8_ERR19
+
+ A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
+ for a value that can be represented by fewer bytes, which is invalid.
+ For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
+ rect coding uses just one byte.
+
+ PCRE2_ERROR_UTF8_ERR20
+
+ The two most significant bits of the first byte of a character have the
+ binary value 0b10 (that is, the most significant bit is 1 and the sec-
+ ond is 0). Such a byte can only validly occur as the second or subse-
+ quent byte of a multi-byte character.
+
+ PCRE2_ERROR_UTF8_ERR21
+
+ The first byte of a character has the value 0xfe or 0xff. These values
+ can never occur in a valid UTF-8 string.
+
+ Errors in UTF-16 strings
+
+ The following negative error codes are given for invalid UTF-16
+ strings:
+
+ PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
+ PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate
+ PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
+
+
+ Errors in UTF-32 strings
+
+ The following negative error codes are given for invalid UTF-32
+ strings:
+
+ PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
+ PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 16 September 2014
+ Copyright (c) 1997-2014 University of Cambridge.
+------------------------------------------------------------------------------
+
+
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index b7b350e..f5528eb 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -214,7 +214,7 @@ document for an overview of all the PCRE2 documentation.
.B int pcre2_pattern_info(const pcre2 *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
.sp
.B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP, PCRE2_SIZE \fIlength\fP);
-.sp
+.fi
.
.
.SH "PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES"
diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3
new file mode 100644
index 0000000..13535b2
--- /dev/null
+++ b/doc/pcre2demo.3
@@ -0,0 +1,441 @@
+.\" Start example: save the current font in register mE, then switch to no-fill, no-hyphenation, CW-font text.
+.de EX
+. nr mE \\n(.f
+. nf
+. nh
+. ft CW
+..
+.
+.
+.\" End example: restore the font saved in register mE, re-enable filling, and reset hyphenation from register HY.
+.de EE
+. ft \\n(mE
+. fi
+. hy \\n(HY
+..
+.
+.EX
+/*************************************************
+* PCRE2 DEMONSTRATION PROGRAM *
+*************************************************/
+
+/* This is a demonstration program to illustrate a straightforward way of
+calling the PCRE2 regular expression library from a C program. See the
+pcre2sample documentation for a short discussion ("man pcre2sample" if you have
+the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
+incompatible with the original PCRE API.
+
+There are actually three libraries, each supporting a different code unit
+width. This demonstration program uses the 8-bit library.
+
+In Unix-like environments, if PCRE2 is installed in your standard system
+libraries, you should be able to compile this program using this command:
+
+gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
+
+If PCRE2 is not installed in a standard place, it is likely to be installed
+with support for the pkg-config mechanism. If you have pkg-config, you can
+compile this program using this command:
+
+gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
+
+If you do not have pkg-config, you may have to use this:
+
+gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
+ -R/usr/local/lib -lpcre2-8 -o pcre2demo
+
+Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
+library files for PCRE2 are installed on your system. Only some operating
+systems (Solaris is one) use the -R option.
+
+Building under Windows:
+
+If you want to statically link this program against a non-dll .a file, you must
+define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
+the following line. */
+
+/* #define PCRE2_STATIC */
+
+/* This macro must be defined before including pcre2.h. For a program that uses
+only one code unit width, it makes it possible to use generic function names
+such as pcre2_compile(). */
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <stdio.h>
+#include <string.h>
+#include <pcre2.h>
+
+
+/**************************************************************************
+* Here is the program. The API includes the concept of "contexts" for *
+* setting up unusual interface requirements for compiling and matching, *
+* such as custom memory managers and non-standard newline definitions. *
+* This program does not do any of this, so it makes no use of contexts, *
+* always passing NULL where a context could be given. *
+**************************************************************************/
+
+int main(int argc, char **argv)
+{
+pcre2_code *re;
+PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
+PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
+PCRE2_SPTR name_table;
+
+int crlf_is_newline;
+int errornumber;
+int find_all;
+int i;
+int namecount;
+int name_entry_size;
+int rc;
+int utf8;
+
+uint32_t option_bits;
+uint32_t newline;
+
+PCRE2_SIZE erroroffset;
+PCRE2_SIZE *ovector;
+
+size_t subject_length;
+pcre2_match_data *match_data;
+
+
+
+/**************************************************************************
+* First, sort out the command line. There is only one possible option at *
+* the moment, "-g" to request repeated matching to find all occurrences, *
+* like Perl's /g option. We set the variable find_all to a non-zero value *
+* if the -g option is present. Apart from that, there must be exactly two *
+* arguments. *
+**************************************************************************/
+
+find_all = 0;
+for (i = 1; i < argc; i++)
+ {
+ if (strcmp(argv[i], "-g") == 0) find_all = 1;
+ else break;
+ }
+
+/* After the options, we require exactly two arguments, which are the pattern,
+and the subject string. */
+
+if (argc - i != 2)
+ {
+ printf("Two arguments required: a regex and a subject string\en");
+ return 1;
+ }
+
+/* As pattern and subject are char arguments, they can be straightforwardly
+cast to PCRE2_SPTR as we are working in 8-bit code units. */
+
+pattern = (PCRE2_SPTR)argv[i];
+subject = (PCRE2_SPTR)argv[i+1];
+subject_length = strlen((char *)subject);
+
+
+/*************************************************************************
+* Now we are going to compile the regular expression pattern, and handle *
+* any errors that are detected. *
+*************************************************************************/
+
+re = pcre2_compile(
+ pattern, /* the pattern */
+ -1, /* indicates pattern is zero-terminated */
+ 0, /* default options */
+ &errornumber, /* for error number */
+ &erroroffset, /* for error offset */
+ NULL); /* use default compile context */
+
+/* Compilation failed: print the error message and exit. */
+
+if (re == NULL)
+ {
+ PCRE2_UCHAR buffer[256];
+ pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
+ printf("PCRE2 compilation failed at offset %d: %s\en", (int)erroroffset,
+ buffer);
+ return 1;
+ }
+
+
+/*************************************************************************
+* If the compilation succeeded, we call PCRE again, in order to do a *
+* pattern match against the subject string. This does just ONE match. If *
+* further matching is needed, it will be done below. Before running the *
+* match we must set up a match_data block for holding the result. *
+*************************************************************************/
+
+/* Using this function ensures that the block is exactly the right size for
+the number of capturing parentheses in the pattern. */
+
+match_data = pcre2_match_data_create_from_pattern(re, NULL);
+
+rc = pcre2_match(
+ re, /* the compiled pattern */
+ subject, /* the subject string */
+ subject_length, /* the length of the subject */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ match_data, /* block for storing the result */
+ NULL); /* use default match context */
+
+/* Matching failed: handle error cases */
+
+if (rc < 0)
+ {
+ switch(rc)
+ {
+ case PCRE2_ERROR_NOMATCH: printf("No match\en"); break;
+ /*
+ Handle other special cases if you like
+ */
+ default: printf("Matching error %d\en", rc); break;
+ }
+ pcre2_match_data_free(match_data); /* Release memory used for the match */
+ pcre2_code_free(re); /* data and the compiled pattern. */
+ return 1;
+ }
+
+/* Match succeeded. Get a pointer to the output vector, where string offsets are
+stored. */
+
+ovector = pcre2_get_ovector_pointer(match_data);
+printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
+
+
+/*************************************************************************
+* We have found the first match within the subject string. If the output *
+* vector wasn't big enough, say so. Then output any substrings that were *
+* captured. *
+*************************************************************************/
+
+/* The output vector wasn't big enough. This should not happen, because we used
+pcre2_match_data_create_from_pattern() above. */
+
+if (rc == 0)
+ printf("ovector was not big enough for all the captured substrings\en");
+
+/* Show substrings stored in the output vector by number. Obviously, in a real
+application you might want to do things other than print them. */
+
+for (i = 0; i < rc; i++)
+ {
+ PCRE2_SPTR substring_start = subject + ovector[2*i];
+ size_t substring_length = ovector[2*i+1] - ovector[2*i];
+ printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
+ }
+
+
+/**************************************************************************
+* That concludes the basic part of this demonstration program. We have *
+* compiled a pattern, and performed a single match. The code that follows *
+* shows first how to access named substrings, and then how to code for *
+* repeated matches on the same subject. *
+**************************************************************************/
+
+/* See if there are any named substrings, and if so, show them by name. First
+we have to extract the count of named parentheses from the pattern. */
+
+(void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
+ &namecount); /* where to put the answer */
+
+if (namecount <= 0) printf("No named substrings\en"); else
+ {
+ PCRE2_SPTR tabptr;
+ printf("Named substrings\en");
+
+ /* Before we can access the substrings, we must extract the table for
+ translating names to numbers, and the size of each entry in the table. */
+
+ (void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMETABLE, /* address of the table */
+ &name_table); /* where to put the answer */
+
+ (void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
+ &name_entry_size); /* where to put the answer */
+
+ /* Now we can scan the table and, for each entry, print the number, the name,
+ and the substring itself. In the 8-bit library the number is held in two
+ bytes, most significant first. */
+
+ tabptr = name_table;
+ for (i = 0; i < namecount; i++)
+ {
+ int n = (tabptr[0] << 8) | tabptr[1];
+ printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
+ (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
+ tabptr += name_entry_size;
+ }
+ }
+
+
+/*************************************************************************
+* If the "-g" option was given on the command line, we want to continue *
+* to search for additional matches in the subject string, in a similar *
+* way to the /g option in Perl. This turns out to be trickier than you *
+* might think because of the possibility of matching an empty string. *
+* What happens is as follows: *
+* *
+* If the previous match was NOT for an empty string, we can just start *
+* the next match at the end of the previous one. *
+* *
+* If the previous match WAS for an empty string, we can't do that, as it *
+* would lead to an infinite loop. Instead, a call of pcre2_match() is *
+* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
+* first of these tells PCRE2 that an empty string at the start of the *
+* subject is not a valid match; other possibilities must be tried. The *
+* second flag restricts PCRE2 to one match attempt at the initial string *
+* position. If this match succeeds, an alternative to the empty string *
+* match has been found, and we can print it and proceed round the loop, *
+* advancing by the length of whatever was found. If this match does not *
+* succeed, we still stay in the loop, advancing by just one character. *
+* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
+* more than one byte. *
+* *
+* However, there is a complication concerned with newlines. When the *
+* newline convention is such that CRLF is a valid newline, we must *
+* advance by two characters rather than one. The newline convention can *
+* be set in the regex by (*CR), etc.; if not, we must find the default. *
+*************************************************************************/
+
+if (!find_all) /* Check for -g */
+ {
+ pcre2_match_data_free(match_data); /* Release the memory that was used */
+ pcre2_code_free(re); /* for the match data and the pattern. */
+ return 0; /* Exit the program. */
+ }
+
+/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
+sequence. First, find the options with which the regex was compiled and extract
+the UTF state. */
+
+(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits);
+utf8 = (option_bits & PCRE2_UTF) != 0;
+
+/* Now find the newline convention and see whether CRLF is a valid newline
+sequence. */
+
+(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline);
+crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
+ newline == PCRE2_NEWLINE_CRLF ||
+ newline == PCRE2_NEWLINE_ANYCRLF;
+
+/* Loop for second and subsequent matches */
+
+for (;;)
+ {
+ uint32_t options = 0; /* Normally no options */
+ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
+
+ /* If the previous match was for an empty string, we are finished if we are
+ at the end of the subject. Otherwise, arrange to run another match at the
+ same point to see if a non-empty match can be found. */
+
+ if (ovector[0] == ovector[1])
+ {
+ if (ovector[0] == subject_length) break;
+ options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
+ }
+
+ /* Run the next matching operation */
+
+ rc = pcre2_match(
+ re, /* the compiled pattern */
+ subject, /* the subject string */
+ subject_length, /* the length of the subject */
+ start_offset, /* starting offset in the subject */
+ options, /* options */
+ match_data, /* block for storing the result */
+ NULL); /* use default match context */
+
+ /* This time, a result of NOMATCH isn't an error. If the value in "options"
+ is zero, it just means we have found all possible matches, so the loop ends.
+ Otherwise, it means we have failed to find a non-empty-string match at a
+ point where there was a previous empty-string match. In this case, we do what
+ Perl does: advance the matching position by one character, and continue. We
+ do this by setting the "end of previous match" offset, because that is picked
+ up at the top of the loop as the point at which to start again.
+
+ There are two complications: (a) When CRLF is a valid newline sequence, and
+ the current position is just before it, advance by an extra byte. (b)
+ Otherwise we must ensure that we skip an entire UTF character if we are in
+ UTF mode. */
+
+ if (rc == PCRE2_ERROR_NOMATCH)
+ {
+ if (options == 0) break; /* All matches found */
+ ovector[1] = start_offset + 1; /* Advance one code unit */
+ if (crlf_is_newline && /* If CRLF is newline & */
+ start_offset < subject_length - 1 && /* we are at CRLF, */
+ subject[start_offset] == '\er' &&
+ subject[start_offset + 1] == '\en')
+ ovector[1] += 1; /* Advance by one more. */
+ else if (utf8) /* Otherwise, ensure we */
+ { /* advance a whole UTF-8 */
+ while (ovector[1] < subject_length) /* character. */
+ {
+ if ((subject[ovector[1]] & 0xc0) != 0x80) break;
+ ovector[1] += 1;
+ }
+ }
+ continue; /* Go round the loop again */
+ }
+
+ /* Other matching errors are not recoverable. */
+
+ if (rc < 0)
+ {
+ printf("Matching error %d\en", rc);
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
+ /* Match succeeded */
+
+ printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);
+
+ /* The match succeeded, but the output vector wasn't big enough. This
+ should not happen. */
+
+ if (rc == 0)
+ printf("ovector was not big enough for all the captured substrings\en");
+
+ /* As before, show substrings stored in the output vector by number, and then
+ also any named substrings. */
+
+ for (i = 0; i < rc; i++)
+ {
+ PCRE2_SPTR substring_start = subject + ovector[2*i];
+ size_t substring_length = ovector[2*i+1] - ovector[2*i];
+ printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
+ }
+
+ if (namecount <= 0) printf("No named substrings\en"); else
+ {
+ PCRE2_SPTR tabptr = name_table;
+ printf("Named substrings\en");
+ for (i = 0; i < namecount; i++)
+ {
+ int n = (tabptr[0] << 8) | tabptr[1];
+ printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
+ (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
+ tabptr += name_entry_size;
+ }
+ }
+ } /* End of loop to find second and subsequent matches */
+
+printf("\en");
+pcre2_match_data_free(match_data);
+pcre2_code_free(re);
+return 0;
+}
+
+/* End of pcre2demo.c */
+.EE
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
index 1da6dfa..71be47d 100644
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@@ -154,7 +154,7 @@ Do not output the version number of \fBpcre2test\fP at the start of execution.
\fB-S\fP \fIsize\fP
On Unix-like systems, set the size of the run-time stack to \fIsize\fP
megabytes.
-.TP10
+.TP 10
\fB-subject\fP \fImodifier-list\fP
Behave as if each subject line contains the given modifiers.
.TP 10
@@ -366,7 +366,7 @@ include a closing square bracket in the characters, code it as \ex5D.
A backslash followed by an equals sign marke the end of the subject string and
the start of a modifier list. For example:
.sp
- abc\=notbol,notempty
+ abc\e=notbol,notempty
.sp
A backslash followed by any other non-alphanumeric character just escapes that
character. A backslash followed by anything else causes an error. However, if
@@ -746,7 +746,7 @@ the actual match are indicated in the output by '<' or '>' characters
underneath them. Here is an example:
.sp
/(?<=pqr)abc(?=xyz)/
- 123pqrabcxyz456\=allusedtext
+ 123pqrabcxyz456\e=allusedtext
0: pqrabcxyz
<<< >>>
.sp
@@ -789,7 +789,7 @@ The \fBcopy\fP and \fBget\fP modifiers can be used to test the
They can be given more than once, and each can specify a group name or number,
for example:
.sp
- abcd\=copy=1,copy=3,get=G1
+ abcd\e=copy=1,copy=3,get=G1
.sp
If the \fB#subject\fP command is used to set default copy and get lists, these
can be unset by specifying a negative number for numbered groups and an empty
diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt
new file mode 100644
index 0000000..34c1a14
--- /dev/null
+++ b/doc/pcre2test.txt
@@ -0,0 +1,1073 @@
+PCRE2TEST(1) General Commands Manual PCRE2TEST(1)
+
+
+
+NAME
+ pcre2test - a program for testing Perl-compatible regular expressions.
+
+SYNOPSIS
+
+ pcre2test [options] [input file [output file]]
+
+ pcre2test is a test program for the PCRE2 regular expression libraries,
+ but it can also be used for experimenting with regular expressions.
+ This document describes the features of the test program; for details
+ of the regular expressions themselves, see the pcre2pattern documenta-
+ tion. For details of the PCRE2 library function calls and their
+ options, see the pcre2api documentation.
+
+ The input for pcre2test is a sequence of regular expression patterns
+ and subject strings to be matched. The output shows the result of each
+ match attempt. Modifiers on the command line, the patterns, and the
+ subject lines specify PCRE2 function options, control how the subject
+ is processed, and what output is produced.
+
+ As the original fairly simple PCRE library evolved, it acquired many
+ different features, and as a result, the original pcretest program
+ ended up with a lot of options in a messy, arcane syntax, for testing
+ all the features. The move to the new PCRE2 API provided an opportunity
+ to re-implement the test program as pcre2test, with a cleaner modifier
+ syntax. Nevertheless, there are still many obscure modifiers, some of
+ which are specifically designed for use in conjunction with the test
+ script and data files that are distributed as part of PCRE2. All the
+ modifiers are documented here, some without much justification, but
+ many of them are unlikely to be of use except when testing the
+ libraries.
+
+
+PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES
+
+ Different versions of the PCRE2 library can be built to support charac-
+ ter strings that are encoded in 8-bit, 16-bit, or 32-bit code units.
+ One, two, or all three of these libraries may be simultaneously
+ installed. The pcre2test program can be used to test all the libraries.
+ However, its own input and output are always in 8-bit format. When
+ testing the 16-bit or 32-bit libraries, patterns and subject strings
+ are converted to 16- or 32-bit format before being passed to the
+ library functions. Results are converted back to 8-bit code units for
+ output.
+
+ In the rest of this document, the names of library functions and struc-
+ tures are given in generic form, for example, pcre2_compile(). The
+ actual names used in the libraries have a suffix _8, _16, or _32, as
+ appropriate.
+
+
+INPUT ENCODING
+
+ Input to pcre2test is processed line by line, either by calling the C
+ library's fgets() function, or via the libreadline library (see below).
+ In Unix-like environments, fgets() treats any bytes other than newline
+ as data characters. However, in some Windows environments character 26
+ (hex 1A) causes an immediate end of file, and no further data is read.
+ For maximum portability, therefore, it is safest to avoid non-printing
+ characters in pcre2test input files.
+
+
+COMMAND LINE OPTIONS
+
+ -8 If the 8-bit library has been built, this option causes it to
+ be used (this is the default). If the 8-bit library has not
+ been built, this option causes an error.
+
+ -16 If the 16-bit library has been built, this option causes it
+ to be used. If only the 16-bit library has been built, this
+ is the default. If the 16-bit library has not been built,
+ this option causes an error.
+
+ -32 If the 32-bit library has been built, this option causes it
+ to be used. If only the 32-bit library has been built, this
+ is the default. If the 32-bit library has not been built,
+ this option causes an error.
+
+ -b Behave as if each pattern has the /fullbincode modifier; the
+ full internal binary form of the pattern is output after com-
+ pilation.
+
+ -C Output the version number of the PCRE2 library, and all
+ available information about the optional features that are
+ included, and then exit with zero exit code. All other
+ options are ignored.
+
+ -C option Output information about a specific build-time option, then
+ exit. This functionality is intended for use in scripts such
+ as RunTest. The following options output the value and set
+ the exit code as indicated:
+
+ ebcdic-nl the code for LF (= NL) in an EBCDIC environment:
+ 0x15 or 0x25
+ 0 if used in an ASCII environment
+ exit code is always 0
+ linksize the configured internal link size (2, 3, or 4)
+ exit code is set to the link size
+ newline the default newline setting:
+ CR, LF, CRLF, ANYCRLF, or ANY
+ exit code is always 0
+ bsr the default setting for what \R matches:
+ ANYCRLF or ANY
+ exit code is always 0
+
+ The following options output 1 for true or 0 for false, and
+ set the exit code to the same value:
+
+ ebcdic compiled for an EBCDIC environment
+ jit just-in-time support is available
+ pcre16 the 16-bit library was built
+ pcre32 the 32-bit library was built
+ pcre8 the 8-bit library was built
+ unicode Unicode support is available
+
+ If an unknown option is given, an error message is output;
+ the exit code is 0.
+
+ -d Behave as if each pattern has the debug modifier; the inter-
+ nal form and information about the compiled pattern is output
+ after compilation; -d is equivalent to -b -i.
+
+ -dfa Behave as if each subject line has the dfa modifier; matching
+ is done using the pcre2_dfa_match() function instead of the
+ default pcre2_match().
+
+ -help Output a brief summary of these options and then exit.
+
+ -i Behave as if each pattern has the /info modifier; information
+ about the compiled pattern is given after compilation.
+
+ -jit Behave as if each pattern line has the jit modifier; after
+ successful compilation, each pattern is passed to the just-
+ in-time compiler, if available.
+
+ -pattern modifier-list
+ Behave as if each pattern line contains the given modifiers.
+
+ -q Do not output the version number of pcre2test at the start of
+ execution.
+
+ -S size On Unix-like systems, set the size of the run-time stack to
+ size megabytes.
+
+ -subject modifier-list
+ Behave as if each subject line contains the given modifiers.
+
+ -t Run each compile and match many times with a timer, and out-
+ put the resulting times per compile or match. You can control
+ the number of iterations that are used for timing by follow-
+ ing -t with a number (as a separate item on the command
+ line). For example, "-t 1000" iterates 1000 times. The
+ default is to iterate 500,000 times.
+
+ -tm This is like -t except that it times only the matching phase,
+ not the compile phase.
+
+ -T -TM These behave like -t and -tm, but in addition, at the end of
+ a run, the total times for all compiles and matches are out-
+ put.
+
+ -version Output the PCRE2 version number and then exit.
+
+
+DESCRIPTION
+
+ If pcre2test is given two filename arguments, it reads from the first
+ and writes to the second. If it is given only one filename argument, it
+ reads from that file and writes to stdout. Otherwise, it reads from
+ stdin and writes to stdout, and prompts for each line of input, using
+ "re>" to prompt for regular expression patterns, and "data>" to prompt
+ for subject lines.
+
+ When pcre2test is built, a configuration option can specify that it
+ should be linked with the libreadline or libedit library. When this is
+ done, if the input is from a terminal, it is read using the readline()
+ function. This provides line-editing and history facilities. The output
+ from the -help option states whether or not readline() will be used.
+
+ The program handles any number of tests, each of which consists of a
+ set of input lines. Each set starts with a regular expression pattern,
+ followed by any number of subject lines to be matched against that pat-
+ tern. In between sets of test data, command lines that begin with a
+ hash (#) character may appear. This file format, with some restric-
+ tions, can also be processed by the perltest.pl script that is distrib-
+ uted with PCRE2 as a means of checking that the behaviour of PCRE2 and
+ Perl is the same.
+
+ Each subject line is matched separately and independently. If you want
+ to do multi-line matches, you have to use the \n escape sequence (or \r
+ or \r\n, etc., depending on the newline setting) in a single line of
+ input to encode the newline sequences. There is no limit on the length
+ of subject lines; the input buffer is automatically extended if it is
+ too small. There is a replication feature that makes it possible to
+ generate long subject lines without having to supply them explicitly.
+
+ An empty line or the end of the file signals the end of the subject
+ lines for a test, at which point a new pattern or command line is
+ expected if there is still input to be read.
+
+
+COMMAND LINES
+
+ In between sets of test data, a line that begins with a hash (#) char-
+ acter is interpreted as a command line. If the first character is fol-
+ lowed by white space or an exclamation mark, the line is treated as a
+ comment, and ignored. Otherwise, the following commands are recog-
+ nized:
+
+ #forbid_utf
+
+ Subsequent patterns automatically have the PCRE2_NEVER_UTF and
+ PCRE2_NEVER_UCP options set, which locks out the use of UTF and Unicode
+ property features. This is a trigger guard that is used in test files
+ to ensure that UTF/Unicode tests are not accidentally added to files
+ that are used when UTF support is not included in the library. This
+ effect can also be obtained by the use of #pattern; the difference is
+ that #forbid_utf cannot be unset, and the automatic options are not
+ displayed in pattern information, to avoid cluttering up test output.
+
+ #pattern <modifier-list>
+
+ This command sets a default modifier list that applies to all subse-
+ quent patterns. Modifiers on a pattern can change these settings.
+
+ #perltest
+
+ The appearance of this line causes all subsequent modifier settings to
+ be checked for compatibility with the perltest.pl script, which is used
+ to confirm that Perl gives the same results as PCRE2. Also, apart from
+ comment lines, none of the other command lines are permitted, because
+ they and many of the modifiers are specific to pcre2test, and should
+ not be used in test files that are also processed by perltest.pl. The
+ #perltest command helps detect tests that are accidentally put in the
+ wrong file.
+
+ #subject <modifier-list>
+
+ This command sets a default modifier list that applies to all subse-
+ quent subject lines. Modifiers on a subject line can change these set-
+ tings.
+
+
+MODIFIER SYNTAX
+
+ Modifier lists are used with both pattern and subject lines. Items in a
+ list are separated by commas and optional white space. Some modifiers
+ may be given for both patterns and subject lines, whereas others are
+ valid for one or the other only. Each modifier has a long name, for
+ example "anchored", and some of them must be followed by an equals sign
+ and a value, for example, "offset=12". Modifiers that do not take val-
+ ues may be preceded by a minus sign to turn off a previous default set-
+ ting.
+
+ A few of the more common modifiers can also be specified as single let-
+ ters, for example "i" for "caseless". In documentation, following the
+ Perl convention, these are written with a slash ("the /i modifier") for
+ clarity. Abbreviated modifiers must all be concatenated in the first
+ item of a modifier list. If the first item is not recognized as a long
+ modifier name, it is interpreted as a sequence of these abbreviations.
+ For example:
+
+ /abc/ig,newline=cr,jit=3
+
+ This is a pattern line whose modifier list starts with two one-letter
+ modifiers (/i and /g). The lower-case abbreviated modifiers are the
+ same as used in Perl.
+
+
+PATTERN SYNTAX
+
+ A pattern line must start with one of the following characters (common
+ symbols, excluding pattern meta-characters):
+
+ / ! " ' ` - = _ : ; , % & @ ~
+
+ This is interpreted as the pattern's delimiter. A regular expression
+ may be continued over several input lines, in which case the newline
+ characters are included within it. It is possible to include the delim-
+ iter within the pattern by escaping it with a backslash, for example
+
+ /abc\/def/
+
+ If you do this, the escape and the delimiter form part of the pattern,
+ but since the delimiters are all non-alphanumeric, this does not affect
+ its interpretation. If the terminating delimiter is immediately fol-
+ lowed by a backslash, for example,
+
+ /abc/\
+
+ then a backslash is added to the end of the pattern. This is done to
+ provide a way of testing the error condition that arises if a pattern
+ finishes with a backslash, because
+
+ /abc\/
+
+ is interpreted as the first line of a pattern that starts with "abc/",
+ causing pcre2test to read the next line as a continuation of the regu-
+ lar expression.
+
+ A pattern can be followed by a modifier list (details below).
+
+
+SUBJECT LINE SYNTAX
+
+ Before each subject line is passed to pcre2_match() or
+ pcre2_dfa_match(), leading and trailing white space is removed, and the
+ line is scanned for backslash escapes. The following provide a means of
+ encoding non-printing characters in a visible way:
+
+ \a alarm (BEL, \x07)
+ \b backspace (\x08)
+ \e escape (\x1b)
+ \f form feed (\x0c)
+ \n newline (\x0a)
+ \r carriage return (\x0d)
+ \t tab (\x09)
+ \v vertical tab (\x0b)
+ \nnn octal character (up to 3 octal digits); always
+ a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode
+ \o{dd...} octal character (any number of octal digits)
+ \xhh hexadecimal byte (up to 2 hex digits)
+ \x{hh...} hexadecimal character (any number of hex digits)
+
+ The use of \x{hh...} is not dependent on the use of the utf modifier on
+ the pattern. It is recognized always. There may be any number of hexa-
+ decimal digits inside the braces; invalid values provoke error mes-
+ sages.
+
+ Note that \xhh specifies one byte rather than one character in UTF-8
+ mode; this makes it possible to construct invalid UTF-8 sequences for
+ testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
+ character in UTF-8 mode, generating more than one byte if the value is
+ greater than 127. When testing the 8-bit library not in UTF-8 mode,
+ \x{hh} generates one byte for values less than 256, and causes an error
+ for greater values.
+
+ In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
+ possible to construct invalid UTF-16 sequences for testing purposes.
+
+ In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
+ makes it possible to construct invalid UTF-32 sequences for testing
+ purposes.
+
+ There is a special backslash sequence that specifies replication of one
+ or more characters:
+
+ \[<characters>]{<count>}
+
+ This makes it possible to test long strings without having to provide
+ them as part of the file. For example:
+
+ \[abc]{4}
+
+ is converted to "abcabcabcabc". This feature does not support nesting.
+ To include a closing square bracket in the characters, code it as \x5D.
+
+ A backslash followed by an equals sign marks the end of the subject
+ string and the start of a modifier list. For example:
+
+ abc\=notbol,notempty
+
+ A backslash followed by any other non-alphanumeric character just
+ escapes that character. A backslash followed by anything else causes an
+ error. However, if the very last character in the line is a backslash
+ (and there is no modifier list), it is ignored. This gives a way of
+ passing an empty line as data, since a real empty line terminates the
+ data input.
+
+
+PATTERN MODIFIERS
+
+ There are three types of modifier that can appear in pattern lines, two
+ of which may also be used in a #pattern command. A pattern's modifier
+ list can add to or override default modifiers that were set by a previ-
+ ous #pattern command.
+
+ Setting compilation options
+
+ The following modifiers set options for pcre2_compile(). The most com-
+ mon ones have single-letter abbreviations. See pcre2api for a descrip-
+ tion of their effects.
+
+ allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
+ alt_bsux set PCRE2_ALT_BSUX
+ anchored set PCRE2_ANCHORED
+ auto_callout set PCRE2_AUTO_CALLOUT
+ /i caseless set PCRE2_CASELESS
+ dollar_endonly set PCRE2_DOLLAR_ENDONLY
+ /s dotall set PCRE2_DOTALL
+ dupnames set PCRE2_DUPNAMES
+ /x extended set PCRE2_EXTENDED
+ firstline set PCRE2_FIRSTLINE
+ match_unset_backref set PCRE2_MATCH_UNSET_BACKREF
+ /m multiline set PCRE2_MULTILINE
+ never_ucp set PCRE2_NEVER_UCP
+ never_utf set PCRE2_NEVER_UTF
+ no_auto_capture set PCRE2_NO_AUTO_CAPTURE
+ no_auto_possess set PCRE2_NO_AUTO_POSSESS
+ no_start_optimize set PCRE2_NO_START_OPTIMIZE
+ no_utf_check set PCRE2_NO_UTF_CHECK
+ ucp set PCRE2_UCP
+ ungreedy set PCRE2_UNGREEDY
+ utf set PCRE2_UTF
+
+ As well as turning on the PCRE2_UTF option, the utf modifier causes all
+ non-printing characters in output strings to be printed using the
+ \x{hh...} notation. Otherwise, those less than 0x100 are output in hex
+ without the curly brackets.
+
+ Setting compilation controls
+
+ The following modifiers affect the compilation process or request
+ information about the pattern:
+
+ bsr=[anycrlf|unicode] specify \R handling
+ /B bincode show binary code without lengths
+ debug same as info,fullbincode
+ fullbincode show binary code with lengths
+ /I info show info about compiled pattern
+ hex pattern is coded in hexadecimal
+ jit[=<number>] use JIT
+ locale=<name> use this locale
+ memory show memory used
+ newline=<type> set newline type
+ parens_nest_limit=<n> set maximum parentheses depth
+ perlcompat lock out non-Perl modifiers
+ posix use the POSIX API
+ stackguard=<number> test the stackguard feature
+ tables=[0|1|2] select internal tables
+ use_length use the pattern's length
+
+ The effects of these modifiers are described in the following sections.
+ FIXME: Give more examples.
+
+ Newline and \R handling
+
+ The bsr modifier specifies what \R in a pattern should match. If it is
+ set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
+ "unicode", \R matches any Unicode newline sequence. The default is
+ specified when PCRE2 is built, with the default default being Unicode.
+
+ The newline modifier specifies which characters are to be interpreted
+ as newlines, both in the pattern and (by default) in subject lines. The
+ type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
+
+ Both the \R and newline settings can be changed at match time, but if
+ this is done, JIT matching is disabled.
+
+ Information about a pattern
+
+ The debug modifier is a shorthand for info,fullbincode, requesting all
+ available information.
+
+ The bincode modifier causes a representation of the compiled code to be
+ output after compilation. This information does not contain length and
+ offset values, which ensures that the same output is generated for dif-
+ ferent internal link sizes and different code unit widths. By using
+ bincode, the same regression tests can be used in different environ-
+ ments.
+
+ The fullbincode modifier, by contrast, does include length and offset
+ values. This is used in a few special tests and is also useful for one-
+ off tests.
+
+ The info modifier requests information about the compiled pattern
+ (whether it is anchored, has a fixed first character, and so on). The
+ information is obtained from the pcre2_pattern_info() function.
+
+ Specifying a pattern in hex
+
+ The hex modifier specifies that the characters of the pattern are to be
+ interpreted as pairs of hexadecimal digits. White space is permitted
+ between pairs. For example:
+
+ /ab 32 59/hex
+
+ This feature is provided as a way of creating patterns that contain
+ binary zero characters. When hex is set, it implies use_length.
+
+ Using the pattern's length
+
+ By default, pcre2test passes patterns as zero-terminated strings to
+ pcre2_compile(), giving the length as -1. If use_length is set, the
+ length of the pattern is passed. This is implied if hex is set.
+
+ JIT compilation
+
+ The /jit modifier may optionally be followed by a number in the range 0
+ to 7:
+
+ 0 disable JIT
+ 1 normal match only
+ 2 soft partial match only
+ 3 normal match and soft partial match
+ 4 hard partial match only
+ 6 soft and hard partial match
+ 7 all three modes
+
+ If no number is given, 7 is assumed. If JIT compilation is successful,
+ the compiled JIT code will automatically be used when pcre2_match() is
+ run, except when incompatible run-time options are specified. For more
+ details, see the pcre2jit documentation. See also the jitstack modifier
+ below for a way of setting the size of the JIT stack.
+
+ If the jitverify modifier is specified, the text "(JIT)" is added to
+ the first output line after a match or non match when JIT-compiled code
+ was actually used. This modifier can also be set on a subject line.
+
+ Setting a locale
+
+ The /locale modifier must specify the name of a locale, for example:
+
+ /pattern/locale=fr_FR
+
+ The given locale is set, pcre2_maketables() is called to build a set of
+ character tables for the locale, and this is then passed to pcre2_com-
+ pile() when compiling the regular expression. The same tables are used
+ when matching the following subject lines. The /locale modifier applies
+ only to the pattern on which it appears, but can be given in a #pattern
+ command if a default is needed. Setting a locale and alternate charac-
+ ter tables are mutually exclusive.
+
+ Showing pattern memory
+
+ The /memory modifier causes the size in bytes of the memory block used
+ to hold the compiled pattern to be output. This does not include the
+ size of the pcre2_code block; it is just the actual compiled data. If
+ the pattern is subsequently passed to the JIT compiler, the size of the
+ JIT compiled code is also output.
+
+ Limiting nested parentheses
+
+ The parens_nest_limit modifier sets a limit on the depth of nested
+ parentheses in a pattern. Breaching the limit causes a compilation
+ error.
+
+ Using the POSIX wrapper API
+
+ The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
+ per API rather than its native API. This supports only the 8-bit
+ library. When the POSIX API is being used, the following pattern modi-
+ fiers set options for the regcomp() function:
+
+ caseless REG_ICASE
+ multiline REG_NEWLINE
+ no_auto_capture REG_NOSUB
+ dotall REG_DOTALL )
+ ungreedy REG_UNGREEDY ) These options are not part of
+ ucp REG_UCP ) the POSIX standard
+ utf REG_UTF8 )
+
+ The aftertext and allaftertext subject modifiers work as described
+ below. All other modifiers cause an error.
+
+ Testing the stack guard feature
+
+ The /stackguard modifier is used to test the use of pcre2_set_com-
+ pile_recursion_guard(), a function that is provided to enable stack
+ availability to be checked during compilation (see the pcre2api docu-
+ mentation for details). If the number specified by the modifier is
+ greater than zero, pcre2_set_compile_recursion_guard() is called to set
+ up a callback from pcre2_compile() to a local function. The argument it
+ is passed is the current nesting parenthesis depth; if this is greater
+ than the value given by the modifier, non-zero is returned, causing the
+ compilation to be aborted.
+
+ Using alternative character tables
+
+ The /tables modifier must be followed by a single digit. It causes a
+ specific set of built-in character tables to be passed to pcre2_com-
+ pile(). This is used in the PCRE2 tests to check behaviour with differ-
+ ent character tables. The digit specifies the tables as follows:
+
+ 0 do not pass any special character tables
+ 1 the default ASCII tables, as distributed in
+ pcre2_chartables.c.dist
+ 2 a set of tables defining ISO 8859 characters
+
+ In table 2, some characters whose codes are greater than 128 are iden-
+ tified as letters, digits, spaces, etc. Setting alternate character
+ tables and a locale are mutually exclusive.
+
+ Setting certain match controls
+
+ The following modifiers are really subject modifiers, and are described
+ below. However, they may be included in a pattern's modifier list, in
+ which case they are applied to every subject line that is processed
+ with that pattern. They do not affect the compilation process.
+
+ aftertext show text after match
+ allaftertext show text after captures
+ allcaptures show all captures
+ allusedtext show all consulted text
+ /g global global matching
+ jitverify verify JIT usage
+ mark show mark values
+
+ These modifiers may not appear in a #pattern command. If you want them
+ as defaults, set them in a #subject command.
+
+
+SUBJECT MODIFIERS
+
+ The modifiers that can appear in subject lines and the #subject command
+ are of two types.
+
+ Setting match options
+
+ The following modifiers set options for pcre2_match() or
+ pcre2_dfa_match(). See pcre2api for a description of their effects.
+
+ anchored set PCRE2_ANCHORED
+ dfa_restart set PCRE2_DFA_RESTART
+ dfa_shortest set PCRE2_DFA_SHORTEST
+ no_start_optimize set PCRE2_NO_START_OPTIMIZE
+ no_utf_check set PCRE2_NO_UTF_CHECK
+ notbol set PCRE2_NOTBOL
+ notempty set PCRE2_NOTEMPTY
+ notempty_atstart set PCRE2_NOTEMPTY_ATSTART
+ noteol set PCRE2_NOTEOL
+ partial_hard (or ph) set PCRE2_PARTIAL_HARD
+ partial_soft (or ps) set PCRE2_PARTIAL_SOFT
+
+ The partial matching modifiers are provided with abbreviations because
+ they appear frequently in tests.
+
+ If the /posix modifier was present on the pattern, causing the POSIX
+ wrapper API to be used, the only option-setting modifiers that have any
+ effect are notbol, notempty, and noteol, causing REG_NOTBOL,
+ REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
+ Any other modifiers cause an error.
+
+ Setting match controls
+
+ The following modifiers affect the matching process or request addi-
+ tional information. Some of them may also be specified on a pattern
+ line (see above), in which case they apply to every subject line that
+ is matched against that pattern.
+
+ aftertext show text after match
+ allaftertext show text after captures
+ allcaptures show all captures
+ allusedtext show all consulted text
+ altglobal alternative global matching
+ bsr=[anycrlf|unicode] specify \R handling
+ callout_capture show captures at callout time
+ callout_data=<n> set a value to pass via callouts
+ callout_fail=<n>[:<m>] control callout failure
+ callout_none do not supply a callout function
+ copy=<number or name> copy captured substring
+ dfa use pcre2_dfa_match()
+ find_limits find match and recursion limits
+ get=<number or name> extract captured substring
+ getall extract all captured substrings
+ /g global global matching
+ jitstack=<n> set size of JIT stack
+ jitverify verify JIT usage
+ mark show mark values
+ match_limit=<n> set a match limit
+ memory show memory usage
+ newline=<type> set newline type
+ offset=<n> set starting offset
+ ovector=<n> set size of output vector
+ recursion_limit=<n> set a recursion limit
+
+ The effects of these modifiers are described in the following sections.
+ FIXME: Give more examples.
+
+ Newline and \R handling
+
+ These modifiers set the newline and \R processing conventions for the
+ subject line, overriding any values that were set at compile time (as
+ described above). JIT matching is disabled if these settings are
+ changed at match time.
+
+ Showing more text
+
+ The aftertext modifier requests that as well as outputting the sub-
+ string that matched the entire pattern, pcre2test should in addition
+ output the remainder of the subject string. This is useful for tests
+ where the subject contains multiple copies of the same substring. The
+ allaftertext modifier requests the same action for captured substrings
+ as well as the main matched substring. In each case the remainder is
+ output on the following line with a plus character following the cap-
+ ture number.
+
+ The allusedtext modifier requests that all the text that was consulted
+ during a successful pattern match be shown. This affects the output if
+ there is a lookbehind at the start of a match, or a lookahead at the
+ end, or if \K is used in the pattern. Characters that precede or follow
+ the start and end of the actual match are indicated in the output by
+ '<' or '>' characters underneath them. Here is an example:
+
+ /(?<=pqr)abc(?=xyz)/
+ 123pqrabcxyz456\=allusedtext
+ 0: pqrabcxyz
+ <<< >>>
+
+ This shows that the matched string is "abc", with the preceding and
+ following strings "pqr" and "xyz" also consulted during the match.
+
+ Showing the value of all capture groups
+
+ The allcaptures modifier requests that the values of all potential cap-
+ tured parentheses be output after a match. By default, only those up to
+ the highest one actually used in the match are output (corresponding to
+ the return code from pcre2_match()). Groups that did not take part in
+ the match are output as "<unset>".
+
+ Testing callouts
+
+ A callout function is supplied when pcre2test calls the library match-
+ ing functions, unless callout_none is specified. If callout_capture is
+ set, the current captured groups are output when a callout occurs.
+
+ The callout_fail modifier can be given one or two numbers. If there is
+ only one number, 1 is returned instead of 0 when a callout of that num-
+ ber is reached. If two numbers are given, 1 is returned when callout
+ <n> is reached for the <m>th time.
+
+ The callout_data modifier can be given an unsigned or a negative num-
+ ber. Any value other than zero is used as a return from pcre2test's
+ callout function.
+
+ Testing substring extraction functions
+
+ The copy and get modifiers can be used to test the pcre2_sub-
+ string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
+ given more than once, and each can specify a group name or number, for
+ example:
+
+ abcd\=copy=1,copy=3,get=G1
+
+ If the #subject command is used to set default copy and get lists,
+ these can be unset by specifying a negative number for numbered groups
+ and an empty name for named groups.
+
+ The getall modifier tests pcre2_substring_list_get(), which extracts
+ all captured substrings.
+
+ If the subject line is successfully matched, the substrings extracted
+ by the convenience functions are output with C, G, or L after the
+ string number instead of a colon. This is in addition to the normal
+ full list. The string length (that is, the return from the extraction
+ function) is given in parentheses after each substring.
+
+ Finding all matches in a string
+
+ Searching for all possible matches within a subject can be requested by
+ the global or /altglobal modifier. After finding a match, the matching
+ function is called again to search the remainder of the subject. The
+ difference between global and altglobal is that the former uses the
+ start_offset argument to pcre2_match() or pcre2_dfa_match() to start
+ searching at a new point within the entire string (which is what Perl
+ does), whereas the latter passes over a shortened substring. This makes
+ a difference to the matching process if the pattern begins with a look-
+ behind assertion (including \b or \B).
+
+ If an empty string is matched, the next match is done with the
+ PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
+ for another, non-empty, match at the same point in the subject. If this
+ match fails, the start offset is advanced, and the normal match is
+ retried. This imitates the way Perl handles such cases when using the
+ /g modifier or the split() function. Normally, the start offset is
+ advanced by one character, but if the newline convention recognizes
+ CRLF as a newline, and the current character is CR followed by LF, an
+ advance of two is used.
+
+ Setting the JIT stack size
+
+ The jitstack modifier provides a way of setting the maximum stack size
+ that is used by the just-in-time optimization code. It is ignored if
+ JIT optimization is not being used. Providing a stack that is larger
+ than the default 32K is necessary only for very complicated patterns.
+
+ Setting match and recursion limits
+
+ The match_limit and recursion_limit modifiers set the appropriate lim-
+ its in the match context. These values are ignored when the find_limits
+ modifier is specified.
+
+ Finding minimum limits
+
+ If the find_limits modifier is present, pcre2test calls pcre2_match()
+ several times, setting different values in the match context via
+ pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
+ the minimum values for each parameter that allow pcre2_match() to com-
+ plete without error.
+
+ The match_limit number is a measure of the amount of backtracking that
+ takes place, and learning the minimum value can be instructive. For
+ most simple matches, the number is quite small, but for patterns with
+ very large numbers of matching possibilities, it can become large very
+ quickly with increasing length of subject string. The
+ recursion_limit number is a measure of how much stack (or, if
+ PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
+ complete the match attempt.
+
+ Showing MARK names
+
+
+ The mark modifier causes the names from backtracking control verbs that
+ are returned from calls to pcre2_match() to be displayed. If a mark is
+ returned for a match, non-match, or partial match, pcre2test shows it.
+ For a match, it is on a line by itself, tagged with "MK:". Otherwise,
+ it is added to the non-match message.
+
+ Showing memory usage
+
+ The memory modifier causes pcre2test to log all memory allocation and
+ freeing calls that occur during a match operation.
+
+ Setting a starting offset
+
+ The offset modifier sets an offset in the subject string at which
+ matching starts. Its value is a number of code units, not characters.
+
+ Setting the size of the output vector
+
+ The ovector modifier applies only to the subject line in which it
+ appears, though of course it can also be used to set a default in a
+ #subject command. It specifies the number of pairs of offsets that are
+ available for storing matching information. The default is 15.
+
+
+THE ALTERNATIVE MATCHING FUNCTION
+
+ By default, pcre2test uses the standard PCRE2 matching function,
+ pcre2_match() to match each subject line. PCRE2 also supports an alter-
+ native matching function, pcre2_dfa_match(), which operates in a dif-
+ ferent way, and has some restrictions. The differences between the two
+ functions are described in the pcre2matching documentation.
+
+ If the dfa modifier is set, the alternative matching function is used.
+ This function finds all possible matches at a given point in the sub-
+ ject. If, however, the dfa_shortest modifier is set, processing stops
+ after the first match is found. This is always the shortest possible
+ match.
+
+
+DEFAULT OUTPUT FROM pcre2test
+
+ This section describes the output when the normal matching function,
+ pcre2_match(), is being used.
+
+ When a match succeeds, pcre2test outputs the list of captured sub-
+ strings, starting with number 0 for the string that matched the whole
+ pattern. Otherwise, it outputs "No match" when the return is
+ PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially
+ matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that
+ this is the entire substring that was inspected during the partial
+ match; it may include characters before the actual match start if a
+ lookbehind assertion, \K, \b, or \B was involved.)
+
+ For any other return, pcre2test outputs the PCRE2 negative error number
+ and a short descriptive phrase. If the error is a failed UTF string
+ check, the offset of the start of the failing character and the reason
+ code are also output. Here is an example of an interactive pcre2test
+ run.
+
+ $ pcre2test
+ PCRE2 version 9.00 2014-05-10
+
+ re> /^abc(\d+)/
+ data> abc123
+ 0: abc123
+ 1: 123
+ data> xyz
+ No match
+
+ Unset capturing substrings that are not followed by one that is set are
+ not returned by pcre2_match(), and are not shown by pcre2test. In the
+ following example, there are two capturing substrings, but when the
+ first data line is matched, the second, unset substring is not shown.
+ An "internal" unset substring is shown as "<unset>", as for the second
+ data line.
+
+ re> /(a)|(b)/
+ data> a
+ 0: a
+ 1: a
+ data> b
+ 0: b
+ 1: <unset>
+ 2: b
+
+ If the strings contain any non-printing characters, they are output as
+ \xhh escapes if the value is less than 256 and UTF mode is not set.
+ Otherwise they are output as \x{hh...} escapes. See below for the defi-
+ nition of non-printing characters. If the /aftertext modifier is set,
+ the output for substring 0 is followed by the rest of the subject
+ string, identified by "0+" like this:
+
+ re> /cat/aftertext
+ data> cataract
+ 0: cat
+ 0+ aract
+
+ If global matching is requested, the results of successive matching
+ attempts are output in sequence, like this:
+
+ re> /\Bi(\w\w)/g
+ data> Mississippi
+ 0: iss
+ 1: ss
+ 0: iss
+ 1: ss
+ 0: ipp
+ 1: pp
+
+ "No match" is output only if the first match attempt fails. Here is an
+ example of a failure message (the offset 4 that is specified by the
+ offset modifier is past the end of the subject string):
+
+ re> /xyz/
+ data> xyz\=offset=4
+ Error -24 (bad offset value)
+
+ Note that whereas patterns can be continued over several lines (a plain
+ ">" prompt is used for continuations), subject lines may not. However
+ newlines can be included in a subject by means of the \n escape (or \r,
+ \r\n, etc., depending on the newline sequence setting).
+
+
+OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
+
+ When the alternative matching function, pcre2_dfa_match(), is used, the
+ output consists of a list of all the matches that start at the first
+ point in the subject where there is at least one match. For example:
+
+ re> /(tang|tangerine|tan)/
+ data> yellow tangerine\=dfa
+ 0: tangerine
+ 1: tang
+ 2: tan
+
+ (Using the normal matching function on this data finds only "tang".)
+ The longest matching string is always given first (and numbered zero).
+ After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:",
+ followed by the partially matching substring. (Note that this is the
+ entire substring that was inspected during the partial match; it may
+ include characters before the actual match start if a lookbehind asser-
+ tion, \K, \b, or \B was involved.)
+
+ If global matching is requested, the search for further matches resumes
+ at the end of the longest match. For example:
+
+ re> /(tang|tangerine|tan)/g
+ data> yellow tangerine and tangy sultana\=dfa
+ 0: tangerine
+ 1: tang
+ 2: tan
+ 0: tang
+ 1: tan
+ 0: tan
+
+ The alternative matching function does not support substring capture,
+ so the modifiers that are concerned with captured substrings are not
+ relevant.
+
+
+RESTARTING AFTER A PARTIAL MATCH
+
+ When the alternative matching function has given the PCRE2_ERROR_PAR-
+ TIAL return, indicating that the subject partially matched the pattern,
+ you can restart the match with additional subject data by means of the
+ dfa_restart modifier. For example:
+
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
+ data> 23ja\=ps,dfa
+ Partial match: 23ja
+ data> n05\=dfa,dfa_restart
+ 0: n05
+
+ For further information about partial matching, see the pcre2partial
+ documentation.
+
+
+CALLOUTS
+
+ If the pattern contains any callout requests, pcre2test's callout func-
+ tion is called during matching. This works with both matching func-
+ tions. By default, the called function displays the callout number, the
+ start and current positions in the text at the callout time, and the
+ next pattern item to be tested. For example:
+
+ --->pqrabcdef
+ 0 ^ ^ \d
+
+ This output indicates that callout number 0 occurred for a match
+ attempt starting at the fourth character of the subject string, when
+ the pointer was at the seventh character, and when the next pattern
+ item was \d. Just one circumflex is output if the start and current
+ positions are the same.
+
+ Callouts numbered 255 are assumed to be automatic callouts, inserted as
+ a result of the /auto_callout pattern modifier. In this case, instead
+ of showing the callout number, the offset in the pattern, preceded by a
+ plus, is output. For example:
+
+ re> /\d?[A-E]\*/auto_callout
+ data> E*
+ --->E*
+ +0 ^ \d?
+ +3 ^ [A-E]
+ +8 ^^ \*
+ +10 ^ ^
+ 0: E*
+
+ If a pattern contains (*MARK) items, an additional line is output when-
+ ever a change of latest mark is passed to the callout function. For
+ example:
+
+ re> /a(*MARK:X)bc/auto_callout
+ data> abc
+ --->abc
+ +0 ^ a
+ +1 ^^ (*MARK:X)
+ +10 ^^ b
+ Latest Mark: X
+ +11 ^ ^ c
+ +12 ^ ^
+ 0: abc
+
+ The mark changes between matching "a" and "b", but stays the same for
+ the rest of the match, so nothing more is output. If, as a result of
+ backtracking, the mark reverts to being unset, the text "<unset>" is
+ output.
+
+ The callout function in pcre2test returns zero (carry on matching) by
+ default, but you can use a callout_fail modifier in a subject line (as
+ described above) to change this and other parameters of the callout.
+
+ Inserting callouts can be helpful when using pcre2test to check compli-
+ cated regular expressions. For further information about callouts, see
+ the pcre2callout documentation.
+
+
+NON-PRINTING CHARACTERS
+
+ When pcre2test is outputting text in the compiled version of a pattern,
+ bytes other than 32-126 are always treated as non-printing characters
+ and are therefore shown as hex escapes.
+
+ When pcre2test is outputting text that is a matched part of a subject
+ string, it behaves in the same way, unless a different locale has been
+ set for the pattern (using the /locale modifier). In this case, the
+ isprint() function is used to distinguish printing and non-printing
+ characters.
+
+
+SEE ALSO
+
+ pcre2(3), pcre16(3), pcre32(3), pcre2api(3), pcre2callout(3), pcre2jit,
+ pcre2matching(3), pcre2partial(3), pcre2pattern(3), pcre2precompile(3).
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 19 August 2014
+ Copyright (c) 1997-2014 University of Cambridge.
diff --git a/src/pcre2demo.c b/src/pcre2demo.c
index 6153ffa..8e37832 100644
--- a/src/pcre2demo.c
+++ b/src/pcre2demo.c
@@ -420,4 +420,4 @@ pcre2_code_free(re);
return 0;
}
-/* End of pcredemo.c */
+/* End of pcre2demo.c */