From 3dba7ab3ed2ffad9049664eb4a99273597e6b24c Mon Sep 17 00:00:00 2001
From: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>
Date: Sun, 12 May 2013 16:28:22 +0000
Subject: Documentation clarification for 16/32 bit libraries.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1328 2f5784b3-3f2a-0410-8824-cb99058d5e15
---
 ChangeLog           |  5 ++++
 doc/pcre16.3        | 11 +++----
 doc/pcre32.3        | 11 +++----
 doc/pcre_dfa_exec.3 | 11 +++----
 doc/pcre_exec.3     | 11 +++----
 doc/pcreapi.3       | 86 +++++++++++++++++++++++++++++------------------------
 6 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a24c186..0766c2d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -158,6 +158,11 @@ Version 8.33 28-April-2013
 
 42. Data lines longer than 65536 caused pcretest to crash.
 
+43. Clarified the data type for length and startoffset arguments for pcre_exec
+    and pcre_dfa_exec in the function-specific man pages, where they were
+    explicitly stated to be in bytes, never having been updated. I also added
+    some clarification to the pcreapi man page.
+
 
 Version 8.32 30-November-2012
 -----------------------------
diff --git a/doc/pcre16.3 b/doc/pcre16.3
index 2a63084..234ae96 100644
--- a/doc/pcre16.3
+++ b/doc/pcre16.3
@@ -1,4 +1,4 @@
-.TH PCRE 3 "08 November 2012" "PCRE 8.32"
+.TH PCRE 3 "12 May 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .sp
@@ -246,8 +246,9 @@ buffer, including the zero terminator if the string was zero-terminated.
 .SH "SUBJECT STRING OFFSETS"
 .rs
 .sp
-The offsets within subject strings that are returned by the matching functions
-are in 16-bit units rather than bytes.
+The lengths and starting offsets of subject strings must be specified in 16-bit
+data units, and the offsets within subject strings that are returned by the
+matching functions are also in 16-bit units rather than bytes.
 .
 .
 .SH "NAMED SUBPATTERNS"
@@ -385,6 +386,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 08 November 2012
-Copyright (c) 1997-2012 University of Cambridge.
+Last updated: 12 May 2013
+Copyright (c) 1997-2013 University of Cambridge.
 .fi
diff --git a/doc/pcre32.3 b/doc/pcre32.3
index 48205ca..516c8ee 100644
--- a/doc/pcre32.3
+++ b/doc/pcre32.3
@@ -1,4 +1,4 @@
-.TH PCRE 3 "08 November 2012" "PCRE 8.32"
+.TH PCRE 3 "12 May 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .sp
@@ -246,8 +246,9 @@ buffer, including the zero terminator if the string was zero-terminated.
 .SH "SUBJECT STRING OFFSETS"
 .rs
 .sp
-The offsets within subject strings that are returned by the matching functions
-are in 32-bit units rather than bytes.
+The lengths and starting offsets of subject strings must be specified in 32-bit
+data units, and the offsets within subject strings that are returned by the
+matching functions are also in 32-bit units rather than bytes.
 .
 .
 .SH "NAMED SUBPATTERNS"
@@ -384,6 +385,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 08 November 2012
-Copyright (c) 1997-2012 University of Cambridge.
+Last updated: 12 May 2013
+Copyright (c) 1997-2013 University of Cambridge.
 .fi
diff --git a/doc/pcre_dfa_exec.3 b/doc/pcre_dfa_exec.3
index d1901a5..9bc7448 100644
--- a/doc/pcre_dfa_exec.3
+++ b/doc/pcre_dfa_exec.3
@@ -1,4 +1,4 @@
-.TH PCRE_DFA_EXEC 3 "24 June 2012" "PCRE 8.30"
+.TH PCRE_DFA_EXEC 3 "12 May 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .SH SYNOPSIS
@@ -44,16 +44,17 @@ are:
   \fIextra\fP        Points to an associated \fBpcre[16|32]_extra\fP structure,
                  or is NULL
   \fIsubject\fP      Points to the subject string
-  \fIlength\fP       Length of the subject string, in bytes
-  \fIstartoffset\fP  Offset in bytes in the subject at which to
-                 start matching
+  \fIlength\fP       Length of the subject string
+  \fIstartoffset\fP  Offset in the subject at which to start matching
   \fIoptions\fP      Option bits
   \fIovector\fP      Points to a vector of ints for result offsets
   \fIovecsize\fP     Number of elements in the vector
   \fIworkspace\fP    Points to a vector of ints used as working space
   \fIwscount\fP      Number of elements in the vector
 .sp
-The options are:
+The units for \fIlength\fP and \fIstartoffset\fP are bytes for
+\fBpcre_dfa_exec()\fP, 16-bit data items for \fBpcre16_dfa_exec()\fP, and
+32-bit data items for \fBpcre32_dfa_exec()\fP. The options are:
 .sp
   PCRE_ANCHORED          Match only at the first position
   PCRE_BSR_ANYCRLF       \eR matches only CR, LF, or CRLF
diff --git a/doc/pcre_exec.3 b/doc/pcre_exec.3
index 78012ed..c92c2a5 100644
--- a/doc/pcre_exec.3
+++ b/doc/pcre_exec.3
@@ -1,4 +1,4 @@
-.TH PCRE_EXEC 3 "24 June 2012" "PCRE 8.30"
+.TH PCRE_EXEC 3 "12 May 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .SH SYNOPSIS
@@ -36,14 +36,15 @@ offsets to captured substrings. Its arguments are:
   \fIextra\fP        Points to an associated \fBpcre[16|32]_extra\fP structure,
                  or is NULL
   \fIsubject\fP      Points to the subject string
-  \fIlength\fP       Length of the subject string, in bytes
-  \fIstartoffset\fP  Offset in bytes in the subject at which to
-                 start matching
+  \fIlength\fP       Length of the subject string
+  \fIstartoffset\fP  Offset in the subject at which to start matching
   \fIoptions\fP      Option bits
   \fIovector\fP      Points to a vector of ints for result offsets
   \fIovecsize\fP     Number of elements in the vector (a multiple of 3)
 .sp
-The options are:
+The units for \fIlength\fP and \fIstartoffset\fP are bytes for
+\fBpcre_exec()\fP, 16-bit data items for \fBpcre16_exec()\fP, and 32-bit data
+items for \fBpcre32_exec()\fP. The options are:
 .sp
   PCRE_ANCHORED          Match only at the first position
   PCRE_BSR_ANYCRLF       \eR matches only CR, LF, or CRLF
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index 7145a4e..ac111f1 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -1,4 +1,4 @@
-.TH PCREAPI 3 "10 May 2013" "PCRE 8.33"
+.TH PCREAPI 3 "12 May 2013" "PCRE 8.33"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .sp
@@ -161,10 +161,10 @@ by UTF16 or UTF32, respectively. This facility is in fact just cosmetic; the
 16-bit and 32-bit option names define the same bit values.
 .P
 References to bytes and UTF-8 in this document should be read as references to
-16-bit data quantities and UTF-16 when using the 16-bit library, or 32-bit data
-quantities and UTF-32 when using the 32-bit library, unless specified
-otherwise. More details of the specific differences for the 16-bit and 32-bit
-libraries are given in the
+16-bit data units and UTF-16 when using the 16-bit library, or 32-bit data
+units and UTF-32 when using the 32-bit library, unless specified otherwise.
+More details of the specific differences for the 16-bit and 32-bit libraries
+are given in the
 .\" HREF
 \fBpcre16\fP
 .\"
@@ -562,15 +562,15 @@ Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
 NULL, and sets the variable pointed to by \fIerrptr\fP to point to a textual
 error message. This is a static string that is part of the library. You must
 not try to free it. Normally, the offset from the start of the pattern to the
-byte that was being processed when the error was discovered is placed in the
-variable pointed to by \fIerroffset\fP, which must not be NULL (if it is, an
-immediate error is given). However, for an invalid UTF-8 string, the offset is
-that of the first byte of the failing character.
+data unit that was being processed when the error was discovered is placed in
+the variable pointed to by \fIerroffset\fP, which must not be NULL (if it is,
+an immediate error is given). However, for an invalid UTF-8 or UTF-16 string,
+the offset is that of the first data unit of the failing character.
 .P
 Some errors are not detected until the whole pattern has been scanned; in these
 cases, the offset passed back is the length of the pattern. Note that the
-offset is in bytes, not characters, even in UTF-8 mode. It may sometimes point
-into the middle of a UTF-8 character.
+offset is in data units, not characters, even in a UTF mode. It may sometimes
+point into the middle of a UTF-8 or UTF-16 character.
 .P
 If \fBpcre_compile2()\fP is used instead of \fBpcre_compile()\fP, and the
 \fIerrorcodeptr\fP argument is not NULL, a non-zero error code number is
@@ -1323,7 +1323,7 @@ call to \fBpcre_fullinfo()\fP returns the error PCRE_ERROR_UNSET.
 .sp
   PCRE_INFO_MAXLOOKBEHIND
 .sp
-Return the number of characters (NB not bytes) in the longest lookbehind
+Return the number of characters (NB not data units) in the longest lookbehind
 assertion in the pattern. This information is useful when doing multi-segment
 matching using the partial matching facilities. Note that the simple assertions
 \eb and \eB require a one-character lookbehind. \eA also registers a
@@ -1337,11 +1337,11 @@ segment.
 .sp
 If the pattern was studied and a minimum length for matching subject strings
 was computed, its value is returned. Otherwise the returned value is -1. The
-value is a number of characters, which in UTF-8 mode may be different from the
-number of bytes. The fourth argument should point to an \fBint\fP variable. A
-non-negative value is a lower bound to the length of any matching string. There
-may not be any strings of that length that do actually match, but every string
-that does match is at least that long.
+value is a number of characters, which in UTF mode may be different from the
+number of data units. The fourth argument should point to an \fBint\fP
+variable. A non-negative value is a lower bound to the length of any matching
+string. There may not be any strings of that length that do actually match, but
+every string that does match is at least that long.
 .sp
   PCRE_INFO_NAMECOUNT
   PCRE_INFO_NAMEENTRYSIZE
@@ -1364,10 +1364,10 @@ length of the longest name. PCRE_INFO_NAMETABLE returns a pointer to the first
 entry of the table. This is a pointer to \fBchar\fP in the 8-bit library, where
 the first two bytes of each entry are the number of the capturing parenthesis,
 most significant byte first. In the 16-bit library, the pointer points to
-16-bit data units, the first of which contains the parenthesis number.
-In the 32-bit library, the pointer points to 32-bit data units, the first of
-which contains the parenthesis number. The rest
-of the entry is the corresponding name, zero terminated.
+16-bit data units, the first of which contains the parenthesis number. In the
+32-bit library, the pointer points to 32-bit data units, the first of which
+contains the parenthesis number. The rest of the entry is the corresponding
+name, zero terminated.
 .P
 The names are in alphabetical order. Duplicate names may appear if (?| is used
 to create multiple groups with the same number, as described in the
@@ -1449,7 +1449,7 @@ set, the call to \fBpcre_fullinfo()\fP returns the error PCRE_ERROR_UNSET.
 .sp
   PCRE_INFO_SIZE
 .sp
-Return the size of the compiled pattern in bytes (for both libraries). The
+Return the size of the compiled pattern in bytes (for all three libraries). The
 fourth argument should point to a \fBsize_t\fP variable. This value does not
 include the size of the \fBpcre\fP structure that is returned by
 \fBpcre_compile()\fP. The value that is passed as the argument to
@@ -1460,11 +1460,12 @@ does not alter the value returned by this option.
 .sp
   PCRE_INFO_STUDYSIZE
 .sp
-Return the size in bytes of the data block pointed to by the \fIstudy_data\fP
-field in a \fBpcre_extra\fP block. If \fBpcre_extra\fP is NULL, or there is no
-study data, zero is returned. The fourth argument should point to a
-\fBsize_t\fP variable. The \fIstudy_data\fP field is set by \fBpcre_study()\fP
-to record information that will speed up matching (see the section entitled
+Return the size in bytes (for all three libraries) of the data block pointed to
+by the \fIstudy_data\fP field in a \fBpcre_extra\fP block. If \fBpcre_extra\fP
+is NULL, or there is no study data, zero is returned. The fourth argument
+should point to a \fBsize_t\fP variable. The \fIstudy_data\fP field is set by
+\fBpcre_study()\fP to record information that will speed up matching (see the
+section entitled
 .\" HTML <a href="#studyingapattern">
 .\" </a>
 "Studying a pattern"
@@ -1993,13 +1994,18 @@ documentation.
 .rs
 .sp
 The subject string is passed to \fBpcre_exec()\fP as a pointer in
-\fIsubject\fP, a length in bytes in \fIlength\fP, and a starting byte offset
-in \fIstartoffset\fP. If this is negative or greater than the length of the
-subject, \fBpcre_exec()\fP returns PCRE_ERROR_BADOFFSET. When the starting
-offset is zero, the search for a match starts at the beginning of the subject,
-and this is by far the most common case. In UTF-8 mode, the byte offset must
-point to the start of a UTF-8 character (or the end of the subject). Unlike the
-pattern string, the subject may contain binary zero bytes.
+\fIsubject\fP, a length in \fIlength\fP, and a starting offset in
+\fIstartoffset\fP. The units for \fIlength\fP and \fIstartoffset\fP are bytes
+for the 8-bit library, 16-bit data items for the 16-bit library, and 32-bit
+data items for the 32-bit library.
+.P
+If \fIstartoffset\fP is negative or greater than the length of the subject,
+\fBpcre_exec()\fP returns PCRE_ERROR_BADOFFSET. When the starting offset is
+zero, the search for a match starts at the beginning of the subject, and this
+is by far the most common case. In UTF-8 or UTF-16 mode, the offset must point
+to the start of a character, or the end of the subject (in UTF-32 mode, one
+data unit equals one character, so all offsets are valid). Unlike the pattern
+string, the subject may contain binary zeroes.
 .P
 A non-zero starting offset is useful when searching for another match in the
 same subject by calling \fBpcre_exec()\fP again after a previous success.
@@ -2063,10 +2069,12 @@ rounded down.
 When a match is successful, information about captured substrings is returned
 in pairs of integers, starting at the beginning of \fIovector\fP, and
 continuing up to two-thirds of its length at the most. The first element of
-each pair is set to the byte offset of the first character in a substring, and
-the second is set to the byte offset of the first character after the end of a
-substring. \fBNote\fP: these values are always byte offsets, even in UTF-8
-mode. They are not character counts.
+each pair is set to the offset of the first character in a substring, and the
+second is set to the offset of the first character after the end of a
+substring. These values are always data unit offsets, even in UTF mode. They
+are byte offsets in the 8-bit library, 16-bit data item offsets in the 16-bit
+library, and 32-bit data item offsets in the 32-bit library. \fBNote\fP: they
+are not character counts.
 .P
 The first pair of integers, \fIovector[0]\fP and \fIovector[1]\fP, identify the
 portion of the subject string matched by the entire pattern. The next pair is
@@ -2878,6 +2886,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 10 May 2013
+Last updated: 12 May 2013
 Copyright (c) 1997-2013 University of Cambridge.
 .fi
-- 
cgit v1.2.1