summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-11-21 11:44:55 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-11-21 11:44:55 +0000
commita8cb60f9a164ea40e49ee7d3cf6c465c0540b9bc (patch)
treefc611004b87a826965d9d295370198b50007dd37
parentc48ca85a9456be9a04cb0b92df853d6dac797414 (diff)
downloadpcre-a8cb60f9a164ea40e49ee7d3cf6c465c0540b9bc.tar.gz
More 16-bit patches
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@757 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--Makefile.am49
-rw-r--r--configure.ac126
-rw-r--r--libpcre.pc.in2
-rw-r--r--libpcre16.pc.in12
-rw-r--r--pcre.h.in28
-rw-r--r--pcre16_compile.c45
-rw-r--r--pcre16_convert_utf16.c87
-rw-r--r--pcre16_valid_utf16.c143
-rw-r--r--pcre_compile.c16
-rw-r--r--pcre_internal.h34
10 files changed, 522 insertions, 20 deletions
diff --git a/Makefile.am b/Makefile.am
index 320becb..4651c6f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -79,7 +79,7 @@ check_SCRIPTS =
dist_noinst_SCRIPTS =
# Some of the binaries we make are to be installed, and others are
-# (non-user-visible) helper programs needed to build libpcre.
+# (non-user-visible) helper programs needed to build libpcre or libpcre16.
bin_PROGRAMS =
noinst_PROGRAMS =
@@ -170,6 +170,9 @@ endif # WITH_REBUILD_CHARTABLES
## The main pcre library
+
+# Build the 8 bit library if it is enabled.
+if WITH_PCRE8
lib_LTLIBRARIES += libpcre.la
libpcre_la_SOURCES = \
pcre_compile.c \
@@ -199,6 +202,22 @@ libpcre_la_SOURCES = \
nodist_libpcre_la_SOURCES = \
pcre_chartables.c
+endif # WITH_PCRE8
+
+# Build the 16 bit library if it is enabled.
+if WITH_PCRE16
+lib_LTLIBRARIES += libpcre16.la
+libpcre16_la_SOURCES = \
+ pcre16_compile.c \
+ pcre16_convert_utf16.c \
+ pcre16_valid_utf16.c
+
+## This file is generated as part of the building process, so don't distribute.
+nodist_libpcre16_la_SOURCES = \
+ pcre_chartables.c
+
+endif # WITH_PCRE16
+
# The pcre_printint.src file is #included by some source files, so it must be
# distributed. The pcre_chartables.c.dist file is the default version of
# pcre_chartables.c, used unless --enable-rebuild-chartables is specified.
@@ -224,7 +243,12 @@ EXTRA_DIST += \
sljit/sljitNativeX86_common.c \
sljit/sljitUtils.c
+if WITH_PCRE8
libpcre_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)
+endif # WITH_PCRE8
+if WITH_PCRE16
+libpcre16_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)
+endif # WITH_PCRE16
CLEANFILES += pcre_chartables.c
@@ -233,15 +257,23 @@ if WITH_JIT
TESTS += pcre_jit_test
noinst_PROGRAMS += pcre_jit_test
pcre_jit_test_SOURCES = pcre_jit_test.c
-pcre_jit_test_LDADD = libpcre.la
+pcre_jit_test_LDADD =
+if WITH_PCRE8
+pcre_jit_test_LDADD += libpcre.la
+endif # WITH_PCRE8
+if WITH_PCRE16
+pcre_jit_test_LDADD += libpcre16.la
+endif # WITH_PCRE16
endif # WITH_JIT
## A version of the main pcre library that has a posix re API.
+if WITH_PCRE8
lib_LTLIBRARIES += libpcreposix.la
libpcreposix_la_SOURCES = \
pcreposix.c
libpcreposix_la_LDFLAGS = $(EXTRA_LIBPCREPOSIX_LDFLAGS)
libpcreposix_la_LIBADD = libpcre.la
+endif # WITH_PCRE8
## There's a C++ library as well.
if WITH_PCRE_CPP
@@ -282,13 +314,19 @@ dist_noinst_SCRIPTS += RunTest
EXTRA_DIST += RunTest.bat
bin_PROGRAMS += pcretest
pcretest_SOURCES = pcretest.c
-pcretest_LDADD = libpcreposix.la $(LIBREADLINE)
+pcretest_LDADD = $(LIBREADLINE)
+if WITH_PCRE8
+pcretest_LDADD += libpcreposix.la
+endif # WITH_PCRE8
TESTS += RunGrepTest
dist_noinst_SCRIPTS += RunGrepTest
bin_PROGRAMS += pcregrep
pcregrep_SOURCES = pcregrep.c
-pcregrep_LDADD = libpcreposix.la $(LIBZ) $(LIBBZ2)
+pcregrep_LDADD = $(LIBZ) $(LIBBZ2)
+if WITH_PCRE8
+pcregrep_LDADD += libpcreposix.la
+endif # WITH_PCRE8
EXTRA_DIST += \
testdata/grepinput \
@@ -378,6 +416,9 @@ pcre.dll: $(DLL_OBJS)
# We have .pc files for pkg-config users.
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpcre.pc libpcreposix.pc
+if WITH_PCRE16
+pkgconfig_DATA += libpcre16.pc
+endif
if WITH_PCRE_CPP
pkgconfig_DATA += libpcrecpp.pc
endif
diff --git a/configure.ac b/configure.ac
index ddee8e8..ff516c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -104,6 +104,18 @@ then
htmldir='${docdir}/html'
fi
+# Handle --disable-pcre8 (the 8 bit library is enabled by default). When the
+# option is not given at all, enable_pcre8 is set to "unset" so that later
+# checks can tell an explicit choice from the default.
+AC_ARG_ENABLE(pcre8,
+ AS_HELP_STRING([--disable-pcre8],
+ [disable 8 bit character support]),
+ , enable_pcre8=unset)
+
+# Handle --enable-pcre16 (disabled by default)
+AC_ARG_ENABLE(pcre16,
+ AS_HELP_STRING([--enable-pcre16],
+ [enable 16 bit character support]),
+ , enable_pcre16=unset)
+
# Handle --disable-cpp. The substitution of enable_cpp is needed for use in
# pcre-config.
AC_ARG_ENABLE(cpp,
@@ -136,10 +148,16 @@ AC_ARG_ENABLE(utf8,
[enable UTF-8 support (incompatible with --enable-ebcdic)]),
, enable_utf8=unset)
+# Handle --enable-utf16 (disabled by default)
+AC_ARG_ENABLE(utf16,
+ AS_HELP_STRING([--enable-utf16],
+ [enable UTF-16 support (incompatible with --enable-ebcdic)]),
+ , enable_utf16=unset)
+
# Handle --enable-unicode-properties
AC_ARG_ENABLE(unicode-properties,
AS_HELP_STRING([--enable-unicode-properties],
- [enable Unicode properties support (implies --enable-utf8)]),
+ [enable Unicode properties support (implies --enable-utf8 and --enable-utf16)]),
, enable_unicode_properties=no)
# Handle --enable-newline=NL
@@ -245,8 +263,46 @@ AC_ARG_WITH(match-limit-recursion,
[default limit on internal recursion (default=MATCH_LIMIT)]),
, with_match_limit_recursion=MATCH_LIMIT)
-# Make sure that if enable_unicode_properties was set, that UTF-8 support
-# is enabled.
+# Make sure that if enable_utf8 was set, that enable_pcre8 support is enabled
+if test "x$enable_utf8" = "xyes"
+then
+ if test "x$enable_pcre8" = "xno"
+ then
+ AC_MSG_ERROR([support for UTF-8 requires pcre library with 8 bit characters])
+ fi
+ enable_pcre8=yes
+fi
+
+# Make sure that if enable_utf16 was set, that enable_pcre16 support is enabled
+if test "x$enable_utf16" = "xyes"
+then
+ if test "x$enable_pcre16" = "xno"
+ then
+ AC_MSG_ERROR([support for UTF-16 requires pcre library with 16 bit characters])
+ fi
+ enable_pcre16=yes
+fi
+
+# Set the default value for pcre8
+if test "x$enable_pcre8" = "xunset"
+then
+ enable_pcre8=yes
+fi
+
+# Set the default value for pcre16
+if test "x$enable_pcre16" = "xunset"
+then
+ enable_pcre16=no
+fi
+
+# Make sure enable_pcre8 or enable_pcre16 was set
+if test "x$enable_pcre8$enable_pcre16" = "xnono"
+then
+ AC_MSG_ERROR([Either 8 or 16 bit (or both) pcre library must be enabled])
+fi
+
+# Make sure that if enable_unicode_properties was set, UTF-8 or UTF-16
+# support is enabled.
#
if test "x$enable_unicode_properties" = "xyes"
then
@@ -254,17 +310,44 @@ then
then
AC_MSG_ERROR([support for Unicode properties requires UTF-8 support])
fi
- enable_utf8=yes
+ if test "x$enable_utf16" = "xno"
+ then
+ AC_MSG_ERROR([support for Unicode properties requires UTF-16 support])
+ fi
+ if test "x$enable_pcre8" = "xyes"
+ then
+ enable_utf8=yes
+ fi
+ if test "x$enable_pcre16" = "xyes"
+ then
+ enable_utf16=yes
+ fi
fi
+# enable_utf8 is disabled by default.
if test "x$enable_utf8" = "xunset"
then
enable_utf8=no
fi
+# enable_utf16 is disabled by default.
+if test "x$enable_utf16" = "xunset"
+then
+ enable_utf16=no
+fi
+
+# Make sure that if enable_cpp was set, that enable_pcre8 support is enabled
+if test "x$enable_cpp" = "xyes"
+then
+ if test "x$enable_pcre8" = "xno"
+ then
+ AC_MSG_ERROR([C++ library requires pcre library with 8 bit characters])
+ fi
+fi
+
# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
-# Also check that UTF-8 support is not requested, because PCRE cannot handle
-# EBCDIC and UTF-8 in the same build. To do so it would need to use different
+# Also check that UTF-8 or UTF-16 support is not requested, because PCRE cannot
+# handle EBCDIC and UTF in the same build. To do so it would need to use different
# character constants depending on the mode.
#
if test "x$enable_ebcdic" = "xyes"
@@ -274,6 +357,10 @@ then
then
AC_MSG_ERROR([support for EBCDIC and UTF-8 cannot be enabled at the same time])
fi
+ if test "x$enable_utf16" = "xyes"
+ then
+ AC_MSG_ERROR([support for EBCDIC and UTF-16 cannot be enabled at the same time])
+ fi
fi
# Convert the newline identifier into the appropriate integer value.
@@ -410,10 +497,13 @@ AC_SUBST(pcre_have_type_traits)
AC_SUBST(pcre_have_bits_type_traits)
# Conditional compilation
+AM_CONDITIONAL(WITH_PCRE8, test "x$enable_pcre8" = "xyes")
+AM_CONDITIONAL(WITH_PCRE16, test "x$enable_pcre16" = "xyes")
AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes")
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes")
AM_CONDITIONAL(WITH_UTF8, test "x$enable_utf8" = "xyes")
+AM_CONDITIONAL(WITH_UTF16, test "x$enable_utf16" = "xyes")
# Checks for typedefs, structures, and compiler characteristics.
@@ -482,6 +572,16 @@ AC_SUBST(PCRE_STATIC_CFLAG)
# Here is where pcre specific defines are handled
+if test "$enable_pcre8" = "yes"; then
+ AC_DEFINE([SUPPORT_PCRE8], [], [
+ Define to enable the 8 bit PCRE library.])
+fi
+
+if test "$enable_pcre16" = "yes"; then
+ AC_DEFINE([SUPPORT_PCRE16], [], [
+ Define to enable the 16 bit PCRE library.])
+fi
+
if test "$enable_jit" = "yes"; then
AC_DEFINE([SUPPORT_JIT], [], [
Define to enable support for Just-In-Time compiling.])
@@ -502,6 +602,14 @@ if test "$enable_utf8" = "yes"; then
*or* ASCII/UTF-8, but not both at once.])
fi
+if test "$enable_utf16" = "yes"; then
+ AC_DEFINE([SUPPORT_UTF16], [], [
+ Define to enable support for the UTF-16 Unicode encoding. This will
+ work even in an EBCDIC environment, but it is incompatible with
+ the EBCDIC macro. That is, PCRE can support *either* EBCDIC code
+ *or* ASCII/UTF-16, but not both at once.])
+fi
+
if test "$enable_unicode_properties" = "yes"; then
AC_DEFINE([SUPPORT_UCP], [], [
Define to enable support for Unicode properties.])
@@ -720,7 +828,8 @@ AC_SUBST(LIBREADLINE)
AC_CONFIG_FILES(
Makefile
libpcre.pc
- libpcreposix.pc
+ libpcre16.pc
+ libpcreposix.pc
libpcrecpp.pc
pcre-config
pcre.h
@@ -756,9 +865,12 @@ $PACKAGE-$VERSION configuration summary:
Linker flags .................... : ${LDFLAGS}
Extra libraries ................. : ${LIBS}
+ Build 8 bit pcre library ........ : ${enable_pcre8}
+ Build 16 bit pcre library ....... : ${enable_pcre16}
Build C++ library ............... : ${enable_cpp}
Enable JIT compiling support .... : ${enable_jit}
Enable UTF-8 support ............ : ${enable_utf8}
+ Enable UTF-16 support ........... : ${enable_utf16}
Unicode properties .............. : ${enable_unicode_properties}
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
diff --git a/libpcre.pc.in b/libpcre.pc.in
index 2c3fa19..1f26b32 100644
--- a/libpcre.pc.in
+++ b/libpcre.pc.in
@@ -6,7 +6,7 @@ libdir=@libdir@
includedir=@includedir@
Name: libpcre
-Description: PCRE - Perl compatible regular expressions C library
+Description: PCRE - Perl compatible regular expressions C library with 8 bit character support
Version: @PACKAGE_VERSION@
Libs: -L${libdir} -lpcre
Cflags: -I${includedir} @PCRE_STATIC_CFLAG@
diff --git a/libpcre16.pc.in b/libpcre16.pc.in
new file mode 100644
index 0000000..12009f8
--- /dev/null
+++ b/libpcre16.pc.in
@@ -0,0 +1,12 @@
+# Package Information for pkg-config
+# Describes the 16 bit PCRE library (libpcre16).
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libpcre16
+Description: PCRE - Perl compatible regular expressions C library with 16 bit character support
+Version: @PACKAGE_VERSION@
+# Link against the 16 bit library, not the 8 bit libpcre.
+Libs: -L${libdir} -lpcre16
+Cflags: -I${includedir} @PCRE_STATIC_CFLAG@
diff --git a/pcre.h.in b/pcre.h.in
index c516518..f49de8a 100644
--- a/pcre.h.in
+++ b/pcre.h.in
@@ -111,7 +111,8 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
#define PCRE_UNGREEDY 0x00000200 /* Compile */
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
-#define PCRE_UTF8 0x00000800 /* Compile */
+#define PCRE_UTF8 0x00000800 /* Compile (Same as PCRE_UTF16) */
+#define PCRE_UTF16 0x00000800 /* Compile (Same as PCRE_UTF8) */
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
@@ -191,6 +192,14 @@ compile-time only bits for runtime options, or vice versa. */
#define PCRE_UTF8_ERR20 20
#define PCRE_UTF8_ERR21 21
+/* Specific error codes for UTF-16 validity checks */
+
+#define PCRE_UTF16_ERR0 0
+#define PCRE_UTF16_ERR1 1
+#define PCRE_UTF16_ERR2 2
+#define PCRE_UTF16_ERR3 3
+#define PCRE_UTF16_ERR4 4
+
/* Request types for pcre_fullinfo() */
#define PCRE_INFO_OPTIONS 0
@@ -250,6 +259,17 @@ typedef struct real_pcre pcre;
struct real_pcre_jit_stack; /* declaration; the definition is private */
typedef struct real_pcre_jit_stack pcre_jit_stack;
+/* If PCRE is compiled with 16 bit character support, PCRE_SCHAR16 must contain
+a 16 bit wide signed data type. Otherwise it can be a dummy data type since
+pcre16 functions are not implemented. There is a check for this in pcre_internal.h. */
+#ifndef PCRE_SCHAR16
+#define PCRE_SCHAR16 short
+#endif
+
+#ifndef PCRE_SPTR16
+#define PCRE_SPTR16 const PCRE_SCHAR16 *
+#endif
+
/* When PCRE is compiled as a C++ library, the subject pointer type can be
replaced with a custom type. For conventional use, the public interface is a
const char *. */
@@ -326,8 +346,12 @@ typedef pcre_jit_stack *(*pcre_jit_callback)(void *);
PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
const unsigned char *);
+PCRE_EXP_DECL pcre *pcre16_compile(PCRE_SPTR16, int, const char **, int *,
+ const unsigned char *);
PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
int *, const unsigned char *);
+/* 16 bit variant of pcre_compile2(). Only the pattern is a 16 bit string; the
+   error message pointer is a const char **, matching the function definition. */
+PCRE_EXP_DECL pcre *pcre16_compile2(PCRE_SPTR16, int, int *, const char **,
+ int *, const unsigned char *);
PCRE_EXP_DECL int pcre_config(int, void *);
PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *,
int *, int, const char *, char *, int);
@@ -353,6 +377,8 @@ PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
+PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
+ PCRE_SPTR16, int, int);
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
PCRE_EXP_DECL const char *pcre_version(void);
diff --git a/pcre16_compile.c b/pcre16_compile.c
new file mode 100644
index 0000000..d682f99
--- /dev/null
+++ b/pcre16_compile.c
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. COMPILE_PCRE16 must be defined
+before pcre_compile.c is included: that file tests the macro to choose between
+the pcre_compile()/pcre_compile2() and pcre16_compile()/pcre16_compile2()
+function definitions. */
+#define COMPILE_PCRE16
+
+#include "pcre_compile.c"
+
+/* End of pcre16_compile.c */
diff --git a/pcre16_convert_utf16.c b/pcre16_convert_utf16.c
new file mode 100644
index 0000000..34d6965
--- /dev/null
+++ b/pcre16_convert_utf16.c
@@ -0,0 +1,87 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2009 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains a function for converting any UTF-16 character
+strings to host byte order. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+int
+pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms)
+{
+#ifdef SUPPORT_UTF16
+/* This function converts any UTF-16 string to host byte order and optionally
+removes any Byte Order Marks (BOMs).
+
+Arguments:
+ output the converted code units are written here
+ input the UTF-16 code units to convert
+ length the number of code units in input
+ keep_boms if non-zero, every BOM is written to output as 0xfeff; if zero,
+ BOMs are dropped and length is decremented for each one
+
+Returns: the remaining length, that is, length minus the number of BOMs
+ that were dropped. When SUPPORT_UTF16 is not defined, nothing is
+ copied and length is returned unchanged.
+
+Until the first BOM is seen, the input is treated as being in host byte order
+already (same_bo starts out TRUE). */
+BOOL same_bo = TRUE;
+PCRE_SPTR16 end = input + length;
+/* The c variable must be unsigned, so that the right shift in the byte flip
+below does not drag in sign bits. NOTE(review): the comparisons with 0xfeff and
+0xfffe need uschar to be at least 16 bits wide - confirm which typedef is in
+effect when this file is compiled. */
+register uschar c;
+
+while (input < end)
+ {
+ c = *input++;
+ if (c == 0xfeff || c == 0xfffe)
+ {
+ /* Detecting the byte order of the machine is unnecessary; it is
+ enough to know whether the UTF-16 string has the same byte order or not.
+ A BOM that reads as 0xfeff means same order, 0xfffe means swapped. The
+ setting stays in force until the next BOM. */
+ same_bo = c == 0xfeff;
+ if (keep_boms != 0)
+ *output++ = 0xfeff; /* Always emitted in host byte order. */
+ else
+ length--; /* BOM dropped: one unit fewer in the result. */
+ }
+ else
+ *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
+ }
+
+#else
+(void)(output); /* Keep picky compilers happy */
+(void)(input);
+(void)(keep_boms);
+#endif
+return length;
+}
+
+/* End of pcre16_convert_utf16.c */
diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c
new file mode 100644
index 0000000..85c4e4d
--- /dev/null
+++ b/pcre16_valid_utf16.c
@@ -0,0 +1,143 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2009 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains an internal function for validating UTF-16 character
+strings. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+* Validate a UTF-16 string *
+*************************************************/
+
+/* This function is called (optionally) at the start of compile or match, to
+check that a supposed UTF-16 string is actually valid. The early check means
+that subsequent code can assume it is dealing with a valid string. The check
+can be turned off for maximum performance, but the consequences of supplying an
+invalid string are then undefined.
+
+From release 8.21 more information about the details of the error is passed
+back in the returned value:
+
+PCRE_UTF16_ERR0 No error
+PCRE_UTF16_ERR1 Missing low surrogate at the end of the string
+PCRE_UTF16_ERR2 Invalid low surrogate
+PCRE_UTF16_ERR3 Isolated low surrogate
+PCRE_UTF16_ERR4 Character that is not allowed (0xfffe)
+
+Arguments:
+ string points to the string
+ length length of string in 16 bit units, or a negative value if
+ the string is zero-terminated
+ erroroffset pointer to an error position offset variable; it is written
+ only when an error is returned
+
+Returns: = 0 (PCRE_UTF16_ERR0) if the string is a valid UTF-16 string
+ > 0 otherwise, setting the offset of the bad character
+
+When SUPPORT_UTF16 is not defined, no checking is done and PCRE_UTF16_ERR0 is
+always returned.
+*/
+
+int
+_pcre16_valid_utf16(USPTR string, int length, int *erroroffset)
+{
+#ifdef SUPPORT_UTF16
+register USPTR p;
+register uschar c;
+
+if (length < 0)
+ {
+ for (p = string; *p != 0; p++); /* Empty body: just find the terminator. */
+ length = p - string;
+ }
+
+for (p = string; length-- > 0; p++) /* length now counts the units after p. */
+ {
+ c = *p;
+
+ if ((c & 0xf800) != 0xd800)
+ {
+ /* Normal UTF-16 code point. Neither high nor low surrogate. */
+
+ /* This is probably a BOM from a different byte-order.
+ Regardless, the string is rejected. */
+ if (c == 0xfffe)
+ {
+ *erroroffset = p - string;
+ return PCRE_UTF16_ERR4;
+ }
+ }
+ else if ((c & 0x0400) == 0)
+ {
+ /* High surrogate. */
+
+ /* Must be followed by a low surrogate. If nothing is left, the reported
+ offset is that of the high surrogate itself. */
+ if (length == 0)
+ {
+ *erroroffset = p - string;
+ return PCRE_UTF16_ERR1;
+ }
+ p++;
+ length--;
+ if ((*p & 0xfc00) != 0xdc00)
+ {
+ *erroroffset = p - string; /* Offset of the unit after the high surrogate. */
+ return PCRE_UTF16_ERR2;
+ }
+ }
+ else
+ {
+ /* Isolated low surrogate. Always an error. */
+ *erroroffset = p - string;
+ return PCRE_UTF16_ERR3;
+ }
+ }
+
+#else /* SUPPORT_UTF16 */
+(void)(string); /* Keep picky compilers happy */
+(void)(length);
+#endif
+
+return PCRE_UTF16_ERR0; /* This indicates success */
+}
+
+/* End of pcre16_valid_utf16.c */
diff --git a/pcre_compile.c b/pcre_compile.c
index 55d4226..71e41bd 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -7213,17 +7213,33 @@ Returns: pointer to compiled data block, or NULL on error,
with errorptr and erroroffset set
*/
+#ifndef COMPILE_PCRE16
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
+#else
+PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
+pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
+ int *erroroffset, const unsigned char *tables)
+#endif
{
+#ifndef COMPILE_PCRE16
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
+#else
+return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
+#endif
}
+#ifndef COMPILE_PCRE16
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
const char **errorptr, int *erroroffset, const unsigned char *tables)
+#else
+PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
+pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
+ const char **errorptr, int *erroroffset, const unsigned char *tables)
+#endif
{
real_pcre *re;
int length = 1; /* For final END opcode */
diff --git a/pcre_internal.h b/pcre_internal.h
index 46ff701..f9a2731 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -51,11 +51,11 @@ functions whose names all begin with "_pcre_". */
#define PCRE_DEBUG
#endif
-/* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
+/* We do not support both EBCDIC and UTF-8/16 at the same time. The "configure"
script prevents both being selected, but not everybody uses "configure". */
-#if defined EBCDIC && defined SUPPORT_UTF8
-#error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
+#if defined EBCDIC && (defined SUPPORT_UTF8 || defined SUPPORT_UTF16)
+#error The use of both EBCDIC and SUPPORT_UTF8/16 is not supported.
#endif
/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
@@ -208,10 +208,25 @@ by "configure". */
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
-However, we leave the interface to the outside world as char *, because that
-should make things easier for callers. */
+However, we leave the interface to the outside world as char * or short *,
+because that should make things easier for callers. We define a short type
+for the current character representation (either 8 or 16 bit) to save lots
+of typing. I tried "uchar", but it causes problems on Digital Unix, where
+it is defined in sys/types, so use "uschar" instead. */
+#ifndef COMPILE_PCRE16
typedef unsigned char pcre_uchar;
+#else
+#if USHRT_MAX != 65535
+/* This is a warning message. Change PCRE_SCHAR16 to a 16 bit data type in
+pcre.h(.in) and disable (comment out) this message. */
+#error Warning: PCRE_SCHAR16 is not a 16 bit data type.
+#endif
+typedef pcre_uint16 uschar;
+#endif
+
+/* An 8 bit unsigned data type. */
+typedef unsigned char pcre_uint8;
/* This is an unsigned int value that no character can ever have. UTF-8
characters only go up to 0x7fffffff (though Unicode doesn't go beyond
@@ -270,9 +285,10 @@ must begin with PCRE_. */
#define PCRE_PUCHAR CUSTOM_SUBJECT_PTR
#else
#define PCRE_PUCHAR const pcre_uchar *
-#endif
-
+/* PCRE_SPTR is defined in pcre.h. */
+#define USPTR const uschar *
+#endif
/* Include the public PCRE header and the definitions of UCP character property
values. */
@@ -1936,7 +1952,11 @@ extern BOOL _pcre_is_newline(PCRE_PUCHAR, int, PCRE_PUCHAR,
extern int _pcre_ord2utf8(int, pcre_uint8 *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
+#ifndef COMPILE_PCRE16
extern int _pcre_valid_utf8(PCRE_PUCHAR, int, int *);
+#else
+extern int _pcre16_valid_utf16(PCRE_PUCHAR, int, int *);
+#endif
extern BOOL _pcre_was_newline(PCRE_PUCHAR, int, PCRE_PUCHAR,
int *, BOOL);
extern BOOL _pcre_xclass(int, const pcre_uchar *);