diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-11-21 11:44:55 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-11-21 11:44:55 +0000 |
commit | a8cb60f9a164ea40e49ee7d3cf6c465c0540b9bc (patch) | |
tree | fc611004b87a826965d9d295370198b50007dd37 | |
parent | c48ca85a9456be9a04cb0b92df853d6dac797414 (diff) | |
download | pcre-a8cb60f9a164ea40e49ee7d3cf6c465c0540b9bc.tar.gz |
More 16-bit patches
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@757 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | Makefile.am | 49 | ||||
-rw-r--r-- | configure.ac | 126 | ||||
-rw-r--r-- | libpcre.pc.in | 2 | ||||
-rw-r--r-- | libpcre16.pc.in | 12 | ||||
-rw-r--r-- | pcre.h.in | 28 | ||||
-rw-r--r-- | pcre16_compile.c | 45 | ||||
-rw-r--r-- | pcre16_convert_utf16.c | 87 | ||||
-rw-r--r-- | pcre16_valid_utf16.c | 143 | ||||
-rw-r--r-- | pcre_compile.c | 16 | ||||
-rw-r--r-- | pcre_internal.h | 34 |
10 files changed, 522 insertions, 20 deletions
diff --git a/Makefile.am b/Makefile.am index 320becb..4651c6f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -79,7 +79,7 @@ check_SCRIPTS = dist_noinst_SCRIPTS = # Some of the binaries we make are to be installed, and others are -# (non-user-visible) helper programs needed to build libpcre. +# (non-user-visible) helper programs needed to build libpcre or libpcre16. bin_PROGRAMS = noinst_PROGRAMS = @@ -170,6 +170,9 @@ endif # WITH_REBUILD_CHARTABLES ## The main pcre library + +# Build the 8 bit library if it is enabled. +if WITH_PCRE8 lib_LTLIBRARIES += libpcre.la libpcre_la_SOURCES = \ pcre_compile.c \ @@ -199,6 +202,22 @@ libpcre_la_SOURCES = \ nodist_libpcre_la_SOURCES = \ pcre_chartables.c +endif # WITH_PCRE8 + +# Build the 16 bit library if it is enabled. +if WITH_PCRE16 +lib_LTLIBRARIES += libpcre16.la +libpcre16_la_SOURCES = \ + pcre16_compile.c \ + pcre16_convert_utf16.c \ + pcre16_valid_utf16.c + +## This file is generated as part of the building process, so don't distribute. +nodist_libpcre16_la_SOURCES = \ + pcre_chartables.c + +endif # WITH_PCRE16 + # The pcre_printint.src file is #included by some source files, so it must be # distributed. The pcre_chartables.c.dist file is the default version of # pcre_chartables.c, used unless --enable-rebuild-chartables is specified. @@ -224,7 +243,12 @@ EXTRA_DIST += \ sljit/sljitNativeX86_common.c \ sljit/sljitUtils.c +if WITH_PCRE8 libpcre_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS) +endif # WITH_PCRE8 +if WITH_PCRE16 +libpcre16_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS) +endif # WITH_PCRE16 CLEANFILES += pcre_chartables.c @@ -233,15 +257,23 @@ if WITH_JIT TESTS += pcre_jit_test noinst_PROGRAMS += pcre_jit_test pcre_jit_test_SOURCES = pcre_jit_test.c -pcre_jit_test_LDADD = libpcre.la +pcre_jit_test_LDADD = +if WITH_PCRE8 +pcre_jit_test_LDADD += libpcre.la +endif # WITH_PCRE8 +if WITH_PCRE16 +pcre_jit_test_LDADD += libpcre16.la +endif # WITH_PCRE16 endif # WITH_JIT ## A version of the main pcre library that has a posix re API. 
+if WITH_PCRE8 lib_LTLIBRARIES += libpcreposix.la libpcreposix_la_SOURCES = \ pcreposix.c libpcreposix_la_LDFLAGS = $(EXTRA_LIBPCREPOSIX_LDFLAGS) libpcreposix_la_LIBADD = libpcre.la +endif # WITH_PCRE8 ## There's a C++ library as well. if WITH_PCRE_CPP @@ -282,13 +314,19 @@ dist_noinst_SCRIPTS += RunTest EXTRA_DIST += RunTest.bat bin_PROGRAMS += pcretest pcretest_SOURCES = pcretest.c -pcretest_LDADD = libpcreposix.la $(LIBREADLINE) +pcretest_LDADD = $(LIBREADLINE) +if WITH_PCRE8 +pcretest_LDADD += libpcreposix.la +endif # WITH_PCRE8 TESTS += RunGrepTest dist_noinst_SCRIPTS += RunGrepTest bin_PROGRAMS += pcregrep pcregrep_SOURCES = pcregrep.c -pcregrep_LDADD = libpcreposix.la $(LIBZ) $(LIBBZ2) +pcregrep_LDADD = $(LIBZ) $(LIBBZ2) +if WITH_PCRE8 +pcregrep_LDADD += libpcreposix.la +endif # WITH_PCRE8 EXTRA_DIST += \ testdata/grepinput \ @@ -378,6 +416,9 @@ pcre.dll: $(DLL_OBJS) # We have .pc files for pkg-config users. pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libpcre.pc libpcreposix.pc +if WITH_PCRE16 +pkgconfig_DATA += libpcre16.pc +endif if WITH_PCRE_CPP pkgconfig_DATA += libpcrecpp.pc endif diff --git a/configure.ac b/configure.ac index ddee8e8..ff516c7 100644 --- a/configure.ac +++ b/configure.ac @@ -104,6 +104,18 @@ then htmldir='${docdir}/html' fi +# Handle --disable-pcre8 (enabled by default) +AC_ARG_ENABLE(pcre8, + AS_HELP_STRING([--disable-pcre8], + [enable 8 bit character support]), + , enable_pcre8=unset) + +# Handle --enable-pcre16 (disabled by default) +AC_ARG_ENABLE(pcre16, + AS_HELP_STRING([--enable-pcre16], + [enable 16 bit character support]), + , enable_pcre16=unset) + # Handle --disable-cpp. The substitution of enable_cpp is needed for use in # pcre-config. 
AC_ARG_ENABLE(cpp, @@ -136,10 +148,16 @@ AC_ARG_ENABLE(utf8, [enable UTF-8 support (incompatible with --enable-ebcdic)]), , enable_utf8=unset) +# Handle --enable-utf16 (disabled by default) +AC_ARG_ENABLE(utf16, + AS_HELP_STRING([--enable-utf16], + [enable UTF-16 support (incompatible with --enable-ebcdic)]), + , enable_utf16=unset) + # Handle --enable-unicode-properties AC_ARG_ENABLE(unicode-properties, AS_HELP_STRING([--enable-unicode-properties], - [enable Unicode properties support (implies --enable-utf8)]), + [enable Unicode properties support (implies --enable-utf8 and --enable-utf16)]), , enable_unicode_properties=no) # Handle --enable-newline=NL @@ -245,8 +263,46 @@ AC_ARG_WITH(match-limit-recursion, [default limit on internal recursion (default=MATCH_LIMIT)]), , with_match_limit_recursion=MATCH_LIMIT) -# Make sure that if enable_unicode_properties was set, that UTF-8 support -# is enabled. +# Make sure that if enable_utf8 was set, that enable_pcre8 support is enabled +if test "x$enable_utf8" = "xyes" +then + if test "x$enable_pcre8" = "xno" + then + AC_MSG_ERROR([support for UTF-8 requires pcre library with 8 bit characters]) + fi + enable_pcre8=yes +fi + +# Make sure that if enable_utf16 was set, that enable_pcre16 support is enabled +if test "x$enable_utf16" = "xyes" +then + if test "x$enable_pcre16" = "xno" + then + AC_MSG_ERROR([support for UTF-16 requires pcre library with 16 bit characters]) + fi + enable_pcre16=yes +fi + +# Set the default value for pcre8 +if test "x$enable_pcre8" = "xunset" +then + enable_pcre8=yes +fi + +# Set the default value for pcre16 +if test "x$enable_pcre16" = "xunset" +then + enable_pcre16=no +fi + +# Make sure enable_pcre8 or enable_pcre16 was set +if test "x$enable_pcre8$enable_pcre16" = "xnono" +then + AC_MSG_ERROR([Either 8 or 16 bit (or both) pcre library must be enabled]) +fi + +# Make sure that if enable_unicode_properties was set, that UTF-8 or UTF-16 +# support enabled. 
# if test "x$enable_unicode_properties" = "xyes" then @@ -254,17 +310,44 @@ then then AC_MSG_ERROR([support for Unicode properties requires UTF-8 support]) fi - enable_utf8=yes + if test "x$enable_utf16" = "xno" + then + AC_MSG_ERROR([support for Unicode properties requires UTF-16 support]) + fi + if test "x$enable_pcre8" = "xyes" + then + enable_utf8=yes + fi + if test "x$enable_pcre16" = "xyes" + then + enable_utf16=yes + fi fi +# enable_utf8 is disabled by default. if test "x$enable_utf8" = "xunset" then enable_utf8=no fi +# enable_utf16 is disabled by default. +if test "x$enable_utf16" = "xunset" +then + enable_utf16=no +fi + +# Make sure that if enable_cpp was set, that enable_pcre8 support is enabled +if test "x$enable_cpp" = "xyes" +then + if test "x$enable_pcre8" = "xno" + then + AC_MSG_ERROR([C++ library requires pcre library with 8 bit characters]) + fi +fi + # Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled. -# Also check that UTF-8 support is not requested, because PCRE cannot handle -# EBCDIC and UTF-8 in the same build. To do so it would need to use different +# Also check that UTF-8 or UTF-16 support is not requested, because PCRE cannot +# handle EBCDIC and UTF in the same build. To do so it would need to use different # character constants depending on the mode. # if test "x$enable_ebcdic" = "xyes" @@ -274,6 +357,10 @@ then then AC_MSG_ERROR([support for EBCDIC and UTF-8 cannot be enabled at the same time]) fi + if test "x$enable_utf16" = "xyes" + then + AC_MSG_ERROR([support for EBCDIC and UTF-16 cannot be enabled at the same time]) + fi fi # Convert the newline identifier into the appropriate integer value. 
@@ -410,10 +497,13 @@ AC_SUBST(pcre_have_type_traits) AC_SUBST(pcre_have_bits_type_traits) # Conditional compilation +AM_CONDITIONAL(WITH_PCRE8, test "x$enable_pcre8" = "xyes") +AM_CONDITIONAL(WITH_PCRE16, test "x$enable_pcre16" = "xyes") AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes") AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes") AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes") AM_CONDITIONAL(WITH_UTF8, test "x$enable_utf8" = "xyes") +AM_CONDITIONAL(WITH_UTF16, test "x$enable_utf16" = "xyes") # Checks for typedefs, structures, and compiler characteristics. @@ -482,6 +572,16 @@ AC_SUBST(PCRE_STATIC_CFLAG) # Here is where pcre specific defines are handled +if test "$enable_pcre8" = "yes"; then + AC_DEFINE([SUPPORT_PCRE8], [], [ + Define to enable the 8 bit PCRE library.]) +fi + +if test "$enable_pcre16" = "yes"; then + AC_DEFINE([SUPPORT_PCRE16], [], [ + Define to enable the 16 bit PCRE library.]) +fi + if test "$enable_jit" = "yes"; then AC_DEFINE([SUPPORT_JIT], [], [ Define to enable support for Just-In-Time compiling.]) @@ -502,6 +602,14 @@ if test "$enable_utf8" = "yes"; then *or* ASCII/UTF-8, but not both at once.]) fi +if test "$enable_utf16" = "yes"; then + AC_DEFINE([SUPPORT_UTF16], [], [ + Define to enable support for the UTF-16 Unicode encoding. This will + work even in an EBCDIC environment, but it is incompatible with + the EBCDIC macro. That is, PCRE can support *either* EBCDIC code + *or* ASCII/UTF-16, but not both at once.]) +fi + if test "$enable_unicode_properties" = "yes"; then AC_DEFINE([SUPPORT_UCP], [], [ Define to enable support for Unicode properties.]) @@ -720,7 +828,8 @@ AC_SUBST(LIBREADLINE) AC_CONFIG_FILES( Makefile libpcre.pc - libpcreposix.pc + libpcre16.pc + libpcreposix.pc libpcrecpp.pc pcre-config pcre.h @@ -756,9 +865,12 @@ $PACKAGE-$VERSION configuration summary: Linker flags .................... : ${LDFLAGS} Extra libraries ................. 
: ${LIBS} + Build 8 bit pcre library ........ : ${enable_pcre8} + Build 16 bit pcre library ....... : ${enable_pcre16} Build C++ library ............... : ${enable_cpp} Enable JIT compiling support .... : ${enable_jit} Enable UTF-8 support ............ : ${enable_utf8} + Enable UTF-16 support ........... : ${enable_utf16} Unicode properties .............. : ${enable_unicode_properties} Newline char/sequence ........... : ${enable_newline} \R matches only ANYCRLF ......... : ${enable_bsr_anycrlf} diff --git a/libpcre.pc.in b/libpcre.pc.in index 2c3fa19..1f26b32 100644 --- a/libpcre.pc.in +++ b/libpcre.pc.in @@ -6,7 +6,7 @@ libdir=@libdir@ includedir=@includedir@ Name: libpcre -Description: PCRE - Perl compatible regular expressions C library +Description: PCRE - Perl compatible regular expressions C library with 8 bit character support Version: @PACKAGE_VERSION@ Libs: -L${libdir} -lpcre Cflags: -I${includedir} @PCRE_STATIC_CFLAG@ diff --git a/libpcre16.pc.in b/libpcre16.pc.in new file mode 100644 index 0000000..12009f8 --- /dev/null +++ b/libpcre16.pc.in @@ -0,0 +1,12 @@ +# Package Information for pkg-config + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libpcre16 +Description: PCRE - Perl compatible regular expressions C library with 16 bit character support +Version: @PACKAGE_VERSION@ +Libs: -L${libdir} -lpcre +Cflags: -I${includedir} @PCRE_STATIC_CFLAG@ @@ -111,7 +111,8 @@ compile-time only bits for runtime options, or vice versa. 
*/ #define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */ #define PCRE_UNGREEDY 0x00000200 /* Compile */ #define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */ -#define PCRE_UTF8 0x00000800 /* Compile */ +#define PCRE_UTF8 0x00000800 /* Compile (Same as PCRE_UTF16) */ +#define PCRE_UTF16 0x00000800 /* Compile (Same as PCRE_UTF8) */ #define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */ #define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */ #define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */ @@ -191,6 +192,14 @@ compile-time only bits for runtime options, or vice versa. */ #define PCRE_UTF8_ERR20 20 #define PCRE_UTF8_ERR21 21 +/* Specific error codes for UTF-16 validity checks */ + +#define PCRE_UTF16_ERR0 0 +#define PCRE_UTF16_ERR1 1 +#define PCRE_UTF16_ERR2 2 +#define PCRE_UTF16_ERR3 3 +#define PCRE_UTF16_ERR4 4 + /* Request types for pcre_fullinfo() */ #define PCRE_INFO_OPTIONS 0 @@ -250,6 +259,17 @@ typedef struct real_pcre pcre; struct real_pcre_jit_stack; /* declaration; the definition is private */ typedef struct real_pcre_jit_stack pcre_jit_stack; +/* If PCRE is compiled with 16 bit character support, PCRE_SCHAR16 must contain +a 16 bit wide signed data type. Otherwise it can be a dummy data type since +pcre16 functions are not implemented. There is a check for this in pcre_internal.h. */ +#ifndef PCRE_SCHAR16 +#define PCRE_SCHAR16 short +#endif + +#ifndef PCRE_SPTR16 +#define PCRE_SPTR16 const PCRE_SCHAR16 * +#endif + /* When PCRE is compiled as a C++ library, the subject pointer type can be replaced with a custom type. For conventional use, the public interface is a const char *. 
*/ @@ -326,8 +346,12 @@ typedef pcre_jit_stack *(*pcre_jit_callback)(void *); PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *, const unsigned char *); +PCRE_EXP_DECL pcre *pcre16_compile(PCRE_SPTR16, int, const char **, int *, + const unsigned char *); PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **, int *, const unsigned char *); +PCRE_EXP_DECL pcre *pcre16_compile2(PCRE_SPTR16, int, int *, const short **, + int *, const unsigned char *); PCRE_EXP_DECL int pcre_config(int, void *); PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *, int *, int, const char *, char *, int); @@ -353,6 +377,8 @@ PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int, PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *); PCRE_EXP_DECL const unsigned char *pcre_maketables(void); PCRE_EXP_DECL int pcre_refcount(pcre *, int); +PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *, + PCRE_SPTR16, int, int); PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **); PCRE_EXP_DECL void pcre_free_study(pcre_extra *); PCRE_EXP_DECL const char *pcre_version(void); diff --git a/pcre16_compile.c b/pcre16_compile.c new file mode 100644 index 0000000..d682f99 --- /dev/null +++ b/pcre16_compile.c @@ -0,0 +1,45 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Copyright (c) 1997-2011 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* Generate code with 16 bit character support. 
*/ +#define COMPILE_PCRE16 + +#include "pcre_compile.c" + +/* End of pcre16_compile.c */ diff --git a/pcre16_convert_utf16.c b/pcre16_convert_utf16.c new file mode 100644 index 0000000..34d6965 --- /dev/null +++ b/pcre16_convert_utf16.c @@ -0,0 +1,87 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2009 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains a function for converting any UTF-16 character +strings to host byte order. */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre_internal.h" + +int +pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms) +{ +#ifdef SUPPORT_UTF16 +/* This function converts any UTF-16 string to host byte order and optionally removes +any Byte Order Marks (BOMS). Returns the remaining length. */ +BOOL same_bo = TRUE; +PCRE_SPTR16 end = input + length; +/* The c variable must be unsigned. */ +register uschar c; + +while (input < end) + { + c = *input++; + if (c == 0xfeff || c == 0xfffe) + { + /* Detecting the byte order of the machine is unnecessary, it is + enough to know that the UTF-16 string has the same byte order or not. */ + same_bo = c == 0xfeff; + if (keep_boms != 0) + *output++ = 0xfeff; + else + length--; + } + else + *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. 
*/ + } + +#else +(void)(output); /* Keep picky compilers happy */ +(void)(input); +(void)(keep_boms); +#endif +return length; +} + +/* End of pcre16_convert_utf16.c */ diff --git a/pcre16_valid_utf16.c b/pcre16_valid_utf16.c new file mode 100644 index 0000000..85c4e4d --- /dev/null +++ b/pcre16_valid_utf16.c @@ -0,0 +1,143 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2009 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains an internal function for validating UTF-16 character +strings. */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre_internal.h" + + +/************************************************* +* Validate a UTF-16 string * +*************************************************/ + +/* This function is called (optionally) at the start of compile or match, to +check that a supposed UTF-16 string is actually valid. The early check means +that subsequent code can assume it is dealing with a valid string. The check +can be turned off for maximum performance, but the consequences of supplying an +invalid string are then undefined. + +From release 8.21 more information about the details of the error are passed +back in the returned value: + +PCRE_UTF16_ERR0 No error +PCRE_UTF16_ERR1 Missing low surrogate at the end of the string +PCRE_UTF16_ERR2 Invalid low surrogate +PCRE_UTF16_ERR3 Isolated low surrogate +PCRE_UTF16_ERR4 Not allowed character. 
+ +Arguments: + string points to the string + length length of string, or -1 if the string is zero-terminated + erroroffset pointer to an error position offset variable + +Returns: = 0 if the string is a valid UTF-16 string + > 0 otherwise, setting the offset of the bad character +*/ + +int +_pcre16_valid_utf16(USPTR string, int length, int *erroroffset) +{ +#ifdef SUPPORT_UTF16 +register USPTR p; +register uschar c; + +if (length < 0) + { + for (p = string; *p != 0; p++); + length = p - string; + } + +for (p = string; length-- > 0; p++) + { + c = *p; + + if ((c & 0xf800) != 0xd800) + { + /* Normal UTF-16 code point. Neither high nor low surrogate. */ + + /* This is probably a BOM from a different byte-order. + Regardless, the string is rejected. */ + if (c == 0xfffe) + { + *erroroffset = p - string; + return PCRE_UTF16_ERR4; + } + } + else if ((c & 0x0400) == 0) + { + /* High surrogate. */ + + /* Must be followed by a low surrogate. */ + if (length == 0) + { + *erroroffset = p - string; + return PCRE_UTF16_ERR1; + } + p++; + length--; + if ((*p & 0xfc00) != 0xdc00) + { + *erroroffset = p - string; + return PCRE_UTF16_ERR2; + } + } + else + { + /* Isolated low surrogate. Always an error. 
*/ + *erroroffset = p - string; + return PCRE_UTF16_ERR3; + } + } + +#else /* SUPPORT_UTF16 */ +(void)(string); /* Keep picky compilers happy */ +(void)(length); +#endif + +return PCRE_UTF16_ERR0; /* This indicates success */ +} + +/* End of pcre16_valid_utf16.c */ diff --git a/pcre_compile.c b/pcre_compile.c index 55d4226..71e41bd 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -7213,17 +7213,33 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ +#ifndef COMPILE_PCRE16 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) +#else +PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION +pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr, + int *erroroffset, const unsigned char *tables) +#endif { +#ifndef COMPILE_PCRE16 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); +#else +return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables); +#endif } +#ifndef COMPILE_PCRE16 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) +#else +PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION +pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr, + const char **errorptr, int *erroroffset, const unsigned char *tables) +#endif { real_pcre *re; int length = 1; /* For final END opcode */ diff --git a/pcre_internal.h b/pcre_internal.h index 46ff701..f9a2731 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -51,11 +51,11 @@ functions whose names all begin with "_pcre_". */ #define PCRE_DEBUG #endif -/* We do not support both EBCDIC and UTF-8 at the same time. The "configure" +/* We do not support both EBCDIC and UTF-8/16 at the same time. The "configure" script prevents both being selected, but not everybody uses "configure". 
*/ -#if defined EBCDIC && defined SUPPORT_UTF8 -#error The use of both EBCDIC and SUPPORT_UTF8 is not supported. +#if defined EBCDIC && (defined SUPPORT_UTF8 || defined SUPPORT_UTF16) +#error The use of both EBCDIC and SUPPORT_UTF8/16 is not supported. #endif /* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The @@ -208,10 +208,25 @@ by "configure". */ /* All character handling must be done as unsigned characters. Otherwise there are problems with top-bit-set characters and functions such as isspace(). -However, we leave the interface to the outside world as char *, because that -should make things easier for callers. */ +However, we leave the interface to the outside world as char * or short *, +because that should make things easier for callers. We define a short type +for the current character representation (either 8 or 16 bit) to save lots +of typing. I tried "uchar", but it causes problems on Digital Unix, where +it is defined in sys/types, so use "uschar" instead. */ +#ifndef COMPILE_PCRE16 typedef unsigned char pcre_uchar; +#else +#if USHRT_MAX != 65535 +/* This is a warning message. Change PCRE_SCHAR16 to a 16 bit data type in +pcre.h(.in) and disable (comment out) this message. */ +#error Warning: PCRE_SCHAR16 is not a 16 bit data type. +#endif +typedef pcre_uint16 uschar; +#endif + +/* An 8 bit unsigned data type. */ +typedef unsigned char pcre_uint8; /* This is an unsigned int value that no character can ever have. UTF-8 characters only go up to 0x7fffffff (though Unicode doesn't go beyond @@ -270,9 +285,10 @@ must begin with PCRE_. */ #define PCRE_PUCHAR CUSTOM_SUBJECT_PTR #else #define PCRE_PUCHAR const pcre_uchar * -#endif - +/* PCRE_SPTR is defined in pcre.h. */ +#define USPTR const uschar * +#endif /* Include the public PCRE header and the definitions of UCP character property values. 
*/ @@ -1936,7 +1952,11 @@ extern BOOL _pcre_is_newline(PCRE_PUCHAR, int, PCRE_PUCHAR, extern int _pcre_ord2utf8(int, pcre_uint8 *); extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *); +#ifndef COMPILE_PCRE16 extern int _pcre_valid_utf8(PCRE_PUCHAR, int, int *); +#else +extern int _pcre16_valid_utf16(PCRE_PUCHAR, int, int *); +#endif extern BOOL _pcre_was_newline(PCRE_PUCHAR, int, PCRE_PUCHAR, int *, BOOL); extern BOOL _pcre_xclass(int, const pcre_uchar *); |