summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- MANIFEST 162
-rw-r--r-- embed.h 1
-rw-r--r-- global.sym 1
-rw-r--r-- proto.h 22
-rw-r--r-- regexec.c 22
-rw-r--r-- toke.c 2
-rw-r--r-- utf8.h 6
-rw-r--r-- win32/Makefile 2
-rw-r--r-- win32/makefile.mk 4
9 files changed, 206 insertions, 16 deletions
diff --git a/MANIFEST b/MANIFEST
index 056e369d37..192caefdcd 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -589,6 +589,165 @@ lib/syslog.pl Perl library supporting syslogging
lib/tainted.pl Old code for tainting
lib/termcap.pl Perl library supporting termcap usage
lib/timelocal.pl Perl library supporting inverse of localtime, gmtime
+lib/unicode/ArabLink.pl Unicode character database
+lib/unicode/ArabLnkGrp.pl Unicode character database
+lib/unicode/Bidirectional.pl Unicode character database
+lib/unicode/Block.pl Unicode character database
+lib/unicode/Category.pl Unicode character database
+lib/unicode/CombiningClass.pl Unicode character database
+lib/unicode/Decomposition.pl Unicode character database
+lib/unicode/In/AlphabeticPresentationForms.pl Unicode character database
+lib/unicode/In/Arabic.pl Unicode character database
+lib/unicode/In/ArabicPresentationForms-A.pl Unicode character database
+lib/unicode/In/ArabicPresentationForms-B.pl Unicode character database
+lib/unicode/In/Armenian.pl Unicode character database
+lib/unicode/In/Arrows.pl Unicode character database
+lib/unicode/In/BasicLatin.pl Unicode character database
+lib/unicode/In/Bengali.pl Unicode character database
+lib/unicode/In/BlockElements.pl Unicode character database
+lib/unicode/In/Bopomofo.pl Unicode character database
+lib/unicode/In/BoxDrawing.pl Unicode character database
+lib/unicode/In/CJKCompatibility.pl Unicode character database
+lib/unicode/In/CJKCompatibilityForms.pl Unicode character database
+lib/unicode/In/CJKCompatibilityIdeographs.pl Unicode character database
+lib/unicode/In/CJKSymbolsandPunctuation.pl Unicode character database
+lib/unicode/In/CJKUnifiedIdeographs.pl Unicode character database
+lib/unicode/In/CombiningDiacriticalMarks.pl Unicode character database
+lib/unicode/In/CombiningHalfMarks.pl Unicode character database
+lib/unicode/In/CombiningMarksforSymbols.pl Unicode character database
+lib/unicode/In/ControlPictures.pl Unicode character database
+lib/unicode/In/CurrencySymbols.pl Unicode character database
+lib/unicode/In/Cyrillic.pl Unicode character database
+lib/unicode/In/Devanagari.pl Unicode character database
+lib/unicode/In/Dingbats.pl Unicode character database
+lib/unicode/In/EnclosedAlphanumerics.pl Unicode character database
+lib/unicode/In/EnclosedCJKLettersandMonths.pl Unicode character database
+lib/unicode/In/GeneralPunctuation.pl Unicode character database
+lib/unicode/In/GeometricShapes.pl Unicode character database
+lib/unicode/In/Georgian.pl Unicode character database
+lib/unicode/In/Greek.pl Unicode character database
+lib/unicode/In/GreekExtended.pl Unicode character database
+lib/unicode/In/Gujarati.pl Unicode character database
+lib/unicode/In/Gurmukhi.pl Unicode character database
+lib/unicode/In/HalfwidthandFullwidthForms.pl Unicode character database
+lib/unicode/In/HangulCompatibilityJamo.pl Unicode character database
+lib/unicode/In/HangulJamo.pl Unicode character database
+lib/unicode/In/HangulSyllables.pl Unicode character database
+lib/unicode/In/Hebrew.pl Unicode character database
+lib/unicode/In/HighPrivateUseSurrogates.pl Unicode character database
+lib/unicode/In/HighSurrogates.pl Unicode character database
+lib/unicode/In/Hiragana.pl Unicode character database
+lib/unicode/In/IPAExtensions.pl Unicode character database
+lib/unicode/In/Kanbun.pl Unicode character database
+lib/unicode/In/Kannada.pl Unicode character database
+lib/unicode/In/Katakana.pl Unicode character database
+lib/unicode/In/Lao.pl Unicode character database
+lib/unicode/In/Latin-1Supplement.pl Unicode character database
+lib/unicode/In/LatinExtended-A.pl Unicode character database
+lib/unicode/In/LatinExtended-B.pl Unicode character database
+lib/unicode/In/LatinExtendedAdditional.pl Unicode character database
+lib/unicode/In/LetterlikeSymbols.pl Unicode character database
+lib/unicode/In/LowSurrogates.pl Unicode character database
+lib/unicode/In/Malayalam.pl Unicode character database
+lib/unicode/In/MathematicalOperators.pl Unicode character database
+lib/unicode/In/MiscellaneousSymbols.pl Unicode character database
+lib/unicode/In/MiscellaneousTechnical.pl Unicode character database
+lib/unicode/In/NumberForms.pl Unicode character database
+lib/unicode/In/OpticalCharacterRecognition.pl Unicode character database
+lib/unicode/In/Oriya.pl Unicode character database
+lib/unicode/In/PrivateUse.pl Unicode character database
+lib/unicode/In/SmallFormVariants.pl Unicode character database
+lib/unicode/In/SpacingModifierLetters.pl Unicode character database
+lib/unicode/In/Specials.pl Unicode character database
+lib/unicode/In/SuperscriptsandSubscripts.pl Unicode character database
+lib/unicode/In/Tamil.pl Unicode character database
+lib/unicode/In/Telugu.pl Unicode character database
+lib/unicode/In/Thai.pl Unicode character database
+lib/unicode/In/Tibetan.pl Unicode character database
+lib/unicode/Is/Alnum.pl Unicode character database
+lib/unicode/Is/Alpha.pl Unicode character database
+lib/unicode/Is/BidiAN.pl Unicode character database
+lib/unicode/Is/BidiB.pl Unicode character database
+lib/unicode/Is/BidiCS.pl Unicode character database
+lib/unicode/Is/BidiEN.pl Unicode character database
+lib/unicode/Is/BidiES.pl Unicode character database
+lib/unicode/Is/BidiET.pl Unicode character database
+lib/unicode/Is/BidiL.pl Unicode character database
+lib/unicode/Is/BidiON.pl Unicode character database
+lib/unicode/Is/BidiR.pl Unicode character database
+lib/unicode/Is/BidiS.pl Unicode character database
+lib/unicode/Is/BidiWS.pl Unicode character database
+lib/unicode/Is/C.pl Unicode character database
+lib/unicode/Is/Cc.pl Unicode character database
+lib/unicode/Is/Cn.pl Unicode character database
+lib/unicode/Is/Co.pl Unicode character database
+lib/unicode/Is/DCcircle.pl Unicode character database
+lib/unicode/Is/DCcompat.pl Unicode character database
+lib/unicode/Is/DCfinal.pl Unicode character database
+lib/unicode/Is/DCfont.pl Unicode character database
+lib/unicode/Is/DCinital.pl Unicode character database
+lib/unicode/Is/DCinitial.pl Unicode character database
+lib/unicode/Is/DCisolated.pl Unicode character database
+lib/unicode/Is/DCnarrow.pl Unicode character database
+lib/unicode/Is/DCnoBreak.pl Unicode character database
+lib/unicode/Is/DCsmall.pl Unicode character database
+lib/unicode/Is/DCsquare.pl Unicode character database
+lib/unicode/Is/DCsub.pl Unicode character database
+lib/unicode/Is/DCsuper.pl Unicode character database
+lib/unicode/Is/DCvertical.pl Unicode character database
+lib/unicode/Is/DCwide.pl Unicode character database
+lib/unicode/Is/DecoCanon.pl Unicode character database
+lib/unicode/Is/DecoCompat.pl Unicode character database
+lib/unicode/Is/Digit.pl Unicode character database
+lib/unicode/Is/L.pl Unicode character database
+lib/unicode/Is/Ll.pl Unicode character database
+lib/unicode/Is/Lm.pl Unicode character database
+lib/unicode/Is/Lo.pl Unicode character database
+lib/unicode/Is/Lower.pl Unicode character database
+lib/unicode/Is/Lt.pl Unicode character database
+lib/unicode/Is/Lu.pl Unicode character database
+lib/unicode/Is/M.pl Unicode character database
+lib/unicode/Is/Mc.pl Unicode character database
+lib/unicode/Is/Mirrored.pl Unicode character database
+lib/unicode/Is/Mn.pl Unicode character database
+lib/unicode/Is/N.pl Unicode character database
+lib/unicode/Is/Nd.pl Unicode character database
+lib/unicode/Is/No.pl Unicode character database
+lib/unicode/Is/P.pl Unicode character database
+lib/unicode/Is/Pd.pl Unicode character database
+lib/unicode/Is/Pe.pl Unicode character database
+lib/unicode/Is/Po.pl Unicode character database
+lib/unicode/Is/Print.pl Unicode character database
+lib/unicode/Is/Ps.pl Unicode character database
+lib/unicode/Is/S.pl Unicode character database
+lib/unicode/Is/Sc.pl Unicode character database
+lib/unicode/Is/Sm.pl Unicode character database
+lib/unicode/Is/So.pl Unicode character database
+lib/unicode/Is/Space.pl Unicode character database
+lib/unicode/Is/Upper.pl Unicode character database
+lib/unicode/Is/Z.pl Unicode character database
+lib/unicode/Is/Zl.pl Unicode character database
+lib/unicode/Is/Zp.pl Unicode character database
+lib/unicode/Is/Zs.pl Unicode character database
+lib/unicode/JamoShort.pl Unicode character database
+lib/unicode/Makefile Unicode character database
+lib/unicode/Name.pl Unicode character database
+lib/unicode/Number.pl Unicode character database
+lib/unicode/To/Digit.pl Unicode character database
+lib/unicode/To/Lower.pl Unicode character database
+lib/unicode/To/Title.pl Unicode character database
+lib/unicode/To/Upper.pl Unicode character database
+lib/unicode/UnicodeData-Latest.txt Unicode character database
+lib/unicode/arabshp.txt Unicode character database
+lib/unicode/blocks.txt Unicode character database
+lib/unicode/index2.txt Unicode character database
+lib/unicode/jamo2.txt Unicode character database
+lib/unicode/mktables.PL Unicode character database generator
+lib/unicode/names2.txt Unicode character database
+lib/unicode/props2.txt Unicode character database
+lib/unicode/readme.txt Unicode character database info
+lib/utf8.pm Pragma to control Unicode support
+lib/utf8_heavy.pl Support routines for utf8 pragma
lib/validate.pl Perl library supporting wholesale file mode validation
lib/vars.pm Declare pseudo-imported global variables
makeaperl.SH perl script that produces a new perl binary
@@ -769,6 +928,7 @@ sv.c Scalar value code
sv.h Scalar value header
t/README Instructions for regression tests
t/TEST The regression tester
+t/UTEST Run regression tests with -Mutf8
t/base/cond.t See if conditionals work
t/base/if.t See if if works
t/base/lex.t See if lexical items work
@@ -971,6 +1131,8 @@ thread.sym Symbols for threads
toke.c The tokener
universal.c The default UNIVERSAL package methods
unixish.h Defines that are assumed on Unix
+utf8.c Unicode routines
+utf8.h Unicode header
util.c Utility routines
util.h Dummy header
utils/Makefile Extract the utility scripts
diff --git a/embed.h b/embed.h
index 6026c18922..ef19977ce9 100644
--- a/embed.h
+++ b/embed.h
@@ -1063,6 +1063,7 @@
#define utf8_distance Perl_utf8_distance
#define utf8_hop Perl_utf8_hop
#define utf8_to_uv Perl_utf8_to_uv
+#define utf8skip Perl_utf8skip
#define utilize Perl_utilize
#define uv_to_utf8 Perl_uv_to_utf8
#define varies Perl_varies
diff --git a/global.sym b/global.sym
index ac13e65c06..ef16b8a873 100644
--- a/global.sym
+++ b/global.sym
@@ -1102,6 +1102,7 @@ utf16_to_utf8_reversed
utf8_distance
utf8_hop
utf8_to_uv
+utf8skip
utilize
uv_to_utf8
wait4pid
diff --git a/proto.h b/proto.h
index acd88d0d37..7ee3cb4965 100644
--- a/proto.h
+++ b/proto.h
@@ -194,6 +194,28 @@ VIRTUAL U32 intro_my _((void));
VIRTUAL char* instr _((char* big, char* little));
VIRTUAL bool io_close _((IO* io));
VIRTUAL OP* invert _((OP* cmd));
+VIRTUAL bool is_uni_alnum _((U32 c));
+VIRTUAL bool is_uni_idfirst _((U32 c));
+VIRTUAL bool is_uni_alpha _((U32 c));
+VIRTUAL bool is_uni_space _((U32 c));
+VIRTUAL bool is_uni_digit _((U32 c));
+VIRTUAL bool is_uni_upper _((U32 c));
+VIRTUAL bool is_uni_lower _((U32 c));
+VIRTUAL bool is_uni_print _((U32 c));
+VIRTUAL U32 to_uni_upper _((U32 c));
+VIRTUAL U32 to_uni_title _((U32 c));
+VIRTUAL U32 to_uni_lower _((U32 c));
+VIRTUAL bool is_uni_alnum_lc _((U32 c));
+VIRTUAL bool is_uni_idfirst_lc _((U32 c));
+VIRTUAL bool is_uni_alpha_lc _((U32 c));
+VIRTUAL bool is_uni_space_lc _((U32 c));
+VIRTUAL bool is_uni_digit_lc _((U32 c));
+VIRTUAL bool is_uni_upper_lc _((U32 c));
+VIRTUAL bool is_uni_lower_lc _((U32 c));
+VIRTUAL bool is_uni_print_lc _((U32 c));
+VIRTUAL U32 to_uni_upper_lc _((U32 c));
+VIRTUAL U32 to_uni_title_lc _((U32 c));
+VIRTUAL U32 to_uni_lower_lc _((U32 c));
VIRTUAL bool is_utf8_alnum _((unsigned char *p));
VIRTUAL bool is_utf8_idfirst _((unsigned char *p));
VIRTUAL bool is_utf8_alpha _((unsigned char *p));
diff --git a/regexec.c b/regexec.c
index fe9f833634..400843bfe7 100644
--- a/regexec.c
+++ b/regexec.c
@@ -420,7 +420,7 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
I32 back_min =
prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
I32 delta = back_max - back_min;
- char *last = HOP(strend, -(CHR_SVLEN(must) + back_min)); /* Cannot start after this */
+ char *last = HOP(strend, 0-(CHR_SVLEN(must) + back_min)); /* Cannot start after this */
char *last1; /* Last position checked before */
if (s > PL_bostr)
@@ -1130,7 +1130,7 @@ regmatch(regnode *prog)
break;
case SANYUTF8:
if (nextchr & 0x80) {
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
if (locinput > PL_regeol)
sayNO;
nextchr = UCHARAT(locinput);
@@ -1147,7 +1147,7 @@ regmatch(regnode *prog)
break;
case ANYUTF8:
if (nextchr & 0x80) {
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
if (locinput > PL_regeol)
sayNO;
nextchr = UCHARAT(locinput);
@@ -1219,7 +1219,7 @@ regmatch(regnode *prog)
sayNO;
if (locinput >= PL_regeol)
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
case ANYOF:
@@ -1253,7 +1253,7 @@ regmatch(regnode *prog)
if (!(OP(scan) == ALNUMUTF8
? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput)))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
}
@@ -1283,7 +1283,7 @@ regmatch(regnode *prog)
if (OP(scan) == NALNUMUTF8
? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
}
@@ -1351,7 +1351,7 @@ regmatch(regnode *prog)
if (!(OP(scan) == SPACEUTF8
? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput)))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
}
@@ -1381,7 +1381,7 @@ regmatch(regnode *prog)
if (OP(scan) == NSPACEUTF8
? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
}
@@ -1399,7 +1399,7 @@ regmatch(regnode *prog)
if (nextchr & 0x80) {
if (!(swash_fetch(PL_utf8_digit,locinput)))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
}
@@ -1420,7 +1420,7 @@ regmatch(regnode *prog)
if (nextchr & 0x80) {
if (swash_fetch(PL_utf8_digit,locinput))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
nextchr = UCHARAT(locinput);
break;
}
@@ -1431,7 +1431,7 @@ regmatch(regnode *prog)
case CLUMP:
if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark, locinput))
sayNO;
- locinput += PL_utf8skip[nextchr];
+ locinput += utf8skip[nextchr];
while (locinput < PL_regeol && swash_fetch(PL_utf8_mark, locinput))
locinput += UTF8SKIP(locinput);
if (locinput > PL_regeol)
diff --git a/toke.c b/toke.c
index 9f9631990b..13cc965cd1 100644
--- a/toke.c
+++ b/toke.c
@@ -915,7 +915,7 @@ scan_const(char *start)
/* range begins (ignore - as first or last char) */
else if (*s == '-' && s+1 < send && s != start) {
if (utf) {
- *d++ = 0xff; /* use illegal utf8 byte--see pmtrans */
+ *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */
s++;
continue;
}
diff --git a/utf8.h b/utf8.h
index 6f86f721e6..f39e340423 100644
--- a/utf8.h
+++ b/utf8.h
@@ -8,7 +8,7 @@
*/
#ifdef DOINIT
-EXTCONST unsigned char PL_utf8skip[] = {
+EXTCONST unsigned char utf8skip[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
@@ -19,9 +19,9 @@ EXTCONST unsigned char PL_utf8skip[] = {
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,7,8, /* cjk etc. */
};
#else
-EXTCONST unsigned char PL_utf8skip[];
+EXTCONST unsigned char utf8skip[];
#endif
#define IN_UTF8 (curcop->op_private & HINT_UTF8)
-#define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
+#define UTF8SKIP(s) utf8skip[*(U8*)s]
diff --git a/win32/Makefile b/win32/Makefile
index 8570f5ddb1..addf487a44 100644
--- a/win32/Makefile
+++ b/win32/Makefile
@@ -354,6 +354,7 @@ MICROCORE_SRC = \
..\taint.c \
..\toke.c \
..\universal.c \
+ ..\utf8.c \
..\util.c
!IF "$(PERL_MALLOC)" == "define"
@@ -427,6 +428,7 @@ CORE_NOCFG_H = \
..\sv.h \
..\thread.h \
..\unixish.h \
+ ..\utf8.h \
..\util.h \
..\XSUB.h \
..\EXTERN.h \
diff --git a/win32/makefile.mk b/win32/makefile.mk
index ad24e2112e..50cdda9fc3 100644
--- a/win32/makefile.mk
+++ b/win32/makefile.mk
@@ -198,7 +198,7 @@ OPTIMIZE = -O2 $(RUNTIME)
LINK_DBG =
.ENDIF
-CFLAGS = -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \
+CFLAGS = -K -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \
$(PCHFLAGS) $(OPTIMIZE)
LINK_FLAGS = $(LINK_DBG) -L$(CCLIBDIR) $(EXTRALIBDIRS:^"-L")
OBJOUT_FLAG = -o
@@ -471,6 +471,7 @@ MICROCORE_SRC = \
..\taint.c \
..\toke.c \
..\universal.c \
+ ..\utf8.c \
..\util.c
.IF "$(PERL_MALLOC)" == "define"
@@ -544,6 +545,7 @@ CORE_NOCFG_H = \
..\sv.h \
..\thread.h \
..\unixish.h \
+ ..\utf8.h \
..\util.h \
..\XSUB.h \
..\EXTERN.h \