diff options
-rw-r--r-- | MANIFEST | 162 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | global.sym | 1 | ||||
-rw-r--r-- | proto.h | 22 | ||||
-rw-r--r-- | regexec.c | 22 | ||||
-rw-r--r-- | toke.c | 2 | ||||
-rw-r--r-- | utf8.h | 6 | ||||
-rw-r--r-- | win32/Makefile | 2 | ||||
-rw-r--r-- | win32/makefile.mk | 4 |
9 files changed, 206 insertions, 16 deletions
@@ -589,6 +589,165 @@ lib/syslog.pl Perl library supporting syslogging lib/tainted.pl Old code for tainting lib/termcap.pl Perl library supporting termcap usage lib/timelocal.pl Perl library supporting inverse of localtime, gmtime +lib/unicode/ArabLink.pl Unicode character database +lib/unicode/ArabLnkGrp.pl Unicode character database +lib/unicode/Bidirectional.pl Unicode character database +lib/unicode/Block.pl Unicode character database +lib/unicode/Category.pl Unicode character database +lib/unicode/CombiningClass.pl Unicode character database +lib/unicode/Decomposition.pl Unicode character database +lib/unicode/In/AlphabeticPresentationForms.pl Unicode character database +lib/unicode/In/Arabic.pl Unicode character database +lib/unicode/In/ArabicPresentationForms-A.pl Unicode character database +lib/unicode/In/ArabicPresentationForms-B.pl Unicode character database +lib/unicode/In/Armenian.pl Unicode character database +lib/unicode/In/Arrows.pl Unicode character database +lib/unicode/In/BasicLatin.pl Unicode character database +lib/unicode/In/Bengali.pl Unicode character database +lib/unicode/In/BlockElements.pl Unicode character database +lib/unicode/In/Bopomofo.pl Unicode character database +lib/unicode/In/BoxDrawing.pl Unicode character database +lib/unicode/In/CJKCompatibility.pl Unicode character database +lib/unicode/In/CJKCompatibilityForms.pl Unicode character database +lib/unicode/In/CJKCompatibilityIdeographs.pl Unicode character database +lib/unicode/In/CJKSymbolsandPunctuation.pl Unicode character database +lib/unicode/In/CJKUnifiedIdeographs.pl Unicode character database +lib/unicode/In/CombiningDiacriticalMarks.pl Unicode character database +lib/unicode/In/CombiningHalfMarks.pl Unicode character database +lib/unicode/In/CombiningMarksforSymbols.pl Unicode character database +lib/unicode/In/ControlPictures.pl Unicode character database +lib/unicode/In/CurrencySymbols.pl Unicode character database +lib/unicode/In/Cyrillic.pl Unicode character database +lib/unicode/In/Devanagari.pl Unicode character database +lib/unicode/In/Dingbats.pl Unicode character database +lib/unicode/In/EnclosedAlphanumerics.pl Unicode character database +lib/unicode/In/EnclosedCJKLettersandMonths.pl Unicode character database +lib/unicode/In/GeneralPunctuation.pl Unicode character database +lib/unicode/In/GeometricShapes.pl Unicode character database +lib/unicode/In/Georgian.pl Unicode character database +lib/unicode/In/Greek.pl Unicode character database +lib/unicode/In/GreekExtended.pl Unicode character database +lib/unicode/In/Gujarati.pl Unicode character database +lib/unicode/In/Gurmukhi.pl Unicode character database +lib/unicode/In/HalfwidthandFullwidthForms.pl Unicode character database +lib/unicode/In/HangulCompatibilityJamo.pl Unicode character database +lib/unicode/In/HangulJamo.pl Unicode character database +lib/unicode/In/HangulSyllables.pl Unicode character database +lib/unicode/In/Hebrew.pl Unicode character database +lib/unicode/In/HighPrivateUseSurrogates.pl Unicode character database +lib/unicode/In/HighSurrogates.pl Unicode character database +lib/unicode/In/Hiragana.pl Unicode character database +lib/unicode/In/IPAExtensions.pl Unicode character database +lib/unicode/In/Kanbun.pl Unicode character database +lib/unicode/In/Kannada.pl Unicode character database +lib/unicode/In/Katakana.pl Unicode character database +lib/unicode/In/Lao.pl Unicode character database +lib/unicode/In/Latin-1Supplement.pl Unicode character database +lib/unicode/In/LatinExtended-A.pl Unicode character database +lib/unicode/In/LatinExtended-B.pl Unicode character database +lib/unicode/In/LatinExtendedAdditional.pl Unicode character database +lib/unicode/In/LetterlikeSymbols.pl Unicode character database +lib/unicode/In/LowSurrogates.pl Unicode character database +lib/unicode/In/Malayalam.pl Unicode character database +lib/unicode/In/MathematicalOperators.pl Unicode character database +lib/unicode/In/MiscellaneousSymbols.pl Unicode character database +lib/unicode/In/MiscellaneousTechnical.pl Unicode character database +lib/unicode/In/NumberForms.pl Unicode character database +lib/unicode/In/OpticalCharacterRecognition.pl Unicode character database +lib/unicode/In/Oriya.pl Unicode character database +lib/unicode/In/PrivateUse.pl Unicode character database +lib/unicode/In/SmallFormVariants.pl Unicode character database +lib/unicode/In/SpacingModifierLetters.pl Unicode character database +lib/unicode/In/Specials.pl Unicode character database +lib/unicode/In/SuperscriptsandSubscripts.pl Unicode character database +lib/unicode/In/Tamil.pl Unicode character database +lib/unicode/In/Telugu.pl Unicode character database +lib/unicode/In/Thai.pl Unicode character database +lib/unicode/In/Tibetan.pl Unicode character database +lib/unicode/Is/Alnum.pl Unicode character database +lib/unicode/Is/Alpha.pl Unicode character database +lib/unicode/Is/BidiAN.pl Unicode character database +lib/unicode/Is/BidiB.pl Unicode character database +lib/unicode/Is/BidiCS.pl Unicode character database +lib/unicode/Is/BidiEN.pl Unicode character database +lib/unicode/Is/BidiES.pl Unicode character database +lib/unicode/Is/BidiET.pl Unicode character database +lib/unicode/Is/BidiL.pl Unicode character database +lib/unicode/Is/BidiON.pl Unicode character database +lib/unicode/Is/BidiR.pl Unicode character database +lib/unicode/Is/BidiS.pl Unicode character database +lib/unicode/Is/BidiWS.pl Unicode character database +lib/unicode/Is/C.pl Unicode character database +lib/unicode/Is/Cc.pl Unicode character database +lib/unicode/Is/Cn.pl Unicode character database +lib/unicode/Is/Co.pl Unicode character database +lib/unicode/Is/DCcircle.pl Unicode character database +lib/unicode/Is/DCcompat.pl Unicode character database +lib/unicode/Is/DCfinal.pl Unicode character database +lib/unicode/Is/DCfont.pl Unicode character database +lib/unicode/Is/DCinital.pl Unicode character database +lib/unicode/Is/DCinitial.pl Unicode character database +lib/unicode/Is/DCisolated.pl Unicode character database +lib/unicode/Is/DCnarrow.pl Unicode character database +lib/unicode/Is/DCnoBreak.pl Unicode character database +lib/unicode/Is/DCsmall.pl Unicode character database +lib/unicode/Is/DCsquare.pl Unicode character database +lib/unicode/Is/DCsub.pl Unicode character database +lib/unicode/Is/DCsuper.pl Unicode character database +lib/unicode/Is/DCvertical.pl Unicode character database +lib/unicode/Is/DCwide.pl Unicode character database +lib/unicode/Is/DecoCanon.pl Unicode character database +lib/unicode/Is/DecoCompat.pl Unicode character database +lib/unicode/Is/Digit.pl Unicode character database +lib/unicode/Is/L.pl Unicode character database +lib/unicode/Is/Ll.pl Unicode character database +lib/unicode/Is/Lm.pl Unicode character database +lib/unicode/Is/Lo.pl Unicode character database +lib/unicode/Is/Lower.pl Unicode character database +lib/unicode/Is/Lt.pl Unicode character database +lib/unicode/Is/Lu.pl Unicode character database +lib/unicode/Is/M.pl Unicode character database +lib/unicode/Is/Mc.pl Unicode character database +lib/unicode/Is/Mirrored.pl Unicode character database +lib/unicode/Is/Mn.pl Unicode character database +lib/unicode/Is/N.pl Unicode character database +lib/unicode/Is/Nd.pl Unicode character database +lib/unicode/Is/No.pl Unicode character database +lib/unicode/Is/P.pl Unicode character database +lib/unicode/Is/Pd.pl Unicode character database +lib/unicode/Is/Pe.pl Unicode character database +lib/unicode/Is/Po.pl Unicode character database +lib/unicode/Is/Print.pl Unicode character database +lib/unicode/Is/Ps.pl Unicode character database +lib/unicode/Is/S.pl Unicode character database +lib/unicode/Is/Sc.pl Unicode character database +lib/unicode/Is/Sm.pl Unicode character database +lib/unicode/Is/So.pl Unicode character database +lib/unicode/Is/Space.pl Unicode character database +lib/unicode/Is/Upper.pl Unicode character database +lib/unicode/Is/Z.pl Unicode character database +lib/unicode/Is/Zl.pl Unicode character database +lib/unicode/Is/Zp.pl Unicode character database +lib/unicode/Is/Zs.pl Unicode character database +lib/unicode/JamoShort.pl Unicode character database +lib/unicode/Makefile Unicode character database +lib/unicode/Name.pl Unicode character database +lib/unicode/Number.pl Unicode character database +lib/unicode/To/Digit.pl Unicode character database +lib/unicode/To/Lower.pl Unicode character database +lib/unicode/To/Title.pl Unicode character database +lib/unicode/To/Upper.pl Unicode character database +lib/unicode/UnicodeData-Latest.txt Unicode character database +lib/unicode/arabshp.txt Unicode character database +lib/unicode/blocks.txt Unicode character database +lib/unicode/index2.txt Unicode character database +lib/unicode/jamo2.txt Unicode character database +lib/unicode/mktables.PL Unicode character database generator +lib/unicode/names2.txt Unicode character database +lib/unicode/props2.txt Unicode character database +lib/unicode/readme.txt Unicode character database info +lib/utf8.pm Pragma to control Unicode support +lib/utf8_heavy.pl Support routines for utf8 pragma lib/validate.pl Perl library supporting wholesale file mode validation lib/vars.pm Declare pseudo-imported global variables makeaperl.SH perl script that produces a new perl binary @@ -769,6 +928,7 @@ sv.c Scalar value code sv.h Scalar value header t/README Instructions for regression tests t/TEST The regression tester +t/UTEST Run regression tests with -Mutf8 t/base/cond.t See if conditionals work t/base/if.t See if if works t/base/lex.t See if lexical items work @@ -971,6 +1131,8 @@ thread.sym Symbols for threads toke.c The tokener universal.c The default UNIVERSAL package methods unixish.h Defines that are assumed on Unix +utf8.c Unicode routines +utf8.h Unicode header util.c Utility routines util.h Dummy header utils/Makefile Extract the utility scripts @@ -1063,6 +1063,7 @@ #define utf8_distance Perl_utf8_distance #define utf8_hop Perl_utf8_hop #define utf8_to_uv Perl_utf8_to_uv +#define utf8skip Perl_utf8skip #define utilize Perl_utilize #define uv_to_utf8 Perl_uv_to_utf8 #define varies Perl_varies diff --git a/global.sym b/global.sym index ac13e65c06..ef16b8a873 100644 --- a/global.sym +++ b/global.sym @@ -1102,6 +1102,7 @@ utf16_to_utf8_reversed utf8_distance utf8_hop utf8_to_uv +utf8skip utilize uv_to_utf8 wait4pid @@ -194,6 +194,28 @@ VIRTUAL U32 intro_my _((void)); VIRTUAL char* instr _((char* big, char* little)); VIRTUAL bool io_close _((IO* io)); VIRTUAL OP* invert _((OP* cmd)); +VIRTUAL bool is_uni_alnum _((U32 c)); +VIRTUAL bool is_uni_idfirst _((U32 c)); +VIRTUAL bool is_uni_alpha _((U32 c)); +VIRTUAL bool is_uni_space _((U32 c)); +VIRTUAL bool is_uni_digit _((U32 c)); +VIRTUAL bool is_uni_upper _((U32 c)); +VIRTUAL bool is_uni_lower _((U32 c)); +VIRTUAL bool is_uni_print _((U32 c)); +VIRTUAL U32 to_uni_upper _((U32 c)); +VIRTUAL U32 to_uni_title _((U32 c)); +VIRTUAL U32 to_uni_lower _((U32 c)); +VIRTUAL bool is_uni_alnum_lc _((U32 c)); +VIRTUAL bool is_uni_idfirst_lc _((U32 c)); +VIRTUAL bool is_uni_alpha_lc _((U32 c)); +VIRTUAL bool is_uni_space_lc _((U32 c)); +VIRTUAL bool is_uni_digit_lc _((U32 c)); +VIRTUAL bool is_uni_upper_lc _((U32 c)); +VIRTUAL bool is_uni_lower_lc _((U32 c)); +VIRTUAL bool is_uni_print_lc _((U32 c)); +VIRTUAL U32 to_uni_upper_lc _((U32 c)); +VIRTUAL U32 to_uni_title_lc _((U32 c)); +VIRTUAL U32 to_uni_lower_lc _((U32 c)); VIRTUAL bool is_utf8_alnum _((unsigned char *p)); VIRTUAL bool is_utf8_idfirst _((unsigned char *p)); VIRTUAL bool is_utf8_alpha _((unsigned char *p)); @@ -420,7 +420,7 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend, I32 back_min = prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; I32 delta = back_max - back_min; - char *last = HOP(strend, -(CHR_SVLEN(must) + back_min)); /* Cannot start after this */ + char *last = HOP(strend, 0-(CHR_SVLEN(must) + back_min)); /* Cannot start after this */ char *last1; /* Last position checked before */ if (s > PL_bostr) @@ -1130,7 +1130,7 @@ regmatch(regnode *prog) break; case SANYUTF8: if (nextchr & 0x80) { - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; nextchr = UCHARAT(locinput); @@ -1147,7 +1147,7 @@ regmatch(regnode *prog) break; case ANYUTF8: if (nextchr & 0x80) { - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; nextchr = UCHARAT(locinput); @@ -1219,7 +1219,7 @@ regmatch(regnode *prog) sayNO; if (locinput >= PL_regeol) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; case ANYOF: @@ -1253,7 +1253,7 @@ regmatch(regnode *prog) if (!(OP(scan) == ALNUMUTF8 ? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput))) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } @@ -1283,7 +1283,7 @@ regmatch(regnode *prog) if (OP(scan) == NALNUMUTF8 ? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput)) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } @@ -1351,7 +1351,7 @@ regmatch(regnode *prog) if (!(OP(scan) == SPACEUTF8 ? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput))) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } @@ -1381,7 +1381,7 @@ regmatch(regnode *prog) if (OP(scan) == NSPACEUTF8 ? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput)) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } @@ -1399,7 +1399,7 @@ regmatch(regnode *prog) if (nextchr & 0x80) { if (!(swash_fetch(PL_utf8_digit,locinput))) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } @@ -1420,7 +1420,7 @@ regmatch(regnode *prog) if (nextchr & 0x80) { if (swash_fetch(PL_utf8_digit,locinput)) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } @@ -1431,7 +1431,7 @@ regmatch(regnode *prog) case CLUMP: if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark, locinput)) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += utf8skip[nextchr]; while (locinput < PL_regeol && swash_fetch(PL_utf8_mark, locinput)) locinput += UTF8SKIP(locinput); if (locinput > PL_regeol) @@ -915,7 +915,7 @@ scan_const(char *start) /* range begins (ignore - as first or last char) */ else if (*s == '-' && s+1 < send && s != start) { if (utf) { - *d++ = 0xff; /* use illegal utf8 byte--see pmtrans */ + *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -8,7 +8,7 @@ */ #ifdef DOINIT -EXTCONST unsigned char PL_utf8skip[] = { +EXTCONST unsigned char utf8skip[] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */ @@ -19,9 +19,9 @@ EXTCONST unsigned char PL_utf8skip[] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,7,8, /* cjk etc. */ }; #else -EXTCONST unsigned char PL_utf8skip[]; +EXTCONST unsigned char utf8skip[]; #endif #define IN_UTF8 (curcop->op_private & HINT_UTF8) -#define UTF8SKIP(s) PL_utf8skip[*(U8*)s] +#define UTF8SKIP(s) utf8skip[*(U8*)s] diff --git a/win32/Makefile b/win32/Makefile index 8570f5ddb1..addf487a44 100644 --- a/win32/Makefile +++ b/win32/Makefile @@ -354,6 +354,7 @@ MICROCORE_SRC = \ ..\taint.c \ ..\toke.c \ ..\universal.c \ + ..\utf8.c \ ..\util.c !IF "$(PERL_MALLOC)" == "define" @@ -427,6 +428,7 @@ CORE_NOCFG_H = \ ..\sv.h \ ..\thread.h \ ..\unixish.h \ + ..\utf8.h \ ..\util.h \ ..\XSUB.h \ ..\EXTERN.h \ diff --git a/win32/makefile.mk b/win32/makefile.mk index ad24e2112e..50cdda9fc3 100644 --- a/win32/makefile.mk +++ b/win32/makefile.mk @@ -198,7 +198,7 @@ OPTIMIZE = -O2 $(RUNTIME) LINK_DBG = .ENDIF -CFLAGS = -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \ +CFLAGS = -K -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \ $(PCHFLAGS) $(OPTIMIZE) LINK_FLAGS = $(LINK_DBG) -L$(CCLIBDIR) $(EXTRALIBDIRS:^"-L") OBJOUT_FLAG = -o @@ -471,6 +471,7 @@ MICROCORE_SRC = \ ..\taint.c \ ..\toke.c \ ..\universal.c \ + ..\utf8.c \ ..\util.c .IF "$(PERL_MALLOC)" == "define" @@ -544,6 +545,7 @@ CORE_NOCFG_H = \ ..\sv.h \ ..\thread.h \ ..\unixish.h \ + ..\utf8.h \ ..\util.h \ ..\XSUB.h \ ..\EXTERN.h \ |