diff options
Diffstat (limited to 'libphobos/src/std/regex/internal/ir.d')
-rw-r--r-- | libphobos/src/std/regex/internal/ir.d | 565 |
1 files changed, 485 insertions, 80 deletions
diff --git a/libphobos/src/std/regex/internal/ir.d b/libphobos/src/std/regex/internal/ir.d index 28b199895d9..ec0cb66631e 100644 --- a/libphobos/src/std/regex/internal/ir.d +++ b/libphobos/src/std/regex/internal/ir.d @@ -30,7 +30,9 @@ CharMatcher[CodepointSet] matcherCache; //accessor with caching @trusted CharMatcher getMatcher(CodepointSet set) -{// @@@BUG@@@ 6357 almost all properties of AA are not @safe +{ + // almost all properties of AA are not @safe + // https://issues.dlang.org/show_bug.cgi?id=6357 if (__ctfe || maxCachedMatchers == 0) return CharMatcher(set); else @@ -47,40 +49,15 @@ CharMatcher[CodepointSet] matcherCache; } } -@trusted auto memoizeExpr(string expr)() -{ - if (__ctfe) - return mixin(expr); - alias T = typeof(mixin(expr)); - static T slot; - static bool initialized; - if (!initialized) - { - slot = mixin(expr); - initialized = true; - } - return slot; -} - -//property for \w character class -@property CodepointSet wordCharacter() +@property ref wordMatcher()() { - return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc - | unicode.Me | unicode.Nd | unicode.Pc")(); -} - -@property CharMatcher wordMatcher() -{ - return memoizeExpr!("CharMatcher(wordCharacter)")(); + static immutable CharMatcher matcher = CharMatcher(wordCharacter); + return matcher; } // some special Unicode white space characters private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; -// Characters that need escaping in string posed as regular expressions -alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', - ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); - //Regular expression engine/parser options: // global - search all nonoverlapping matches in input // casefold - case insensitive matching, do casefolding on match in unicode mode @@ -97,6 +74,20 @@ enum RegexOption: uint { //do not reorder this list alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's'); static assert( RegexOption.max < 0x80); + +package(std) string regexOptionsToString()(uint flags) nothrow pure @safe +{ + flags &= (RegexOption.max << 1) - 1; + if (!flags) + return ""; + char[RegexOptionNames.length] buffer = void; + size_t pos = 0; + foreach (i, flag; __traits(allMembers, RegexOption)) + if (flags & __traits(getMember, RegexOption, flag)) + buffer[pos++] = RegexOptionNames[i]; + return buffer[0 .. pos].idup; +} + // flags that allow guide execution of engine enum RegexInfo : uint { oneShot = 0x80 } @@ -173,7 +164,8 @@ template IRL(IR code) static assert(IRL!(IR.LookaheadStart) == 3); //how many parameters follow the IR, should be optimized fixing some IR bits -int immediateParamsIR(IR i){ +int immediateParamsIR(IR i) @safe pure nothrow @nogc +{ switch (i) { case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: @@ -190,49 +182,50 @@ int immediateParamsIR(IR i){ } //full length of IR instruction inlcuding all parameters that might follow it -int lengthOfIR(IR i) +int lengthOfIR(IR i) @safe pure nothrow @nogc { return 1 + immediateParamsIR(i); } //full length of the paired IR instruction inlcuding all parameters that might follow it -int lengthOfPairedIR(IR i) +int lengthOfPairedIR(IR i) @safe pure nothrow @nogc { return 1 + immediateParamsIR(pairedIR(i)); } //if the operation has a merge point (this relies on the order of the ops) -bool hasMerge(IR i) +bool hasMerge(IR i) @safe pure nothrow @nogc { return (i&0b11)==0b10 && i <= IR.RepeatQEnd; } //is an IR that opens a "group" -bool isStartIR(IR i) +bool isStartIR(IR i) @safe pure nothrow @nogc { return (i&0b11)==0b01; } //is an IR that ends a "group" -bool isEndIR(IR i) +bool isEndIR(IR i) @safe pure nothrow @nogc { return (i&0b11)==0b10; } //is a standalone IR -bool isAtomIR(IR i) +bool isAtomIR(IR i) @safe pure nothrow @nogc { return (i&0b11)==0b00; } //makes respective pair out of IR i, swapping start/end bits of instruction -IR pairedIR(IR i) +IR pairedIR(IR i) @safe pure nothrow @nogc { assert(isStartIR(i) || isEndIR(i)); - return cast(IR)(i ^ 0b11); + return cast(IR) (i ^ 0b11); } //encoded IR instruction +@safe pure struct Bytecode { uint raw; @@ -241,6 +234,7 @@ struct Bytecode enum maxData = 1 << 22; enum maxRaw = 1 << 31; +@safe pure: this(IR code, uint data) { assert(data < (1 << 22) && code < 256); @@ -262,8 +256,8 @@ struct Bytecode return t; } - //bit twiddling helpers - //0-arg template due to @@@BUG@@@ 10985 + // bit twiddling helpers + // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 @property uint data()() const { return raw & 0x003f_ffff; } @property void data()(uint val) @@ -271,12 +265,12 @@ struct Bytecode raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff); } - //ditto - //0-arg template due to @@@BUG@@@ 10985 + // ditto + // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); } - //ditto - //0-arg template due to @@@BUG@@@ 10985 + // ditto + // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 @property IR code()() const { return cast(IR)(raw >> 24); } //ditto @@ -369,11 +363,20 @@ struct NamedGroup //holds pair of start-end markers for a submatch struct Group(DataIndex) { - DataIndex begin, end; + DataIndex begin = DataIndex.max; + DataIndex end = DataIndex.min; + + bool opCast(T : bool)() const + { + return begin <= end; + } + @trusted string toString()() const { + if (begin < end) + return "(unmatched)"; import std.array : appender; - import std.format : formattedWrite; + import std.format.write : formattedWrite; auto a = appender!string(); formattedWrite(a, "%s..%s", begin, end); return a.data; @@ -384,7 +387,7 @@ struct Group(DataIndex) @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) { import std.array : appender; - import std.format : formattedWrite; + import std.format.write : formattedWrite; auto output = appender!string(); formattedWrite(output,"%s", irb[pc].mnemonic); switch (irb[pc].code) @@ -452,9 +455,169 @@ struct Group(DataIndex) writeln("\t", disassemble(slice, pc, dict)); } +// Encapsulates memory management, explicit ref counting +// and the exact type of engine created +// there is a single instance per engine combination type x Char +// In future may also maintain a (TLS?) cache of memory +interface MatcherFactory(Char) +{ +@safe: + Matcher!Char create(const ref Regex!Char, in Char[] input) const; + Matcher!Char dup(Matcher!Char m, in Char[] input) const; + size_t incRef(Matcher!Char m) const; + size_t decRef(Matcher!Char m) const; +} + +// Only memory management, no compile-time vs run-time specialities +abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char +{ + import core.memory : pureFree; + import std.internal.memory : enforceMalloc; + import core.memory : GC; + // round up to next multiple of size_t for alignment purposes + enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1); + + EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const; + + override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted + { + immutable size = EngineType!Char.initialMemory(re) + classSize; + auto memory = enforceMalloc(size)[0 .. size]; + scope(failure) pureFree(memory.ptr); + GC.addRange(memory.ptr, classSize); + auto engine = construct(re, input, memory); + assert(engine.refCount == 1); + assert(cast(void*) engine == memory.ptr); + return engine; + } + + override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted + { + immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize; + auto memory = enforceMalloc(size)[0 .. size]; + scope(failure) pureFree(memory.ptr); + auto copy = construct(engine.pattern, input, memory); + GC.addRange(memory.ptr, classSize); + engine.dupTo(copy, memory[classSize .. size]); + assert(copy.refCount == 1); + return copy; + } + + override size_t incRef(Matcher!Char m) const + { + return ++m.refCount; + } + + override size_t decRef(Matcher!Char m) const @trusted + { + assert(m.refCount != 0); + auto cnt = --m.refCount; + if (cnt == 0) + { + void* ptr = cast(void*) m; + GC.removeRange(ptr); + pureFree(ptr); + } + return cnt; + } +} + +// A factory for run-time engines +class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) +{ + override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const + { + import core.lifetime : emplace; + return emplace!(EngineType!Char)(memory[0 .. classSize], + re, Input!Char(input), memory[classSize .. $]); + } +} + +// A factory for compile-time engine +class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char) +{ + override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const + { + import core.lifetime : emplace; + return emplace!(EngineType!Char)(memory[0 .. classSize], + re, &func, Input!Char(input), memory[classSize .. $]); + } +} + +private auto defaultFactoryImpl(Char)(const ref Regex!Char re) +{ + import std.regex.internal.backtracking : BacktrackingMatcher; + import std.regex.internal.thompson : ThompsonMatcher; + import std.algorithm.searching : canFind; + static MatcherFactory!Char backtrackingFactory; + static MatcherFactory!Char thompsonFactory; + if (re.backrefed.canFind!"a != 0") + { + if (backtrackingFactory is null) + backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char); + return backtrackingFactory; + } + else + { + if (thompsonFactory is null) + thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char); + return thompsonFactory; + } +} + +// Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in +// the std.traits.SetFunctionAttributes documentation. +auto assumePureFunction(T)(T t) +if (isFunctionPointer!T) +{ + enum attrs = functionAttributes!T | FunctionAttribute.pure_; + return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t; +} + +// A workaround for R-T enum re = regex(...) +template defaultFactory(Char) +{ + // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl. + // It can be faked as pure because the static mutable variables are used to cache + // the key and character matcher. The technique used avoids delegates and GC. + @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure + { + static auto impl(const ref Regex!Char re) + { + return defaultFactoryImpl(re); + } + + static auto pureImpl(const ref Regex!Char re) @trusted + { + auto p = assumePureFunction(&impl); + return p(re); + } + + return pureImpl(re); + } +} + +// Defining it as an interface has the undesired side-effect: +// casting any class to an interface silently adjusts pointer to point to a nested vtbl +abstract class Matcher(Char) +{ +abstract: + // Get a (next) match + int match(Group!size_t[] matches) pure; + // This only maintains internal ref-count, + // deallocation happens inside MatcherFactory + @property ref size_t refCount() @safe; + // Copy internal state to another engine, using memory arena 'memory' + void dupTo(Matcher!Char m, void[] memory); + // The pattern loaded + @property ref const(Regex!Char) pattern() @safe; + // Re-arm the engine with new Input + Matcher rearm(in Char[] stream); +} + /++ - $(D Regex) object holds regular expression pattern in compiled form. - Instances of this object are constructed via calls to $(D regex). + `Regex` object holds regular expression pattern in compiled form. + Instances of this object are constructed via calls to `regex`. This is an intended form for caching and storage of frequently used regular expressions. +/ @@ -466,17 +629,19 @@ struct Regex(Char) @safe @property bool empty() const nothrow { return ir is null; } - + /++ + `namedCaptures` returns a range of all named captures in a given regular expression. + +/ @safe @property auto namedCaptures() { static struct NamedGroupRange { private: - NamedGroup[] groups; + const(NamedGroup)[] groups; size_t start; size_t end; public: - this(NamedGroup[] g, size_t s, size_t e) + this(const(NamedGroup)[] g, size_t s, size_t e) { assert(s <= e); assert(e <= g.length); @@ -514,7 +679,7 @@ struct Regex(Char) package(std.regex): import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency - NamedGroup[] dict; // maps name -> user group number + const(NamedGroup)[] dict; // maps name -> user group number uint ngroup; // number of internal groups uint maxCounterDepth; // max depth of nested {n,m} repetitions uint hotspotTableSize; // number of entries in merge table @@ -524,6 +689,36 @@ package(std.regex): public const(BitTable)[] filters; // bloom filters for conditional loops uint[] backrefed; // bit array of backreferenced submatches Kickstart!Char kickstart; + MatcherFactory!Char factory; // produces optimal matcher for this pattern + immutable(Char)[] pattern; // copy of pattern to serve as cache key + + const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted + { + auto r = cast() this; + r.factory = factory; + return r; + } + + const(Regex) withFlags(uint newFlags) pure const @trusted + { + auto r = cast() this; + r.flags = newFlags; + return r; + } + + const(Regex) withCode(const(Bytecode)[] code) pure const @trusted + { + auto r = cast() this; + r.ir = code.dup; // TODO: sidestep const instead? + return r; + } + + const(Regex) withNGroup(uint nGroup) pure const @trusted + { + auto r = cast() this; + r.ngroup = nGroup; + return r; + } //bit access helper uint isBackref(uint n) @@ -564,26 +759,20 @@ package(std.regex): writeln("Max counter nesting depth: ", maxCounterDepth); } -} - -//@@@BUG@@@ (unreduced) - public makes it inaccessible in std.regex.package (!) -/*public*/ struct StaticRegex(Char) -{ -package(std.regex): - import std.regex.internal.backtracking : BacktrackingMatcher; - alias Matcher = BacktrackingMatcher!(true); - alias MatchFn = bool function(ref Matcher!Char) @trusted; - MatchFn nativeFn; -public: - Regex!Char _regex; - alias _regex this; - this(Regex!Char re, MatchFn fn) + public string toString()() const { - _regex = re; - nativeFn = fn; - + import std.format : format; + static if (is(typeof(pattern) : string)) + alias patternString = pattern; + else + { + import std.conv : to; + auto patternString = conv.to!string(pattern); + } + auto quotedEscapedPattern = format("%(%s %)", [patternString]); + auto flagString = regexOptionsToString(flags); + return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")"; } - } // The stuff below this point is temporarrily part of IR module @@ -622,7 +811,7 @@ if (is(Char :dchar)) @property bool atEnd(){ return _index == _origin.length; } - bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos) + bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos) { size_t idx = kick.search(_origin, _index); _index = idx; @@ -653,6 +842,11 @@ struct BackLooperImpl(Input) _origin = input._origin; _index = index; } + this(String input) + { + _origin = input; + _index = input.length; + } @trusted bool nextChar(ref dchar res,ref size_t pos) { pos = _index; @@ -690,10 +884,10 @@ template BackLooper(E) //both helpers below are internal, on its own are quite "explosive" //unsafe, no initialization of elements -@system T[] mallocArray(T)(size_t len) +@system pure T[] mallocArray(T)(size_t len) { - import core.stdc.stdlib : malloc; - return (cast(T*) malloc(len * T.sizeof))[0 .. len]; + import core.memory : pureMalloc; + return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len]; } //very unsafe, no initialization @@ -705,7 +899,7 @@ template BackLooper(E) } // -@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name) +@trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) {//equal is @system? import std.algorithm.comparison : equal; import std.algorithm.iteration : map; @@ -718,16 +912,15 @@ template BackLooper(E) return dict[fnd].group; } -//whether ch is one of unicode newline sequences -//0-arg template due to @@@BUG@@@ 10985 +// whether ch is one of unicode newline sequences +// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 bool endOfLine()(dchar front, bool seenCr) { return ((front == '\n') ^ seenCr) || front == '\r' || front == NEL || front == LS || front == PS; } -// -//0-arg template due to @@@BUG@@@ 10985 +// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 bool startOfLine()(dchar back, bool seenNl) { return ((back == '\r') ^ seenNl) || back == '\n' @@ -786,3 +979,215 @@ struct CharMatcher { return trie[ch]; } } + +// Internal non-resizeble array, switches between inline storage and CoW +// POD-only +struct SmallFixedArray(T, uint SMALL=3) +if (!hasElaborateDestructor!T) +{ + import std.internal.memory : enforceMalloc; + import core.memory : pureFree; + static struct Payload + { + size_t refcount; + T[0] placeholder; + inout(T)* ptr() inout { return placeholder.ptr; } + } + static assert(Payload.sizeof == size_t.sizeof); + union + { + Payload* big; + T[SMALL] small; + } + size_t _sizeMask; + enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1); + enum SIZE_MASK = ~BIG_MASK; + + @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; } + @property size_t length() const { return _sizeMask & SIZE_MASK; } + + this(size_t size) + { + if (size <= SMALL) + { + small[] = T.init; + _sizeMask = size; + } + else + { + big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size); + big.refcount = 1; + _sizeMask = size | BIG_MASK; + } + } + + private @trusted @property inout(T)[] internalSlice() inout + { + return isBig ? big.ptr[0 .. length] : small[0 .. length]; + } + + this(this) + { + if (isBig) + { + big.refcount++; + } + } + + bool opEquals(SmallFixedArray a) + { + return internalSlice[] == a.internalSlice[]; + } + + size_t toHash() const + { + return hashOf(internalSlice[]); + } + + ref inout(T) opIndex(size_t idx) inout + { + return internalSlice[idx]; + } + + // accesses big to test self-referencing so not @safe + @trusted ref opAssign(SmallFixedArray arr) + { + if (isBig) + { + if (arr.isBig) + { + if (big is arr.big) return this; // self-assign + else + { + abandonRef(); + _sizeMask = arr._sizeMask; + big = arr.big; + big.refcount++; + } + } + else + { + abandonRef(); + _sizeMask = arr._sizeMask; + small = arr.small; + } + } + else + { + if (arr.isBig) + { + _sizeMask = arr._sizeMask; + big = arr.big; + big.refcount++; + } + else + { + _sizeMask = arr._sizeMask; + small = arr.small; + } + } + return this; + } + + void mutate(scope void delegate(T[]) pure filler) + { + if (isBig && big.refcount != 1) // copy on write + { + auto oldSizeMask = _sizeMask; + auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length); + newbig.refcount = 1; + abandonRef(); + big = newbig; + _sizeMask = oldSizeMask; + } + filler(internalSlice); + } + + ~this() + { + if (isBig) + { + abandonRef(); + } + } + + @trusted private void abandonRef() + { + assert(isBig); + if (--big.refcount == 0) + { + pureFree(big); + _sizeMask = 0; + assert(!isBig); + } + } +} + +@system unittest +{ + alias SA = SmallFixedArray!(int, 2); + SA create(int[] data) + { + SA a = SA(data.length); + a.mutate((slice) { slice[] = data[]; }); + assert(a.internalSlice == data); + return a; + } + + { + SA a; + a = SA(1); + assert(a.length == 1); + a = SA.init; + assert(a.length == 0); + } + + { + SA a, b, c, d; + assert(a.length == 0); + assert(a.internalSlice == b.internalSlice); + a = create([1]); + assert(a.internalSlice == [1]); + b = create([2, 3]); + assert(b.internalSlice == [2, 3]); + c = create([3, 4, 5]); + d = create([5, 6, 7, 8]); + assert(c.isBig); + a = c; + assert(a.isBig); + assert(a.big is c.big); + assert(a.big.refcount == 2); + assert(a.internalSlice == [3, 4, 5]); + assert(c.internalSlice == [3, 4, 5]); + a = b; + assert(!a.isBig); + assert(a.internalSlice == [2, 3]); + assert(c.big.refcount == 1); + a = c; + assert(c.big.refcount == 2); + + // mutate copies on write if ref-count is not 1 + a.mutate((slice){ slice[] = 1; }); + assert(a.internalSlice == [1, 1, 1]); + assert(c.internalSlice == [3, 4, 5]); + assert(a.isBig && c.isBig); + assert(a.big.refcount == 1); + assert(c.big.refcount == 1); + + auto e = d; + assert(e.big.refcount == 2); + auto f = d; + f = a; + assert(f.isBig); + assert(f.internalSlice == [1, 1, 1]); + assert(f.big.refcount == 2); // a & f + assert(e.big.refcount == 2); // d & e + a = c; + assert(f.big.refcount == 1); // f + assert(e.big.refcount == 2); // d & e + a = a; + a = a; + a = a; + assert(a.big.refcount == 2); // a & c + } +} |