diff options
author | Georg Brandl <georg@python.org> | 2012-04-04 08:42:40 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2012-04-04 08:42:40 +0200 |
commit | 7ebffbb996f28a765fe28637c427f6330e94dc70 (patch) | |
tree | e7577bedb63196ed68381522bad0c8357f98bb01 | |
parent | e59174b0e587a784db432dfbfe2dade4ae08e54d (diff) | |
parent | 50bac235a507de7c347a065b48a826713883e67d (diff) | |
download | pygments-7ebffbb996f28a765fe28637c427f6330e94dc70.tar.gz |
Merged in nolta/pygments-main (pull request #61)
-rw-r--r-- | pygments/lexers/_mapping.py | 3 | ||||
-rw-r--r-- | pygments/lexers/math.py | 161 | ||||
-rw-r--r-- | tests/examplefiles/string.jl | 1031 |
3 files changed, 1192 insertions, 3 deletions
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py index 7c745a89..8bcc1744 100644 --- a/pygments/lexers/_mapping.py +++ b/pygments/lexers/_mapping.py @@ -132,6 +132,8 @@ LEXERS = { 'JavascriptSmartyLexer': ('pygments.lexers.templates', 'JavaScript+Smarty', ('js+smarty', 'javascript+smarty'), (), ('application/x-javascript+smarty', 'text/x-javascript+smarty', 'text/javascript+smarty')), 'JspLexer': ('pygments.lexers.templates', 'Java Server Page', ('jsp',), ('*.jsp',), ('application/x-jsp',)), 'KotlinLexer': ('pygments.lexers.jvm', 'Kotlin', ('kotlin',), ('*.kt',), ('text/x-kotlin',)), + 'JuliaLexer': ('pygments.lexers.math', 'Julia', ('julia','jl'), ('*.jl',), ('text/x-julia','application/x-julia')), + 'JuliaConsoleLexer': ('pygments.lexers.math', 'Julia console', ('jlcon',), (), ()), 'LighttpdConfLexer': ('pygments.lexers.text', 'Lighttpd configuration file', ('lighty', 'lighttpd'), (), ('text/x-lighttpd-conf',)), 'LiterateHaskellLexer': ('pygments.lexers.functional', 'Literate Haskell', ('lhs', 'literate-haskell'), ('*.lhs',), ('text/x-literate-haskell',)), 'LlvmLexer': ('pygments.lexers.asm', 'LLVM', ('llvm',), ('*.ll',), ('text/x-llvm',)), @@ -290,3 +292,4 @@ if __name__ == '__main__': f.write('LEXERS = {\n %s,\n}\n\n' % ',\n '.join(found_lexers)) f.write(footer) f.close() + diff --git a/pygments/lexers/math.py b/pygments/lexers/math.py index f500231f..7ae7d6b2 100644 --- a/pygments/lexers/math.py +++ b/pygments/lexers/math.py @@ -11,15 +11,170 @@ import re -from pygments.lexer import Lexer, RegexLexer, bygroups, include, do_insertions +from pygments.lexer import Lexer, RegexLexer, bygroups, include, \ + combined, do_insertions from pygments.token import Comment, String, Punctuation, Keyword, Name, \ Operator, Number, Text, Generic from pygments.lexers.agile import PythonLexer from pygments.lexers import _scilab_builtins -__all__ = ['MuPADLexer', 'MatlabLexer', 'MatlabSessionLexer', 'OctaveLexer', - 'ScilabLexer', 'NumPyLexer', 'RConsoleLexer', 'SLexer'] +__all__ = ['JuliaLexer', 'JuliaConsoleLexer', 'MuPADLexer', 'MatlabLexer', + 'MatlabSessionLexer', 'OctaveLexer', 'ScilabLexer', 'NumPyLexer', + 'RConsoleLexer', 'SLexer'] + + +class JuliaLexer(RegexLexer): + name = 'Julia' + aliases = ['julia','jl'] + filenames = ['*.jl'] + mimetypes = ['text/x-julia','application/x-julia'] + + builtins = [ + 'exit','whos','edit','load','is','isa','isequal','typeof','tuple', + 'ntuple','uid','hash','finalizer','convert','promote','subtype', + 'typemin','typemax','realmin','realmax','sizeof','eps','promote_type', + 'method_exists','applicable','invoke','dlopen','dlsym','system', + 'error','throw','assert','new','Inf','Nan','pi','im', + ] + + tokens = { + 'root': [ + (r'\n', Text), + (r'[^\S\n]+', Text), + (r'#.*$', Comment), + (r'[]{}:(),;[@]', Punctuation), + (r'\\\n', Text), + (r'\\', Text), + + # keywords + (r'(begin|while|for|in|return|break|continue|' + r'macro|quote|let|if|elseif|else|try|catch|end|' + r'bitstype|ccall)\b', Keyword), + (r'(local|global|const)\b', Keyword.Declaration), + (r'(module|import|export)\b', Keyword.Reserved), + (r'(Bool|Int|Int8|Int16|Int32|Int64|Uint|Uint8|Uint16|Uint32|Uint64' + r'|Float32|Float64|Complex64|Complex128|Any|Nothing|None)\b', + Keyword.Type), + + # functions + (r'(function)((?:\s|\\\s)+)', + bygroups(Keyword,Name.Function), 'funcname'), + + # types + (r'(type|typealias|abstract)((?:\s|\\\s)+)', + bygroups(Keyword,Name.Class), 'typename'), + + # operators + (r'==|!=|<=|>=|->|&&|\|\||::|<:|[-~+/*%=<>&^|.?!$]', Operator), + (r'\.\*|\.\^|\.\\|\.\/|\\', Operator), + + # builtins + ('(' + '|'.join(builtins) + r')\b', Name.Builtin), + + # backticks + (r'`(?s).*?`', String.Backtick), + + # chars + (r"'(\\.|\\[0-7]{1,3}|\\x[a-fA-F0-9]{1,3}|\\u[a-fA-F0-9]{1,4}|\\U[a-fA-F0-9]{1,6}|[^\\\'\n])'", String.Char), + + # try to match trailing transpose + (r'(?<=[.\w\)\]])\'', Operator), + + # strings + (r'(?:[IL])"', String, 'string'), + (r'[E]?"', String, combined('stringescape', 'string')), + + # names + (r'@[a-zA-Z0-9_.]+', Name.Decorator), + (r'[a-zA-Z_][a-zA-Z0-9_]*', Name), + + # numbers + (r'(\d+\.\d*|\d*\.\d+)([eE][+-]?[0-9]+)?', Number.Float), + (r'\d+[eE][+-]?[0-9]+', Number.Float), + (r'0[0-7]+', Number.Oct), + (r'0[xX][a-fA-F0-9]+', Number.Hex), + (r'\d+', Number.Integer) + ], + + 'funcname': [ + ('[a-zA-Z_][a-zA-Z0-9_]*', Name.Function, '#pop'), + ('\([^\s\w{]{1,2}\)', Operator, '#pop'), + ('[^\s\w{]{1,2}', Operator, '#pop'), + ], + + 'typename': [ + ('[a-zA-Z_][a-zA-Z0-9_]*', Name.Class, '#pop') + ], + + 'stringescape': [ + (r'\\([\\abfnrtv"\']|\n|N{.*?}|u[a-fA-F0-9]{4}|' + r'U[a-fA-F0-9]{8}|x[a-fA-F0-9]{2}|[0-7]{1,3})', String.Escape) + ], + + 'string': [ + (r'"', String, '#pop'), + (r'\\\\|\\"|\\\n', String.Escape), # included here for raw strings + (r'\$(\([a-zA-Z0-9_]+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?', + String.Interpol), + (r'[^\\"$]+', String), + # quotes, dollar signs, and backslashes must be parsed one at a time + (r'["\\]', String), + # unhandled string formatting sign + (r'\$', String) + ], + } + + def analyse_text(text): + return shebang_matches(text, r'julia') + + +line_re = re.compile('.*?\n') + +class JuliaConsoleLexer(Lexer): + """ + For Julia console sessions. Modeled after MatlabSessionLexer. + """ + name = 'Julia console' + aliases = ['jlcon'] + + def get_tokens_unprocessed(self, text): + jllexer = JuliaLexer(**self.options) + + curcode = '' + insertions = [] + + for match in line_re.finditer(text): + line = match.group() + + if line.startswith('julia>'): + insertions.append((len(curcode), + [(0, Generic.Prompt, line[:3])])) + curcode += line[3:] + + elif line.startswith(' '): + + idx = len(curcode) + + # without is showing error on same line as before...? + line = "\n" + line + token = (0, Generic.Traceback, line) + insertions.append((idx, [token])) + + else: + if curcode: + for item in do_insertions( + insertions, jllexer.get_tokens_unprocessed(curcode)): + yield item + curcode = '' + insertions = [] + + yield match.start(), Generic.Output, line + + if curcode: # or item: + for item in do_insertions( + insertions, jllexer.get_tokens_unprocessed(curcode)): + yield item class MuPADLexer(RegexLexer): diff --git a/tests/examplefiles/string.jl b/tests/examplefiles/string.jl new file mode 100644 index 00000000..67bf6c70 --- /dev/null +++ b/tests/examplefiles/string.jl @@ -0,0 +1,1031 @@ +## core string functions ## + +length(s::String) = error("you must implement length(",typeof(s),")") +next(s::String, i::Int) = error("you must implement next(",typeof(s),",Int)") +next(s::DirectIndexString, i::Int) = (s[i],i+1) +next(s::String, i::Integer) = next(s,int(i)) + +## generic supplied functions ## + +start(s::String) = 1 +done(s::String,i) = (i > length(s)) +isempty(s::String) = done(s,start(s)) +ref(s::String, i::Int) = next(s,i)[1] +ref(s::String, i::Integer) = s[int(i)] +ref(s::String, x::Real) = s[iround(x)] +ref{T<:Integer}(s::String, r::Range1{T}) = s[int(first(r)):int(last(r))] + +symbol(s::String) = symbol(cstring(s)) +string(s::String) = s + +print(s::String) = for c=s; print(c); end +print(x...) = for i=x; print(i); end +println(args...) = print(args..., '\n') + +show(s::String) = print_quoted(s) + +(*)(s::String...) = strcat(s...) +(^)(s::String, r::Integer) = repeat(s,r) + +size(s::String) = (length(s),) +size(s::String, d::Integer) = d==1 ? length(s) : + error("in size: dimension ",d," out of range") + +strlen(s::DirectIndexString) = length(s) +function strlen(s::String) + i = start(s) + if done(s,i) + return 0 + end + n = 1 + while true + c, j = next(s,i) + if done(s,j) + return n + end + n += 1 + i = j + end +end + +isvalid(s::DirectIndexString, i::Integer) = (start(s) <= i <= length(s)) +function isvalid(s::String, i::Integer) + try + next(s,i) + true + catch + false + end +end + +prevind(s::DirectIndexString, i::Integer) = i-1 +thisind(s::DirectIndexString, i::Integer) = i +nextind(s::DirectIndexString, i::Integer) = i+1 + +prevind(s::String, i::Integer) = thisind(s,thisind(s,i)-1) + +function thisind(s::String, i::Integer) + for j = i:-1:1 + if isvalid(s,j) + return j + end + end + return 0 # out of range +end + +function nextind(s::String, i::Integer) + for j = i+1:length(s) + if isvalid(s,j) + return j + end + end + length(s)+1 # out of range +end + +ind2chr(s::DirectIndexString, i::Integer) = i +chr2ind(s::DirectIndexString, i::Integer) = i + +function ind2chr(s::String, i::Integer) + s[i] # throws error if invalid + j = 1 + k = start(s) + while true + c, l = next(s,k) + if i <= k + return j + end + j += 1 + k = l + end +end + +function chr2ind(s::String, i::Integer) + if i < 1 + return i + end + j = 1 + k = start(s) + while true + c, l = next(s,k) + if i == j + return k + end + j += 1 + k = l + end +end + +function strchr(s::String, c::Char, i::Integer) + i = nextind(s,i) + while !done(s,i) + d, j = next(s,i) + if c == d + return i + end + i = j + end + return 0 +end +strchr(s::String, c::Char) = strchr(s, c, start(s)) +contains(s::String, c::Char) = (strchr(s,c)!=0) + +function chars(s::String) + cx = Array(Char,strlen(s)) + i = 0 + for c in s + cx[i += 1] = c + end + return cx +end + +function cmp(a::String, b::String) + i = start(a) + j = start(b) + while !done(a,i) && !done(b,i) + c, i = next(a,i) + d, j = next(b,j) + if c != d + return c < d ? -1 : +1 + end + end + done(a,i) && !done(b,j) ? -1 : + !done(a,i) && done(b,j) ? +1 : 0 +end + +isequal(a::String, b::String) = cmp(a,b) == 0 +isless(a::String, b::String) = cmp(a,b) < 0 + +# faster comparisons for byte strings + +cmp(a::ByteString, b::ByteString) = lexcmp(a.data, b.data) +isequal(a::ByteString, b::ByteString) = length(a)==length(b) && cmp(a,b)==0 + +## character column width function ## + +charwidth(c::Char) = max(0,int(ccall(:wcwidth, Int32, (Char,), c))) +strwidth(s::String) = (w=0; for c in s; w += charwidth(c); end; w) +strwidth(s::ByteString) = ccall(:u8_strwidth, Int, (Ptr{Uint8},), s.data) +# TODO: implement and use u8_strnwidth that takes a length argument + +## generic string uses only length and next ## + +type GenericString <: String + string::String +end + +length(s::GenericString) = length(s.string) +next(s::GenericString, i::Int) = next(s.string, i) + +## plain old character arrays ## + +type CharString <: String + chars::Array{Char,1} + + CharString(a::Array{Char,1}) = new(a) + CharString(c::Char...) = new([ c[i] | i=1:length(c) ]) +end +CharString(x...) = CharString(map(char,x)...) + +next(s::CharString, i::Int) = (s.chars[i], i+1) +length(s::CharString) = length(s.chars) +strlen(s::CharString) = length(s) + +string(c::Char) = CharString(c) +string(c::Char, x::Char...) = CharString(c, x...) + +## substrings reference original strings ## + +type SubString <: String + string::String + offset::Int + length::Int + + SubString(s::String, i::Int, j::Int) = new(s, i-1, j-i+1) + SubString(s::SubString, i::Int, j::Int) = + new(s.string, i-1+s.offset, j-i+1) +end +SubString(s::String, i::Integer, j::Integer) = SubString(s, int(i), int(j)) + +function next(s::SubString, i::Int) + if i < 1 || i > s.length + error("string index out of bounds") + end + c, i = next(s.string, i+s.offset) + c, i-s.offset +end + +length(s::SubString) = s.length +# TODO: strlen(s::SubString) = ?? +# default implementation will work but it's slow +# can this be delegated efficiently somehow? +# that may require additional string interfaces + +function ref(s::String, r::Range1{Int}) + if first(r) < 1 || length(s) < last(r) + error("in substring slice: index out of range") + end + SubString(s, first(r), last(r)) +end + +## efficient representation of repeated strings ## + +type RepString <: String + string::String + repeat::Integer +end + +length(s::RepString) = length(s.string)*s.repeat +strlen(s::RepString) = strlen(s.string)*s.repeat + +function next(s::RepString, i::Int) + if i < 1 || i > length(s) + error("string index out of bounds") + end + j = mod1(i,length(s.string)) + c, k = next(s.string, j) + c, k-j+i +end + +function repeat(s::String, r::Integer) + r < 0 ? error("can't repeat a string ",r," times") : + r == 0 ? "" : + r == 1 ? s : + RepString(s,r) +end + +## reversed strings without data movement ## + +type RevString <: String + string::String +end + +length(s::RevString) = length(s.string) +strlen(s::RevString) = strlen(s.string) + +start(s::RevString) = (n=length(s); n-thisind(s.string,n)+1) +function next(s::RevString, i::Int) + n = length(s); j = n-i+1 + (s.string[j], n-thisind(s.string,j-1)+1) +end + +reverse(s::String) = RevString(s) +reverse(s::RevString) = s.string + +## ropes for efficient concatenation, etc. ## + +# Idea: instead of this standard binary tree structure, +# how about we keep an array of substrings, with an +# offset array. We can do binary search on the offset +# array so we get O(log(n)) indexing time still, but we +# can compute the offsets lazily and avoid all the +# futzing around while the string is being constructed. + +type RopeString <: String + head::String + tail::String + depth::Int32 + length::Int + + RopeString(h::RopeString, t::RopeString) = + depth(h.tail) + depth(t) < depth(h.head) ? + RopeString(h.head, RopeString(h.tail, t)) : + new(h, t, max(h.depth,t.depth)+1, length(h)+length(t)) + + RopeString(h::RopeString, t::String) = + depth(h.tail) < depth(h.head) ? + RopeString(h.head, RopeString(h.tail, t)) : + new(h, t, h.depth+1, length(h)+length(t)) + + RopeString(h::String, t::RopeString) = + depth(t.head) < depth(t.tail) ? + RopeString(RopeString(h, t.head), t.tail) : + new(h, t, t.depth+1, length(h)+length(t)) + + RopeString(h::String, t::String) = + new(h, t, 1, length(h)+length(t)) +end + +depth(s::String) = 0 +depth(s::RopeString) = s.depth + +function next(s::RopeString, i::Int) + if i <= length(s.head) + return next(s.head, i) + else + c, j = next(s.tail, i-length(s.head)) + return c, j+length(s.head) + end +end + +length(s::RopeString) = s.length +strlen(s::RopeString) = strlen(s.head) + strlen(s.tail) + +strcat() = "" +strcat(s::String) = s +strcat(x...) = strcat(map(string,x)...) +strcat(s::String, t::String...) = + (t = strcat(t...); isempty(s) ? t : isempty(t) ? s : RopeString(s, t)) + +print(s::RopeString) = print(s.head, s.tail) + +## transformed strings ## + +type TransformedString <: String + transform::Function + string::String +end + +length(s::TransformedString) = length(s.string) +strlen(s::TransformedString) = strlen(s.string) + +function next(s::TransformedString, i::Int) + c, j = next(s.string,i) + c = s.transform(c, i) + return c, j +end + +## uppercase and lowercase transformations ## + +uppercase(c::Char) = ccall(:towupper, Char, (Char,), c) +lowercase(c::Char) = ccall(:towlower, Char, (Char,), c) + +uppercase(s::String) = TransformedString((c,i)->uppercase(c), s) +lowercase(s::String) = TransformedString((c,i)->lowercase(c), s) + +ucfirst(s::String) = TransformedString((c,i)->i==1 ? uppercase(c) : c, s) +lcfirst(s::String) = TransformedString((c,i)->i==1 ? lowercase(c) : c, s) + +const uc = uppercase +const lc = lowercase + +## string map ## + +function map(f::Function, s::String) + out = memio(length(s)) + for c in s + write(out, f(c)::Char) + end + takebuf_string(out) +end + +## conversion of general objects to strings ## + +string(x) = print_to_string(show, x) +cstring(x...) = print_to_string(print, x...) + +function cstring(p::Ptr{Uint8}) + p == C_NULL ? error("cannot convert NULL to string") : + ccall(:jl_cstr_to_string, Any, (Ptr{Uint8},), p)::ByteString +end + +## string promotion rules ## + +promote_rule(::Type{UTF8String} , ::Type{ASCIIString}) = UTF8String +promote_rule(::Type{UTF8String} , ::Type{CharString} ) = UTF8String +promote_rule(::Type{ASCIIString}, ::Type{CharString} ) = UTF8String + +## printing literal quoted string data ## + +# TODO: this is really the inverse of print_unbackslashed + +function print_quoted_literal(s::String) + print('"') + for c = s; c == '"' ? print("\\\"") : print(c); end + print('"') +end + +## string escaping & unescaping ## + +escape_nul(s::String, i::Int) = + !done(s,i) && '0' <= next(s,i)[1] <= '7' ? L"\x00" : L"\0" + +is_hex_digit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' +need_full_hex(s::String, i::Int) = !done(s,i) && is_hex_digit(next(s,i)[1]) + +function print_escaped(s::String, esc::String) + i = start(s) + while !done(s,i) + c, j = next(s,i) + c == '\0' ? print(escape_nul(s,j)) : + c == '\e' ? print(L"\e") : + c == '\\' ? print("\\\\") : + contains(esc,c) ? print('\\', c) : + iswprint(c) ? print(c) : + 7 <= c <= 13 ? print('\\', "abtnvfr"[c-6]) : + c <= '\x7f' ? print(L"\x", hex(c, 2)) : + c <= '\uffff' ? print(L"\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : + print(L"\U", hex(c, need_full_hex(s,j) ? 8 : 4)) + i = j + end +end + +escape_string(s::String) = print_to_string(length(s), print_escaped, s, "\"") +print_quoted(s::String) = (print('"'); print_escaped(s, "\"\$"); print('"')) +#" # work around syntax highlighting problem +quote_string(s::String) = print_to_string(length(s)+2, print_quoted, s) + +# bare minimum unescaping function unescapes only given characters + +function print_unescaped_chars(s::String, esc::String) + if !contains(esc,'\\') + esc = strcat("\\", esc) + end + i = start(s) + while !done(s,i) + c, i = next(s,i) + if c == '\\' && !done(s,i) && contains(esc,s[i]) + c, i = next(s,i) + end + print(c) + end +end + +unescape_chars(s::String, esc::String) = + print_to_string(length(s), print_unescaped_chars, s, esc) + +# general unescaping of traditional C and Unicode escape sequences + +function print_unescaped(s::String) + i = start(s) + while !done(s,i) + c, i = next(s,i) + if !done(s,i) && c == '\\' + c, i = next(s,i) + if c == 'x' || c == 'u' || c == 'U' + n = k = 0 + m = c == 'x' ? 2 : + c == 'u' ? 4 : 8 + while (k+=1) <= m && !done(s,i) + c, j = next(s,i) + n = '0' <= c <= '9' ? n<<4 + c-'0' : + 'a' <= c <= 'f' ? n<<4 + c-'a'+10 : + 'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break + i = j + end + if k == 1 + error("\\x used with no following hex digits") + end + if m == 2 # \x escape sequence + write(uint8(n)) + else + print(char(n)) + end + elseif '0' <= c <= '7' + k = 1 + n = c-'0' + while (k+=1) <= 3 && !done(s,i) + c, j = next(s,i) + n = '0' <= c <= '7' ? n<<3 + c-'0' : break + i = j + end + if n > 255 + error("octal escape sequence out of range") + end + write(uint8(n)) + else + print(c == 'a' ? '\a' : + c == 'b' ? '\b' : + c == 't' ? '\t' : + c == 'n' ? '\n' : + c == 'v' ? '\v' : + c == 'f' ? '\f' : + c == 'r' ? '\r' : + c == 'e' ? '\e' : c) + end + else + print(c) + end + end +end + +unescape_string(s::String) = print_to_string(length(s), print_unescaped, s) + +## checking UTF-8 & ACSII validity ## + +byte_string_classify(s::ByteString) = + ccall(:u8_isvalid, Int32, (Ptr{Uint8}, Int), s.data, length(s)) + # 0: neither valid ASCII nor UTF-8 + # 1: valid ASCII + # 2: valid UTF-8 + +is_valid_ascii(s::ByteString) = byte_string_classify(s) == 1 +is_valid_utf8 (s::ByteString) = byte_string_classify(s) != 0 + +check_ascii(s::ByteString) = is_valid_ascii(s) ? s : error("invalid ASCII sequence") +check_utf8 (s::ByteString) = is_valid_utf8(s) ? s : error("invalid UTF-8 sequence") + +## string interpolation parsing ## + +function _jl_interp_parse(s::String, unescape::Function, printer::Function) + sx = {} + i = j = start(s) + while !done(s,j) + c, k = next(s,j) + if c == '$' + if !isempty(s[i:j-1]) + push(sx, unescape(s[i:j-1])) + end + ex, j = parseatom(s,k) + push(sx, ex) + i = j + elseif c == '\\' && !done(s,k) + if s[k] == '$' + if !isempty(s[i:j-1]) + push(sx, unescape(s[i:j-1])) + end + i = k + end + c, j = next(s,k) + else + j = k + end + end + if !isempty(s[i:]) + push(sx, unescape(s[i:j-1])) + end + length(sx) == 1 && isa(sx[1],ByteString) ? sx[1] : + expr(:call, :print_to_string, printer, sx...) +end + +_jl_interp_parse(s::String, u::Function) = _jl_interp_parse(s, u, print) +_jl_interp_parse(s::String) = _jl_interp_parse(s, x->check_utf8(unescape_string(x))) + +function _jl_interp_parse_bytes(s::String) + writer(x...) = for w=x; write(w); end + _jl_interp_parse(s, unescape_string, writer) +end + +## core string macros ## + +macro str(s); _jl_interp_parse(s); end +macro S_str(s); _jl_interp_parse(s); end +macro I_str(s); _jl_interp_parse(s, x->unescape_chars(x,"\"")); end +macro E_str(s); check_utf8(unescape_string(s)); end +macro B_str(s); _jl_interp_parse_bytes(s); end +macro b_str(s); ex = _jl_interp_parse_bytes(s); :(($ex).data); end + +## shell-like command parsing ## + +function _jl_shell_parse(s::String, interp::Bool) + + in_single_quotes = false + in_double_quotes = false + + args = {} + arg = {} + i = start(s) + j = i + + function update_arg(x) + if !isa(x,String) || !isempty(x) + push(arg, x) + end + end + function append_arg() + if isempty(arg); arg = {"",}; end + push(args, arg) + arg = {} + end + + while !done(s,j) + c, k = next(s,j) + if !in_single_quotes && !in_double_quotes && iswspace(c) + update_arg(s[i:j-1]) + append_arg() + j = k + while !done(s,j) + c, k = next(s,j) + if !iswspace(c) + i = j + break + end + j = k + end + elseif interp && !in_single_quotes && c == '$' + update_arg(s[i:j-1]); i = k; j = k + if done(s,k) + error("\$ right before end of command") + end + if iswspace(s[k]) + error("space not allowed right after \$") + end + ex, j = parseatom(s,j) + update_arg(ex); i = j + else + if !in_double_quotes && c == '\'' + in_single_quotes = !in_single_quotes + update_arg(s[i:j-1]); i = k + elseif !in_single_quotes && c == '"' + in_double_quotes = !in_double_quotes + update_arg(s[i:j-1]); i = k + elseif c == '\\' + if in_double_quotes + if done(s,k) + error("unterminated double quote") + end + if s[k] == '"' || s[k] == '$' + update_arg(s[i:j-1]); i = k + c, k = next(s,k) + end + elseif !in_single_quotes + if done(s,k) + error("dangling backslash") + end + update_arg(s[i:j-1]); i = k + c, k = next(s,k) + end + end + j = k + end + end + + if in_single_quotes; error("unterminated single quote"); end + if in_double_quotes; error("unterminated double quote"); end + + update_arg(s[i:]) + append_arg() + + if !interp + return args + end + + # construct an expression + exprs = {} + for arg in args + push(exprs, expr(:tuple, arg)) + end + expr(:tuple,exprs) +end +_jl_shell_parse(s::String) = _jl_shell_parse(s,true) + +function shell_split(s::String) + parsed = _jl_shell_parse(s,false) + args = String[] + for arg in parsed + push(args, strcat(arg...)) + end + args +end + +function print_shell_word(word::String) + if isempty(word) + print("''") + end + has_single = false + has_special = false + for c in word + if iswspace(c) || c=='\\' || c=='\'' || c=='"' || c=='$' + has_special = true + if c == '\'' + has_single = true + end + end + end + if !has_special + print(word) + elseif !has_single + print('\'', word, '\'') + else + print('"') + for c in word + if c == '"' || c == '$' + print('\\') + end + print(c) + end + print('"') + end +end + +function print_shell_escaped(cmd::String, args::String...) + print_shell_word(cmd) + for arg in args + print(' ') + print_shell_word(arg) + end +end + +shell_escape(cmd::String, args::String...) = + print_to_string(print_shell_escaped, cmd, args...) + +## interface to parser ## + +function parse(s::String, pos, greedy) + # returns (expr, end_pos). expr is () in case of parse error. + ex, pos = ccall(:jl_parse_string, Any, + (Ptr{Uint8}, Int32, Int32), + cstring(s), pos-1, greedy ? 1:0) + if isa(ex,Expr) && is(ex.head,:error) + throw(ParseError(ex.args[1])) + end + if ex == (); throw(ParseError("end of input")); end + ex, pos+1 # C is zero-based, Julia is 1-based +end + +parse(s::String) = parse(s, 1, true) +parse(s::String, pos) = parse(s, pos, true) +parseatom(s::String) = parse(s, 1, false) +parseatom(s::String, pos) = parse(s, pos, false) + +## miscellaneous string functions ## + +function lpad(s::String, n::Integer, p::String) + m = n - strlen(s) + if m <= 0; return s; end + l = strlen(p) + if l==1 + return p^m * s + end + q = div(m,l) + r = m - q*l + cstring(p^q*p[1:chr2ind(p,r)]*s) +end + +function rpad(s::String, n::Integer, p::String) + m = n - strlen(s) + if m <= 0; return s; end + l = strlen(p) + if l==1 + return s * p^m + end + q = div(m,l) + r = m - q*l + cstring(s*p^q*p[1:chr2ind(p,r)]) +end + +lpad(s, n::Integer, p) = lpad(string(s), n, string(p)) +rpad(s, n::Integer, p) = rpad(string(s), n, string(p)) + +lpad(s, n::Integer) = lpad(string(s), n, " ") +rpad(s, n::Integer) = rpad(string(s), n, " ") + +function split(s::String, delims, include_empty::Bool) + i = 1 + strs = String[] + len = length(s) + while true + tokstart = tokend = i + while !done(s,i) + (c,i) = next(s,i) + if contains(delims, c) + break + end + tokend = i + end + tok = s[tokstart:(tokend-1)] + if include_empty || !isempty(tok) + push(strs, tok) + end + if !((i <= len) || (i==len+1 && tokend!=i)) + break + end + end + strs +end + +split(s::String) = split(s, (' ','\t','\n','\v','\f','\r'), false) +split(s::String, x) = split(s, x, true) +split(s::String, x::Char, incl::Bool) = split(s, (x,), incl) + +function print_joined(strings, delim, last) + i = start(strings) + if done(strings,i) + return + end + str, i = next(strings,i) + print(str) + while !done(strings,i) + str, i = next(strings,i) + print(done(strings,i) ? last : delim) + print(str) + end +end + +function print_joined(strings, delim) + i = start(strings) + while !done(strings,i) + str, i = next(strings,i) + print(str) + if !done(strings,i) + print(delim) + end + end +end +print_joined(strings) = print_joined(strings, "") + +join(args...) = print_to_string(print_joined, args...) + +chop(s::String) = s[1:thisind(s,length(s))-1] +chomp(s::String) = (i=thisind(s,length(s)); s[i]=='\n' ? s[1:i-1] : s) +chomp(s::ByteString) = s.data[end]==0x0a ? s[1:end-1] : s + +function lstrip(s::String) + i = start(s) + while !done(s,i) + c, j = next(s,i) + if !iswspace(c) + return s[i:end] + end + i = j + end + "" +end + +function rstrip(s::String) + r = reverse(s) + i = start(r) + while !done(r,i) + c, j = next(r,i) + if !iswspace(c) + return s[1:end-i+1] + end + i = j + end + "" +end + +strip(s::String) = lstrip(rstrip(s)) + +## string to integer functions ## + +function parse_int{T<:Integer}(::Type{T}, s::String, base::Integer) + if !(2 <= base <= 36); error("invalid base: ",base); end + i = start(s) + if done(s,i) + error("premature end of integer (in ",show_to_string(s),")") + end + c,i = next(s,i) + sgn = one(T) + if T <: Signed && c == '-' + sgn = -sgn + if done(s,i) + error("premature end of integer (in ",show_to_string(s),")") + end + c,i = next(s,i) + end + base = convert(T,base) + n::T = 0 + while true + d = '0' <= c <= '9' ? c-'0' : + 'A' <= c <= 'Z' ? c-'A'+10 : + 'a' <= c <= 'z' ? c-'a'+10 : typemax(Int) + if d >= base + error(show_to_string(c)," is not a valid digit (in ",show_to_string(s),")") + end + # TODO: overflow detection? + n = n*base + d + if done(s,i) + break + end + c,i = next(s,i) + end + return flipsign(n,sgn) +end + +parse_int(s::String, base::Integer) = parse_int(Int,s,base) +parse_int(T::Type, s::String) = parse_int(T,s,10) +parse_int(s::String) = parse_int(Int,s,10) + +parse_bin(T::Type, s::String) = parse_int(T,s,2) +parse_oct(T::Type, s::String) = parse_int(T,s,8) +parse_hex(T::Type, s::String) = parse_int(T,s,16) + +parse_bin(s::String) = parse_int(Int,s,2) +parse_oct(s::String) = parse_int(Int,s,8) +parse_hex(s::String) = parse_int(Int,s,16) + +integer (s::String) = int(s) +unsigned(s::String) = uint(s) +int (s::String) = parse_int(Int,s) +uint (s::String) = parse_int(Uint,s) +int8 (s::String) = parse_int(Int8,s) +uint8 (s::String) = parse_int(Uint8,s) +int16 (s::String) = parse_int(Int16,s) +uint16 (s::String) = parse_int(Uint16,s) +int32 (s::String) = parse_int(Int32,s) +uint32 (s::String) = parse_int(Uint32,s) +int64 (s::String) = parse_int(Int64,s) +uint64 (s::String) = parse_int(Uint64,s) + +## integer to string functions ## + +const _jl_dig_syms = "0123456789abcdefghijklmnopqrstuvwxyz".data + +function int2str(n::Union(Int64,Uint64), b::Integer, l::Int) + if b < 2 || b > 36; error("int2str: invalid base ", b); end + neg = n < 0 + n = unsigned(abs(n)) + b = convert(typeof(n), b) + ndig = ndigits(n, b) + sz = max(convert(Int, ndig), l) + neg + data = Array(Uint8, sz) + i = sz + if ispow2(b) + digmask = b-1 + shift = trailing_zeros(b) + while i > neg + ch = n & digmask + data[i] = _jl_dig_syms[int(ch)+1] + n >>= shift + i -= 1 + end + else + while i > neg + ch = n % b + data[i] = _jl_dig_syms[int(ch)+1] + n = div(n,b) + i -= 1 + end + end + if neg + data[1] = '-' + end + ASCIIString(data) +end +int2str(n::Integer, b::Integer) = int2str(n, b, 0) +int2str(n::Integer, b::Integer, l::Int) = int2str(int64(n), b, l) + +string(x::Signed) = dec(int64(x)) +cstring(x::Signed) = dec(int64(x)) + +## string to float functions ## + +function float64_isvalid(s::String, out::Array{Float64,1}) + s = cstring(s) + return (ccall(:jl_strtod, Int32, (Ptr{Uint8},Ptr{Float64}), s, out)==0) +end + +function float32_isvalid(s::String, out::Array{Float32,1}) + s = cstring(s) + return (ccall(:jl_strtof, Int32, (Ptr{Uint8},Ptr{Float32}), s, out)==0) +end + +begin + local tmp::Array{Float64,1} = Array(Float64,1) + local tmpf::Array{Float32,1} = Array(Float32,1) + global float64, float32 + function float64(s::String) + if !float64_isvalid(s, tmp) + throw(ArgumentError("float64(String): invalid number format")) + end + return tmp[1] + end + + function float32(s::String) + if !float32_isvalid(s, tmpf) + throw(ArgumentError("float32(String): invalid number format")) + end + return tmpf[1] + end +end + +float(x::String) = float64(x) +parse_float(x::String) = float64(x) +parse_float(::Type{Float64}, x::String) = float64(x) +parse_float(::Type{Float32}, x::String) = float32(x) + +# copying a byte string (generally not needed due to "immutability") + +strcpy{T<:ByteString}(s::T) = T(copy(s.data)) + +# lexicographically compare byte arrays (used by Latin-1 and UTF-8) + +function lexcmp(a::Array{Uint8,1}, b::Array{Uint8,1}) + c = ccall(:memcmp, Int32, (Ptr{Uint8}, Ptr{Uint8}, Uint), + a, b, min(length(a),length(b))) + c < 0 ? -1 : c > 0 ? +1 : cmp(length(a),length(b)) +end + +# find the index of the first occurrence of a byte value in a byte array + +function memchr(a::Array{Uint8,1}, b::Integer) + p = pointer(a) + q = ccall(:memchr, Ptr{Uint8}, (Ptr{Uint8}, Int32, Uint), p, b, length(a)) + q == C_NULL ? 0 : q - p + 1 +end + +# concatenate byte arrays into a single array + +memcat() = Array(Uint8,0) +memcat(a::Array{Uint8,1}) = copy(a) + +function memcat(arrays::Array{Uint8,1}...) + n = 0 + for a in arrays + n += length(a) + end + arr = Array(Uint8, n) + ptr = pointer(arr) + offset = 0 + for a in arrays + ccall(:memcpy, Ptr{Uint8}, (Ptr{Uint8}, Ptr{Uint8}, Uint), + ptr+offset, a, length(a)) + offset += length(a) + end + return arr +end + +# concatenate the data fields of byte strings + +memcat(s::ByteString) = memcat(s.data) +memcat(sx::ByteString...) = memcat(map(s->s.data, sx)...) |