summaryrefslogtreecommitdiff
path: root/tests/examplefiles/newlisp-parser.lsp
diff options
context:
space:
mode:
Diffstat (limited to 'tests/examplefiles/newlisp-parser.lsp')
-rw-r--r--tests/examplefiles/newlisp-parser.lsp298
1 files changed, 298 insertions, 0 deletions
diff --git a/tests/examplefiles/newlisp-parser.lsp b/tests/examplefiles/newlisp-parser.lsp
new file mode 100644
index 00000000..985615b3
--- /dev/null
+++ b/tests/examplefiles/newlisp-parser.lsp
@@ -0,0 +1,298 @@
+#!/usr/bin/env newlisp
+
+;; @module Nlex
+;; @author cormullion
+;; @description newLISP source code lexer/tokenizer/parser
+;; @location somewhere on github
+;; @version 0.1 of 2011-09-19 08:55:19
+;;<h4>About this module</h4>
+;;<p>The Nlex module is a lexer/tokenizer/parser for newLISP source code.
+;; An expert from StackOverflow xplains:
+;; A tokenizer breaks a stream of text into tokens.
+;; A lexer is basically a tokenizer, but it usually attaches extra context to the tokens.
+;; A parser takes the stream of tokens from the lexer and turns it into an abstract syntax tree representing the program represented by the original text.</p>
+;;<p><b>Usage</b></p>
+;;<p>To tokenize/parse source code stored in symbol 'original, use <b>parse-newlisp</b>, To convert the parsed source tree back to plain source, use <b>nlx-to-plaintext</b>:</p>
+;;<pre>
+;;(letn ((converted (Nlex:parse-newlisp original-source)) ; parses
+;; (new-original (Nlex:nlx-to-plaintext converted))) ; converts back to plain text
+;;</pre>
+;;<p>After this round trip, original-source and new-original should be identical.</p>
+;;<p></p>
+
+(context 'Nlex)
+
+; class variables
+
+(define *cursor*)
+(define *source-length*)
+(define *source-list*)
+(define *depth*)
+(define *tree*)
+(define *loc*)
+
+(define (get-next-char)
+ (let ((nch ""))
+ (if (< *cursor* *source-length*)
+ (begin
+ (set 'nch (*source-list* *cursor*))
+ (inc *cursor* (utf8len nch)))
+ (set 'nch nil))
+ nch))
+
+(define (peek-char)
+ (let ((pch ""))
+ (if (< *cursor* *source-length*)
+ (set 'pch (*source-list* *cursor*))
+ (set 'pch nil))))
+
+(define (char-identifier-first? c)
+ (not (find (lower-case c) [text] #;"'(){}.0123456789[/text])))
+
+(define (char-identifier? c)
+ (not (find (lower-case c) { "':,()})))
+
+(define (char-numeric-first? c)
+ (find c {123456789+-.0}))
+
+(define (char-numeric? c)
+ (find c {0123456789+-.xXabcdefABracketedCommandDEF}))
+
+(define (char-whitespace? c)
+ (or (= c " ") (= c "\n") (= c "\t")))
+
+(define (open-paren-token)
+ (add-to-parse-tree '(LeftParen "(")))
+
+(define (close-paren-token)
+ (add-to-parse-tree '(RightParen ")")))
+
+(define (read-comment c)
+ (let ((res c) (ch ""))
+ (while (and (!= (set 'ch (get-next-char)) "\n") ch)
+ (push ch res -1))
+ (add-to-parse-tree (list 'Comment (string res "\n")))))
+
+(define (read-identifier c)
+ (let ((res c) (ch ""))
+ ; look for end of identifier
+ (while (and (not (find (set 'ch (peek-char)) " \"',()\n\t\r")) (!= ch nil))
+ (push (get-next-char) res -1))
+ (add-to-parse-tree (list 'Symbol res))))
+
+(define (read-number-scanner list-so-far)
+ (let ((next-char (peek-char)))
+ ;; if next-char is a digit then recurse
+ (if (and (char-numeric? next-char) next-char)
+ (read-number-scanner (cons (get-next-char) list-so-far))
+ (reverse list-so-far))))
+
+(define (precise-float str)
+; more faithful to original format than newLISP's float?
+ (let ((p "") (q ""))
+ (map set '(p q) (parse str "."))
+ (append p "." q)))
+
+(define (scientific-float str)
+ (let ((p "") (q ""))
+ (map set '(p q) (parse str "e"))
+ (append p "e" q)))
+
+(define (read-number c)
+ (let ((res '() number-as-string ""))
+ (set 'number-as-string (join (read-number-scanner (list c))))
+ (cond
+ ; try hex first
+ ((starts-with (lower-case number-as-string) "0x")
+ (set 'res (list 'Hex number-as-string)))
+ ; scientific notation if there's an e
+ ((find "e" (lower-case number-as-string))
+ (set 'res (list 'Scientific (scientific-float number-as-string))))
+ ; float?
+ ((find "." number-as-string)
+ ; newLISP's float function isn't quite what we want here
+ (set 'res (list 'Float (precise-float number-as-string))))
+ ; octal, not hex or float? 017 is OK, 019 is read as 10
+ ((and (starts-with (lower-case number-as-string) "0")
+ (> (length number-as-string) 1)
+ (empty? (difference (explode number-as-string) (explode "01234567"))))
+ (set 'res (list 'Octal number-as-string)))
+ ; perhaps an integer? 019 is read as 19 ...
+ ((integer? (int number-as-string 0 10))
+ (set 'res (list 'Integer (int number-as-string 0 10))))
+ ; give up
+ (true
+ (set 'res (list 'NaN "NaN"))))
+ (add-to-parse-tree res)))
+
+(define (read-quote)
+ (add-to-parse-tree '(Quote "'")))
+
+(define (read-quoted-string)
+ (let ((res {}) (ch {}))
+ (while (and (!= (set 'ch (get-next-char)) {"}) ch)
+ (push ch res -1)
+ ; check for backslashed quotes
+ (when (= ch {\})
+ (set 'ch (get-next-char))
+ (push ch res -1)))
+ (add-to-parse-tree (list 'QuotedString res))))
+
+(define (read-braced-string)
+ (let ((res "") (ch {}) (level 1))
+ ; we've already seen the first { so we're up to level 1
+ (while (> level 0)
+ (set 'ch (get-next-char))
+ (if (= ch "{") (inc level))
+ (if (= ch "}") (dec level))
+ (if (or (< level 0) (= ch nil)) (throw-error (string "error in a braced string at character " *cursor*)))
+ ; don't push final "}"
+ (if (and (> level 0)) (push ch res -1)))
+ (add-to-parse-tree (list 'BracedString res))))
+
+(define (read-bracketed-string ch)
+ (let ((res "") (ch {}))
+ (cond
+ ; bracketed TEXT?
+ ((= (lower-case (join (slice *source-list* (- *cursor* 1) 6))) "[text]")
+ ; look for final [/text]
+ (inc *cursor* 5)
+ ; look for end
+ (while (and (< *cursor* (- *source-length* 7))
+ (!= (lower-case (join (*cursor* 7 *source-list*))) "[/text]")
+ ch)
+ (push (get-next-char) res -1))
+ (inc *cursor* 7)
+ (add-to-parse-tree (list 'BracketedText res)))
+ ; bracketed CMD?
+ ((= (lower-case (join (slice *source-list* (- *cursor* 1) 5))) "[cmd]")
+ ; look for final [/cmd]
+ (inc *cursor* 4)
+ (while (and (< *cursor* (- *source-length* 6))
+ (!= (lower-case (join (*cursor* 6 *source-list*))) "[/cmd]")
+ ch)
+ (push (get-next-char) res -1))
+ (inc *cursor* 6)
+ (add-to-parse-tree (list 'BracketedCommand res)))
+ ; must be those weird bracketed identifiers
+ (true
+ (while (and (!= (set 'ch (get-next-char)) {]}) ch)
+ (push ch res -1)
+ ; check for backslashed quotes
+ (when (= ch {\})
+ (set 'ch (get-next-char))
+ (push ch res -1)))
+ (add-to-parse-tree (list 'BracketedIdentifier res))))))
+
+(define (read-whitespace ch)
+ (let ((res ch))
+ (while (find (set 'ch (peek-char)) " \n\t")
+ (push (get-next-char) res -1))
+ (add-to-parse-tree (list 'WhiteSpace (base64-enc res)))))
+
+(define (get-token)
+ (let ((first-char (get-next-char)))
+ (if first-char
+ (cond
+ ; a - or + could be the start of a symbol or a number, so look at the next char
+ ((or (= first-char "-") (= first-char "+"))
+ (if (find (peek-char) "1234567890")
+ (read-number first-char)
+ (read-identifier first-char)))
+ ((char-whitespace? first-char)
+ (read-whitespace first-char))
+ ((= first-char {(})
+ (open-paren-token))
+ ((= first-char {)})
+ (close-paren-token))
+ ((= first-char {#})
+ (read-comment first-char))
+ ((= first-char {;})
+ (read-comment first-char))
+ ((= first-char {"})
+ (read-quoted-string))
+ ((= first-char "{")
+ (read-braced-string))
+ ((= first-char "[")
+ (read-bracketed-string first-char))
+ ((= first-char {'})
+ (read-quote))
+ ((char-numeric-first? first-char)
+ (read-number first-char))
+ ((char-identifier-first? first-char)
+ (read-identifier first-char))
+ (true (throw-error (string "{" first-char "} is an unrecognized token")))))))
+
+(define (add-to-parse-tree token-pair)
+ (let (token (first token-pair))
+ (cond
+ ((= token 'LeftParen)
+ (inc *depth*)
+ (push '((LeftParen "(")) *tree* *loc*)
+ (push -1 *loc*))
+ ((= token 'RightParen)
+ (push '(RightParen ")") *tree* *loc*)
+ (dec *depth*)
+ (pop *loc*))
+ (true
+ (push token-pair *tree* *loc*)
+ true))))
+
+(define (parse-newlisp src)
+ ; main function: tokenize/lex/parse the string in src
+ (set '*depth* 0
+ '*tree* '()
+ '*loc* '(-1)
+ '*cursor* 0
+ '*source-list* (explode src)
+ '*source-length* (utf8len src)
+ '*source-length* (length *source-list*))
+ (while (< *cursor* *source-length*)
+ (get-token))
+ *tree*)
+
+(define (nlx-to-plaintext nlx (depth 0))
+ (if (= depth 0) (set 'buff {})) ; if first pass, initialize a buffer
+ (dolist (element nlx)
+ (set 'token-type (first element) 'token-value (last element))
+ (if (atom? token-type)
+ (cond
+ ((= token-type 'LeftParen) ; left parenthesis
+ (extend buff {(}))
+ ((= token-type 'RightParen) ; right parenthesis
+ (extend buff {)}))
+ ((= token-type 'WhiteSpace) ; whitespace
+ (dostring (s (base64-dec token-value))
+ (extend buff (string (char s)))))
+ ((= token-type 'BracedString) ; braced string
+ (extend buff (string "{" token-value "}")))
+ ((= token-type 'QuotedString) ; quoted string
+ (extend buff (string {"} token-value {"})))
+ ((= token-type 'BracketedText) ; bracketed text
+ (extend buff (string {[text]} token-value {[/text]})))
+ ((= token-type 'Quote); quote
+ (extend buff (string "'")))
+ ((= token-type 'Comment) ; comment
+ (extend buff (string (last element) "\n")))
+ ((= token-type 'Integer) ; int
+ (extend buff (string (int (last element)))))
+ ((= token-type 'Float) ; float
+ (extend buff (string (precise-float (last element)))))
+ ((= token-type 'Scientific) ; scientific notation
+ (extend buff (scientific-float (last element))))
+ ((= token-type 'BracketedCommand) ; bracketed command
+ (extend buff (string {[cmd]} (last element) {[/cmd]})))
+ ((or
+ (= token-type 'Symbol) ; close parenthesis
+ (= token-type 'Hex) ; hex
+ (= token-type 'NaN) ; not a number
+ (= token-type 'Octal) ; octal
+ )
+ (extend buff (string (last element))))
+ ((= token-type 'BracketedIdentifier) ; bracketed identifier
+ (extend buff (string {[} (last element) {]}))))
+ ; not an atom, so recurse but don't initialize buffer
+ (nlx-to-plaintext element 1)))
+ buff)
+
+;eof