1 files changed, 298 insertions, 0 deletions
diff --git a/tests/examplefiles/newlisp-parser.lsp b/tests/examplefiles/newlisp-parser.lsp
new file mode 100644
index 00000000..985615b3
--- /dev/null
+++ b/tests/examplefiles/newlisp-parser.lsp
@@ -0,0 +1,298 @@
+#!/usr/bin/env newlisp
+
+;; @module Nlex
+;; @author cormullion
+;; @description  newLISP source code lexer/tokenizer/parser
+;; @location somewhere on github
+;; @version 0.1 of 2011-09-19 08:55:19
+;;<h4>About this module</h4>
+;;<p>The Nlex module is a lexer/tokenizer/parser for newLISP source code.
+;; An expert from StackOverflow xplains:
+;; A tokenizer breaks a stream of text into tokens.
+;; A lexer is basically a tokenizer, but it usually attaches extra context to the tokens.
+;; A parser takes the stream of tokens from the lexer and turns it into an abstract syntax tree representing the program represented by the original text.</p>
+;;<p><b>Usage</b></p>
+;;<p>To tokenize/parse source code stored in symbol 'original, use <b>parse-newlisp</b>, To convert the parsed source tree back to plain source, use <b>nlx-to-plaintext</b>:</p>
+;;<pre>
+;;(letn ((converted    (Nlex:parse-newlisp     original-source)) ; parses 
+;;       (new-original (Nlex:nlx-to-plaintext  converted)))      ; converts back to plain text
+;;</pre>
+;;<p>After this round trip, original-source and new-original should be identical.</p>
+;;<p></p>
+
+(context 'Nlex)
+
+; class variables
+
+(define *cursor*)
+(define *source-length*)
+(define *source-list*)
+(define *depth*)
+(define *tree*)
+(define *loc*)
+
+(define (get-next-char)
+ (let ((nch ""))
+   (if (< *cursor* *source-length*)
+       (begin
+          (set 'nch (*source-list* *cursor*))
+          (inc *cursor* (utf8len nch)))
+       (set 'nch nil))
+   nch))
+
+(define (peek-char)
+ (let ((pch ""))
+   (if (< *cursor* *source-length*)
+       (set 'pch (*source-list* *cursor*))
+       (set 'pch nil))))
+
+(define (char-identifier-first? c)
+  (not (find (lower-case c) [text] #;"'(){}.0123456789[/text])))
+  
+(define (char-identifier? c)
+  (not (find (lower-case c) { "':,()})))
+
+(define (char-numeric-first? c)
+   (find c {123456789+-.0}))
+
+(define (char-numeric? c)
+   (find c {0123456789+-.xXabcdefABracketedCommandDEF}))
+
+(define (char-whitespace? c)
+  (or (= c " ") (= c "\n") (= c "\t")))
+
+(define (open-paren-token)
+  (add-to-parse-tree '(LeftParen "(")))
+
+(define (close-paren-token)
+  (add-to-parse-tree '(RightParen ")")))
+
+(define (read-comment c)
+  (let ((res c) (ch ""))
+     (while (and (!= (set 'ch (get-next-char)) "\n") ch)
+        (push ch res -1))
+    (add-to-parse-tree (list 'Comment (string res "\n")))))
+    
+(define (read-identifier c)
+  (let ((res c) (ch ""))
+   ; look for end of identifier
+    (while (and (not (find (set 'ch (peek-char)) " \"',()\n\t\r")) (!= ch nil))
+      (push (get-next-char) res -1))
+    (add-to-parse-tree (list 'Symbol res))))
+
+(define (read-number-scanner list-so-far)
+    (let ((next-char (peek-char)))
+      ;; if next-char is a digit then recurse
+      (if (and (char-numeric? next-char) next-char)
+          (read-number-scanner (cons (get-next-char) list-so-far))
+          (reverse list-so-far))))
+
+(define (precise-float str)
+; more faithful to original format than newLISP's float?
+  (let ((p "") (q ""))
+    (map set '(p q) (parse str "."))
+    (append p "." q)))
+    
+(define (scientific-float str)
+  (let ((p "") (q ""))
+    (map set '(p q) (parse str "e"))
+    (append p "e" q)))
+
+(define (read-number c)
+  (let ((res '() number-as-string ""))
+     (set 'number-as-string (join (read-number-scanner (list c))))
+     (cond
+       ; try hex first
+       ((starts-with (lower-case number-as-string) "0x")
+          (set 'res  (list 'Hex number-as-string)))
+       ; scientific notation if there's an e
+       ((find "e" (lower-case number-as-string))
+          (set 'res  (list 'Scientific (scientific-float number-as-string))))
+       ; float?
+       ((find "." number-as-string)
+          ; newLISP's float function isn't quite what we want here     
+          (set 'res  (list 'Float (precise-float number-as-string))))
+       ; octal, not hex or float? 017 is OK, 019 is read as 10
+       ((and (starts-with (lower-case number-as-string) "0") 
+             (> (length number-as-string) 1)
+             (empty? (difference (explode number-as-string) (explode "01234567"))))
+          (set 'res (list 'Octal number-as-string)))
+       ; perhaps an integer?  019 is read as 19 ...
+       ((integer? (int number-as-string 0 10))
+         (set 'res  (list 'Integer (int number-as-string 0 10))))
+       ; give up
+       (true
+         (set 'res (list 'NaN "NaN"))))
+  (add-to-parse-tree res)))
+
+(define (read-quote)
+   (add-to-parse-tree '(Quote "'")))
+
+(define (read-quoted-string)
+  (let ((res {}) (ch {}))
+     (while (and (!= (set 'ch (get-next-char)) {"}) ch)
+        (push ch res -1)
+        ; check for backslashed quotes
+        (when (= ch {\}) 
+              (set 'ch (get-next-char))
+              (push ch res -1)))
+    (add-to-parse-tree (list 'QuotedString res))))
+
+(define (read-braced-string)
+  (let ((res "") (ch {}) (level 1)) 
+  ; we've already seen the first { so we're up to level 1
+     (while (> level 0)
+         (set 'ch (get-next-char))
+         (if (= ch "{") (inc level))
+         (if (= ch "}") (dec level))
+         (if (or (< level 0) (= ch nil)) (throw-error (string "error in a braced string at character " *cursor*)))
+         ; don't push final "}"
+         (if (and (> level 0)) (push ch res -1)))
+    (add-to-parse-tree (list 'BracedString res))))
+
+(define (read-bracketed-string ch)
+  (let ((res "") (ch {}))  
+    (cond
+     ; bracketed TEXT?
+     ((= (lower-case (join (slice *source-list* (- *cursor* 1) 6))) "[text]")
+         ; look for final [/text]
+         (inc *cursor* 5)
+         ; look for end
+         (while (and  (< *cursor* (- *source-length* 7)) 
+                      (!= (lower-case (join (*cursor* 7 *source-list*))) "[/text]")
+                      ch)
+                (push (get-next-char) res -1))
+         (inc *cursor* 7)
+         (add-to-parse-tree (list 'BracketedText res)))
+     ; bracketed CMD?
+     ((= (lower-case (join (slice *source-list* (- *cursor* 1) 5))) "[cmd]")
+         ; look for final [/cmd]
+         (inc *cursor* 4)
+         (while (and  (< *cursor* (- *source-length* 6)) 
+                      (!= (lower-case (join (*cursor* 6 *source-list*))) "[/cmd]")
+                      ch)
+                (push (get-next-char) res -1))
+         (inc *cursor* 6)
+         (add-to-parse-tree (list 'BracketedCommand res)))
+     ; must be those weird bracketed identifiers    
+     (true
+         (while (and (!= (set 'ch (get-next-char)) {]}) ch)
+            (push ch res -1)
+            ; check for backslashed quotes
+            (when (= ch {\}) 
+                  (set 'ch (get-next-char))
+                  (push ch res -1)))
+            (add-to-parse-tree (list 'BracketedIdentifier res))))))
+
+(define (read-whitespace ch)
+  (let ((res ch))
+     (while (find (set 'ch (peek-char)) " \n\t")
+        (push (get-next-char) res -1))
+    (add-to-parse-tree (list 'WhiteSpace (base64-enc res)))))
+
+(define (get-token)
+ (let ((first-char (get-next-char)))
+    (if first-char
+      (cond 
+            ; a - or + could be the start of a symbol or a number, so look at the next char
+            ((or (= first-char "-") (= first-char "+"))
+                (if (find (peek-char) "1234567890")
+                    (read-number first-char)
+                    (read-identifier first-char)))
+            ((char-whitespace? first-char)   
+               (read-whitespace first-char))
+            ((= first-char {(})
+               (open-paren-token))
+            ((= first-char {)})
+               (close-paren-token))
+            ((= first-char {#})
+               (read-comment first-char))
+            ((= first-char {;})
+               (read-comment first-char))
+            ((= first-char {"})
+               (read-quoted-string))
+            ((= first-char "{")
+               (read-braced-string))
+            ((= first-char "[")
+               (read-bracketed-string first-char))
+            ((= first-char {'})
+               (read-quote))
+            ((char-numeric-first? first-char)
+               (read-number first-char))
+            ((char-identifier-first? first-char)
+               (read-identifier first-char))
+            (true (throw-error (string "{" first-char "} is an unrecognized token")))))))
+
+(define (add-to-parse-tree token-pair)
+  (let (token (first token-pair))
+  (cond 
+    ((= token 'LeftParen)
+        (inc *depth*)
+        (push '((LeftParen "(")) *tree* *loc*)
+        (push -1 *loc*))
+    ((= token 'RightParen)
+        (push '(RightParen ")") *tree* *loc*)
+        (dec *depth*)
+        (pop *loc*))
+    (true
+        (push token-pair *tree* *loc*)
+        true))))
+
+(define (parse-newlisp src)
+  ; main function: tokenize/lex/parse the string in src
+  (set '*depth* 0 
+       '*tree* '() 
+       '*loc* '(-1) 
+       '*cursor* 0 
+       '*source-list*   (explode src) 
+       '*source-length* (utf8len src)
+       '*source-length* (length *source-list*))
+  (while (< *cursor* *source-length*)
+         (get-token))
+  *tree*)
+
+(define (nlx-to-plaintext nlx (depth 0))
+   (if (= depth 0) (set 'buff {})) ; if first pass, initialize a buffer
+   (dolist (element nlx)
+    (set 'token-type (first element) 'token-value (last element))
+    (if (atom? token-type)
+        (cond 
+           ((= token-type 'LeftParen) ; left parenthesis
+                (extend buff {(}))
+           ((= token-type 'RightParen) ; right parenthesis
+                (extend buff {)}))
+           ((= token-type 'WhiteSpace) ; whitespace
+                (dostring (s (base64-dec token-value)) 
+                  (extend buff (string (char s)))))
+           ((= token-type 'BracedString) ; braced string
+                (extend buff (string  "{" token-value "}")))
+           ((= token-type 'QuotedString) ; quoted string
+                (extend buff (string  {"} token-value {"})))
+           ((= token-type 'BracketedText) ; bracketed  text
+                (extend buff (string  {[text]} token-value {[/text]})))        
+           ((= token-type 'Quote); quote
+                (extend buff (string  "'")))
+           ((= token-type 'Comment) ; comment
+                (extend buff (string (last element) "\n")))
+           ((= token-type 'Integer) ; int
+                (extend buff (string (int (last element)))))
+           ((= token-type 'Float) ; float
+                (extend buff (string (precise-float (last element)))))  
+           ((= token-type 'Scientific) ; scientific notation
+                (extend buff (scientific-float (last element))))  
+           ((= token-type 'BracketedCommand) ; bracketed command
+               (extend buff (string {[cmd]} (last element) {[/cmd]})))
+           ((or 
+                (= token-type 'Symbol) ; close parenthesis
+                (= token-type 'Hex) ; hex
+                (= token-type 'NaN) ; not a number
+                (= token-type 'Octal) ; octal
+                )
+                (extend buff (string (last element))))
+           ((= token-type 'BracketedIdentifier) ; bracketed identifier
+                (extend buff (string {[} (last element) {]}))))
+        ; not an atom, so recurse but don't initialize buffer
+        (nlx-to-plaintext element 1)))
+   buff)
+
+;eof