#!/usr/bin/env newlisp ;; @module markdown ;; @author cormullion ;; @description a port of John Gruber's Markdown to newLISP ;; @location http://unbalanced-parentheses.nfshost.com/ ;; @version of date 2011-10-02 22:36:02 ;; version history: at the end ;; a port of John Gruber's Markdown.pl (http://daringfireball.net/markdown) script to newLISP... ;; see his original Perl script for explanations of the fearsome regexen and ;; byzantine logic, etc... ;; TODO: ;; the following Markdown tests fail: ;; Inline HTML (Advanced) ... FAILED ;; Links, reference style ... FAILED -- nested brackets ;; Links, shortcut references ... FAILED ;; Markdown Documentation - Syntax ... FAILED ;; Ordered and unordered lists ... FAILED -- a nested ordered list error ;; parens in url : ![this is a stupid URL](http://example.com/(parens).jpg) see (Images.text) ;; Add: email address scrambling (context 'Hash) (define HashTable:HashTable) (define (build-escape-table) (set '*escape-chars* [text]\`*_{}[]()>#+-.![/text]) (dolist (c (explode *escape-chars*)) (HashTable c (hash c)))) (define (init-hash txt) ; finds a hash identifier that doesn't occur anywhere in the text (set 'counter 0) (set 'hash-prefix "HASH") (set 'hash-id (string hash-prefix counter)) (do-while (find hash-id txt) (set 'hash-id (string hash-prefix (inc counter)))) (Hash:build-escape-table)) (define (hash s) (HashTable s (string hash-id (inc counter)))) (context 'markdown) (define (markdown:markdown txt) (initialize) (Hash:init-hash txt) (unescape-special-chars (block-transforms (strip-link-definitions (protect (cleanup txt)))))) (define (initialize) (set '*escape-pairs* '( ({\\\\} {\}) ({\\`} {`}) ({\\\*} {*}) ({\\_} {_}) ([text]\\\{[/text] [text]{[/text]) ([text]\\\}[/text] [text]}[/text]) ({\\\[} {[}) ({\\\]} {]}) ({\\\(} {(}) ({\\\)} {)}) ({\\>} {>}) ({\\\#} {#}) ({\\\+} {+}) ({\\\-} {-}) ({\\\.} {.}) ({\\!} {!}))) (set '*hashed-html-blocks* '()) (set '*list-level* 0)) (define (block-transforms txt) (form-paragraphs (protect (block-quotes (code-blocks (lists (horizontal-rules (headers txt)))))))) (define (span-transforms txt) (line-breaks (emphasis (amps-and-angles (auto-links (anchors (images (escape-special-chars (escape-special-chars (code-spans txt) 'inside-attributes))))))))) (define (tokenize-html xhtml) ; return list of tag/text portions of xhtml text (letn ( (tag-match [text]((?s:)| (?s:<\?.*?\?>)| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>]| (?:<[a-z/!$](?:[^<>])*>))*>))*>))*>))*>))*>))[/text]) ; yeah, well... (str xhtml) (len (length str)) (pos 0) (tokens '())) (while (set 'tag-start (find tag-match str 8)) (if (< pos tag-start) (push (list 'text (slice str pos (- tag-start pos))) tokens -1)) (push (list 'tag $0) tokens -1) (set 'str (slice str (+ tag-start (length $0)))) (set 'pos 0)) ; leftovers (if (< pos len) (push (list 'text (slice str pos (- len pos))) tokens -1)) tokens)) (define (escape-special-chars txt (within-tag-attributes nil)) (let ((temp (tokenize-html txt)) (new-text {})) (dolist (pair temp) (if (= (first pair) 'tag) ; 'tag (begin (set 'new-text (replace {\\} (last pair) (HashTable {\\}) 0)) (replace [text](?<=.)(?=.)[/text] new-text (HashTable {`}) 0) (replace {\*} new-text (HashTable {*}) 0) (replace {_} new-text (HashTable {_} ) 0)) ; 'text (if within-tag-attributes (set 'new-text (last pair)) (set 'new-text (encode-backslash-escapes (last pair))))) (setf (temp $idx) (list (first pair) new-text))) ; return as text (join (map last temp)))) (define (encode-backslash-escapes t) (dolist (pair *escape-pairs*) (replace (first pair) t (HashTable (last pair)) 14))) (define (encode-code s) ; encode/escape certain characters inside Markdown code runs (replace {&} s "&" 0) (replace {<} s "<" 0) (replace {>} s ">" 0) (replace {\*} s (HashTable {\\}) 0) (replace {_} s (HashTable {_}) 0) (replace "{" s (HashTable "{") 0) (replace {\[} s (HashTable {[}) 0) (replace {\]} s (HashTable {]}) 0) (replace {\\} s (HashTable "\\") 0)) (define (code-spans s) (replace {(?} (encode-code (trim $2)) {}) 2)) (define (encode-alt s) (replace {&} s "&" 0) (replace {"} s """ 0)) (define (images txt) (let ((alt-text {}) (url {}) (title {}) (ref-regex {(!\[(.*?)\][ ]?(?:\n[ ]*)?\[(.*?)\])}) (inline-regex {(!\[(.*?)\]\([ \t]*?[ \t]*((['"])(.*?)\5[ \t]*)?\))}) (whole-match {}) (result {}) (id-ref {}) (url {})) ; reference links ![alt text][id] (replace ref-regex txt (begin (set 'whole-match $1 'alt-text $2 'id-ref $3) (if alt-text (replace {"} alt-text {"} 0)) (if (empty? id-ref) (set 'id-ref (lower-case alt-text))) (if (lookup id-ref *link-database*) (set 'url (first (lookup id-ref *link-database*))) (set 'url nil)) (if url (begin (replace {\*} url (HashTable {*}) 0) (replace {_} url (HashTable {_}) 0) )) (if (last (lookup id-ref *link-database*)) ; title (begin (set 'title (last (lookup id-ref *link-database*))) (replace {"} title {"} 0) (replace {\*} title (HashTable {*}) 0) (replace {_} title (HashTable {_}) 0)) ; no title (set 'title {}) ) (if url (set 'result (string {} 
          alt-text {})) (set 'result whole-match)) ) 0 ) ; inline image refs: ![alt text](url "optional title") (replace inline-regex txt (begin (set 'whole-match $1) (set 'alt-text $2) (set 'url $3) (set 'title $6) (if alt-text (replace {"} alt-text {"} 0) (set 'alt-text {})) (if title (begin (replace {"} title {"} 0) (replace {\*} title (HashTable {*}) 0) (replace {_} title (HashTable {_}) 0)) (set 'title {})) (replace {\*} url (HashTable {*}) 0) (replace {_} url (HashTable {_}) 0) (string {} 
           alt-text {}) ) 0 ) ; empty ones are possible (set '$1 {}) (replace {!\[(.*?)\]\([ \t]*\)} txt (string {} $1 {}) 0))) (define (make-anchor link-text id-ref ) ; Link defs are in the form: ^[id]: url "optional title" ; stored in link db list as (id (url title)) ; params are text to be linked and the id of the link in the db ; eg bar 1 for [bar][1] (let ((title {}) (id id-ref) (url nil)) (if link-text (begin (replace {"} link-text {"} 0) (replace {\n} link-text { } 0) (replace {[ ]?\n} link-text { } 0))) (if (null? id ) (set 'id (lower-case link-text))) (if (not (nil? (lookup id *link-database*))) (begin (set 'url (first (lookup id *link-database*))) (replace {\*} url (HashTable {*}) 0) (replace {_} url (HashTable {_}) 0) (if (set 'title (last (lookup id *link-database*))) (begin (replace {"} title {"} 0) (replace {\*} title (HashTable {*}) 0) (replace {_} title (HashTable {_}) 0)) (set 'title {}))) (set 'url nil)) (if url (string {} link-text {}) (string {[} link-text {][} id-ref {]})))) (define (anchors txt) (letn ((nested-brackets {(?>[^\[\]]+)*}) (ref-link-regex (string {(\[(} nested-brackets {)\][ ]?(?:\n[ ]*)?\[(.*?)\])})) (inline-regex {(\[(.*?)\]\([ ]*?[ ]*((['"])(.*?)\5[ \t]*)?\))}) (link-text {}) (url {}) (title {})) ; reference-style links: [link text] [id] (set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {}) ; i still don't think I should have to do this... ; what about this regex instead? (set 'ref-link-regex {(\[(.*?)\][ ]?\[(.*?)\])}) (replace ref-link-regex txt (make-anchor $2 $3) 8) ; $2 is link text, $3 is id ; inline links: [link text](url "optional title") (set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {}) (replace inline-regex txt (begin (set 'link-text $2) (set 'url $3) (set 'title $6) (if link-text (replace {"} link-text {"} 0)) (if title (begin (replace {"} title {"} 0) (replace {\*} title (HashTable {*}) 0) (replace {_} title (HashTable {_}) 0)) (set 'title {})) (replace {\*} url (HashTable {*}) 0) (replace {_} url (HashTable {_}) 0) (replace {^<(.*)>$} url $1 0) (string {} link-text {} )) 8 ) ; replace ) txt) (define (auto-links txt) (replace [text]<((https?|ftp):[^'">\s]+)>[/text] txt (string {} $1 {}) 0 ) ; to-do: email ... ) (define (amps-and-angles txt) ; Smart processing for ampersands and angle brackets (replace [text]&(?!\#?[xX]?(?:[0-9a-fA-F]+|\w+);)[/text] txt {&} 10 ) (replace [text]<(?![a-z/?\$!])[/text] txt {<} 10)) (define (emphasis txt) ; italics/bold: strong first (replace [text] (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 [/text] txt (string {} $2 {}) 8 ) (replace [text] (\*|_) (?=\S) (.+?) (?<=\S) \1 [/text] txt (string {} $2 {}) 8 )) (define (line-breaks txt) ; handles line break markers (replace " {2,}\n" txt "
\n" 0)) (define (hex-str-to-unicode-char strng) ; given a five character string, assume it's "U" + 4 hex chars and convert ; return the character... (char (int (string "0x" (1 strng)) 0 16))) (define (ustring s) ; any four digit string preceded by U (replace "U[0-9a-f]{4,}" s (hex-str-to-unicode-char $0) 0)) (define (cleanup txt) ; cleanup the text by normalizing some possible variations (replace "\r\n|\r" txt "\n" 0) ; standardize line ends (push "\n\n" txt -1) ; end with two returns (set 'txt (detab txt)) ; convert tabs to spaces ; convert inline Unicode: (set 'txt (ustring txt)) (replace "\n[ \t]+\n" txt "\n\n" 0) ; lines with only spaces and tabs ) (define (protect txt) ; protect or "hash html blocks" (letn ((nested-block-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del)\b(.*\n)*?[ \t]*(?=\n+|\Z))[/text]) (liberal-tag-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math)\b(.*\n)*?.*[ \t]*(?=\n+|\Z))[/text]) (hr-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}<(hr)\b([^<>])*?/?>[ \t]*(?=\n{2,}|\Z))[/text]) (html-comment-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}(?s:)[ \t]*(?=\n{2,}|\Z))[/text]) (results '()) (chunk-count (length (set 'chunks (parse txt "\n\n")))) (chunk-size 500)) ; due to a limitation in PCRE, long sections have to be divided up otherwise we'll crash ; so divide up long texts into chunks, then do the regex on each chunk ; not an ideal solution, but it works ok :( (for (i 0 chunk-count chunk-size) ; do a chunk (set 'text-chunk (join (i (- (min chunk-count (- (+ i chunk-size) 1)) i) chunks) "\n\n")) (dolist (rgx (list nested-block-regex liberal-tag-regex hr-regex html-comment-regex)) (replace rgx text-chunk (begin (set 'key (Hash:hash $1)) (push (list key $1 ) *hashed-html-blocks* -1) (string "\n\n" key "\n\n")) 2)) ; save this partial result (push text-chunk results -1) ) ; for ; return string result (join results "\n\n"))) (define (unescape-special-chars t) ; Swap back in all the special characters we've hidden. (dolist (pair (HashTable)) (replace (last pair) t (first pair) 10)) t) (define (strip-link-definitions txt) ; strip link definitions from the text and store them ; Link defs are in the form: ^[id]: url "optional title" ; stored in link db list as (id (url title)) (let ((link-db '()) (url {}) (id {}) (title {})) (replace [text]^[ ]{0,3}\[(.+)\]:[ \t]*\n?[ \t]*?[ \t]*\n?[ \t]*(?:(?<=\s)["(](.+?)[")][ \t]*)?(?:\n+|\Z)[/text] txt (begin (set 'id (lower-case $1) 'url (amps-and-angles $2) 'title $3) (if title (replace {"} title {"} 0)) (push (list id (list url title)) link-db) (set '$3 {}) ; necessary? (string {}) ; remove from text ) 10) (set '*link-database* link-db) txt)) (define (horizontal-rules txt) (replace [text]^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$[/text] txt "\n
" 14) (replace [text]^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$[/text] txt "\n
" 14) (replace [text]^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$[/text] txt "\n
" 14)) (define (headers txt) ; setext headers (let ((level 1)) (replace [text]^(.+)[ \t]*\n=+[ \t]*\n+[/text] txt (string "

" (span-transforms $1) "

\n\n") 2) (replace [text]^(.+)[ \t]*\n-+[ \t]*\n+[/text] txt (string "

" (span-transforms $1) "

\n\n") 2) ; atx headers (replace [text]^(\#{1,6})\s*(.+?)[ ]*\#*(\n+)[/text] txt (begin (set 'level (length $1)) (string "" (span-transforms $2) "\n\n") ) 2))) (define (lists txt) (letn ((marker-ul {[*+-]}) (marker-ol {\d+[.]}) (marker-any (string {(?:} marker-ul {|} marker-ol {)})) (whole-list-regex (string [text](([ ]{0,3}([/text] marker-any [text])[ \t]+)(?s:.+?)(\z|\n{2,}(?=\S)(?![ \t]*[/text] marker-any [text][ \t]+)))[/text])) (my-list {}) (list-type {}) (my-result {})) (replace (if (> *list-level* 0) (string {^} whole-list-regex) (string {(?:(?<=\n\n)|\A\n?)} whole-list-regex)) txt (begin (set 'my-list $1) (if (find $3 marker-ul) (set 'list-type "ul" 'marker-type marker-ul) (set 'list-type "ol" 'marker-type marker-ol)) (replace [text]\n{2,}[/text] my-list "\n\n\n" 0) (set 'my-result (process-list-items my-list marker-any)) (replace {\s+$} my-result {} 0) (string {<} list-type {>} "\n" my-result "\n" {} "\n")) 10 ; must be multiline ))) (define (process-list-items list-text marker-any) (let ((list-regex (string [text](\n)?(^[ \t]*)([/text] marker-any [text])[ \t]+((?s:.+?)(\n{1,2}))(?=\n*(\z|\2([/text] marker-any [text])[ \t]+))[/text])) (item {}) (leading-line {}) (leading-space {}) (result {})) (inc *list-level*) (replace [text]\n{2,}\z[/text] list-text "\n" 0) (set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {}) (replace list-regex list-text (begin (set 'item $4) (set 'leading-line $1) (set 'leading-space $2) (if (or (not (empty? leading-line)) (ends-with item "\n{2,}" 0)) (set 'item (block-transforms (outdent item))) ; recurse for sub lists (begin (set 'item (lists (outdent item))) (set 'item (span-transforms (trim item "\n"))) )) (string {
  • } item {
  • } "\n")) 10) (dec *list-level*) list-text)) (define (code-blocks txt) (let ((code-block {}) (token-list '())) (replace [text](?:\n\n|\A)((?:(?:[ ]{4}|\t).*\n+)+)((?=^[ ]{0,3}\S)|\Z)[/text] txt (begin (set 'code-block $1) ; format if Nestor module is loaded and it's not marked as plain (if (and (not (starts-with code-block " ;plain\n")) (context? Nestor)) ; format newlisp (begin ; remove flag if present (replace "[ ]{4};newlisp\n" code-block {} 0) (set 'code-block (protect (Nestor:nlx-to-html (Nestor:my-read (trim (detab (outdent code-block)) "\n"))))) code-block) ; don't format (begin ; trim leading and trailing newlines (replace "[ ]{4};plain\n" code-block {} 0) (set 'code-block (trim (detab (encode-code (outdent code-block))) "\n")) (set '$1 {}) (set 'code-block (string "\n\n
    " code-block "\n
    \n\n"))))) 10))) (define (block-quotes txt) (let ((block-quote {})) (replace [text]((^[ \t]*>[ \t]?.+\n(.+\n)*\n*)+)[/text] txt (begin (set 'block-quote $1) (replace {^[ ]*>[ ]?} block-quote {} 2) (replace {^[ ]+$} block-quote {} 2) (set 'block-quote (block-transforms block-quote)) ; recurse ; remove leading spaces (replace {(\s*
    .+?
    )} block-quote (trim $1) 2) (string "
    \n" block-quote "\n
    \n\n")) 2))) (define (outdent s) (replace [text]^(\t|[ ]{1,4})[/text] s {} 2)) (define (detab s) (replace [text](.*?)\t[/text] s (string $1 (dup { } (- 4 (% (length $1) 4)))) 2)) (define (form-paragraphs txt) (let ((grafs '()) (original nil)) (set 'txt (trim txt "\n")) ; strip blank lines before and after (set 'grafs (parse txt "\n{2,}" 0)) ; split (dolist (p grafs) (if (set 'original (lookup p *hashed-html-blocks*)) ; html blocks (setf (grafs $idx) original) ; wrap

    tags round everything else (setf (grafs $idx) (string {

    } (replace {^[ ]*} (span-transforms p) {} (+ 4 8 16)) {

    })))) (join grafs "\n\n"))) [text] ; three command line arguments: let's hope last one is a file (when (= 3 (length (main-args))) (println (markdown (read-file (main-args 2)))) (exit)) ; hack for command-line and module loading (set 'level (sys-info 3)) ; if level is 2, then we're probably invoking markdown.lsp directly ; if level is > 3, then we're probably loading it into another script... (when (= level 2) ; running on command line, read STDIN and execute: (while (read-line) (push (current-line) *stdin* -1)) (println (markdown (join *stdin* "\n"))) (exit)) [/text] ;; version 2011-09-16 16:31:29 ;; Changed to different hash routine. Profiling shows that hashing takes 40% of the execution time. ;; Unfortunately this new version is only very slightly faster. ;; Command-line arguments hack in previous version doesn't work. ;; ;; version 2011-08-18 15:04:40 ;; various fixes, and added hack for running this from the command-line: ;; echo "hi there" | newlisp markdown.lsp ;; echo "hello world" | markdown.lsp ;; cat file.text | newlisp markdown.lsp ;; ;; version 2010-11-14 17:34:52 ;; some problems in ustring. Probably remove it one day, as it's non standard... ;; ;; version 2010-10-14 18:41:38 ;; added code to work round PCRE crash in (protect ... ;; ;; version date 2010-07-10 22:20:25 ;; modified call to 'read' since lutz has changed it ;; ;; version date 2009-11-16 22:10:10 ;; fixed bug in tokenize.html ;; ;; version date 2008-10-08 18:44:46 ;; changed nth-set to setf to be version-10 ready. ;; This means that now this script will NOT work with ;; earlier versions of newLISP!!!!!!!!!!! ;; requires Nestor if you want source code colouring... ;; ;; version date 2008-08-08 16:54:56 ;; changed (unless to (if (not ... :( ;; ;; version date 2008-07-20 14:!2:29 ;; added hex-str-to-unicode-char ustring ;; ;; version date 2008-03-07 15:36:09 ;; fixed load error ;; ;; version date 2007-11-17 16:20:57 ;; added syntax colouring module ;; ;; version date 2007-11-14 09:19:42 ;; removed reliance on dostring for compatibility with 9.1 ; eof