;;;; (sxml simple) -- a simple interface to the SSAX parser ;;;; ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc. ;;;; Modified 2004 by Andy Wingo . ;;;; Originally written by Oleg Kiselyov as SXML-to-HTML.scm. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public ;;;; License as published by the Free Software Foundation; either ;;;; version 3 of the License, or (at your option) any later version. ;;;; ;;;; This library is distributed in the hope that it will be useful, ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;;; Lesser General Public License for more details. ;;;; ;;;; You should have received a copy of the GNU Lesser General Public ;;;; License along with this library; if not, write to the Free Software ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;;;; ;;; Commentary: ;; ;;A simple interface to XML parsing and serialization. ;; ;;; Code: (define-module (sxml simple) #:use-module (sxml ssax input-parse) #:use-module (sxml ssax) #:use-module (sxml transform) #:use-module (ice-9 match) #:export (xml->sxml sxml->xml sxml->string)) ;; Helpers from upstream/SSAX.scm. ;; ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS ; given the list of fragments (some of which are text strings) ; reverse the list and concatenate adjacent text strings. ; We can prove from the general case below that if LIST-OF-FRAGS ; has zero or one element, the result of the procedure is equal? ; to its argument. This fact justifies the shortcut evaluation below. (define (ssax:reverse-collect-str fragments) (cond ((null? fragments) '()) ; a shortcut ((null? (cdr fragments)) fragments) ; see the comment above (else (let loop ((fragments fragments) (result '()) (strs '())) (cond ((null? fragments) (if (null? strs) result (cons (string-concatenate/shared strs) result))) ((string? (car fragments)) (loop (cdr fragments) result (cons (car fragments) strs))) (else (loop (cdr fragments) (cons (car fragments) (if (null? strs) result (cons (string-concatenate/shared strs) result))) '()))))))) (define (read-internal-doctype-as-string port) (string-concatenate/shared (let loop () (let ((fragment (next-token '() '(#\]) "reading internal DOCTYPE" port))) (if (eqv? #\> (peek-next-char port)) (begin (read-char port) (cons fragment '())) (cons* fragment "]" (loop))))))) ;; Ideas for the future for this interface: ;; ;; * Allow doctypes to provide parsed entities ;; ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler ;; below) ;; ;; * Parse internal DTDs ;; ;; * Parse external DTDs ;; (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key (namespaces '()) (declare-namespaces? #t) (trim-whitespace? #f) (entities '()) (default-entity-handler #f) (doctype-handler #f)) "Use SSAX to parse an XML document into SXML. Takes one optional argument, @var{string-or-port}, which defaults to the current input port." ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix ;; that the user wants on elements of a given namespace in the ;; resulting SXML, regardless of the abbreviated namespaces defined in ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true, ;; these namespaces are treated as if they were declared in the DTD. ;; ENTITIES: alist of SYMBOL -> STRING. ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)). ;; A DOC-PREFIX of #f indicates that it comes from the user. ;; Otherwise, prefixes are symbols. (define (munge-namespaces namespaces) (map (lambda (el) (match el ((prefix . uri-string) (cons* (and declare-namespaces? prefix) prefix (ssax:uri-string->symbol uri-string))))) namespaces)) (define (user-namespaces) (munge-namespaces namespaces)) (define (user-entities) (if (and default-entity-handler (not (assq '*DEFAULT* entities))) (acons '*DEFAULT* default-entity-handler entities) entities)) (define (name->sxml name) (match name ((prefix . local-part) (symbol-append prefix (string->symbol ":") local-part)) (_ name))) (define (doctype-continuation seed) (lambda* (#:key (entities '()) (namespaces '())) (values #f (append entities (user-entities)) (append (munge-namespaces namespaces) (user-namespaces)) seed))) ;; The SEED in this parser is the SXML: initialized to '() at each new ;; level by the fdown handlers; built in reverse by the fhere parsers; ;; and reverse-collected by the fup handlers. (define parser (ssax:make-parser NEW-LEVEL-SEED ; fdown (lambda (elem-gi attributes namespaces expected-content seed) '()) FINISH-ELEMENT ; fup (lambda (elem-gi attributes namespaces parent-seed seed) (let ((seed (if trim-whitespace? (ssax:reverse-collect-str-drop-ws seed) (ssax:reverse-collect-str seed))) (attrs (attlist-fold (lambda (attr accum) (cons (list (name->sxml (car attr)) (cdr attr)) accum)) '() attributes))) (acons (name->sxml elem-gi) (if (null? attrs) seed (cons (cons '@ attrs) seed)) parent-seed))) CHAR-DATA-HANDLER ; fhere (lambda (string1 string2 seed) (if (string-null? string2) (cons string1 seed) (cons* string2 string1 seed))) DOCTYPE ;; -> ELEMS ENTITIES NAMESPACES SEED ;; ;; ELEMS is for validation and currently unused. ;; ;; ENTITIES is an alist of parsed entities (symbol -> string). ;; ;; NAMESPACES is as above. ;; ;; SEED builds up the content. (lambda (port docname systemid internal-subset? seed) (call-with-values (lambda () (cond (doctype-handler (doctype-handler docname systemid (and internal-subset? (read-internal-doctype-as-string port)))) (else (when internal-subset? (ssax:skip-internal-dtd port)) (values)))) (doctype-continuation seed))) UNDECL-ROOT ;; This is like the DOCTYPE handler, but for documents that do not ;; have a entry. (lambda (elem-gi seed) (call-with-values (lambda () (if doctype-handler (doctype-handler #f #f #f) (values))) (doctype-continuation seed))) PI ((*DEFAULT* . (lambda (port pi-tag seed) (cons (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) seed)))))) (let* ((port (if (string? string-or-port) (open-input-string string-or-port) string-or-port)) (elements (reverse (parser port '())))) `(*TOP* ,@elements))) (define check-name (let ((*good-cache* (make-hash-table))) (lambda (name) (if (not (hashq-ref *good-cache* name)) (let* ((str (symbol->string name)) (i (string-index str #\:)) (head (or (and i (substring str 0 i)) str)) (tail (and i (substring str (1+ i))))) (and i (string-index (substring str (1+ i)) #\:) (error "Invalid QName: more than one colon" name)) (for-each (lambda (s) (and s (or (char-alphabetic? (string-ref s 0)) (eq? (string-ref s 0) #\_) (error "Invalid name starting character" s name)) (string-for-each (lambda (c) (or (char-alphabetic? c) (string-index "0123456789.-_" c) (error "Invalid name character" c s name))) s))) (list head tail)) (hashq-set! *good-cache* name #t)))))) ;; The following two functions serialize tags and attributes. They are ;; being used in the node handlers for the post-order function, see ;; below. (define (attribute-value->xml value port) (cond ((pair? value) (attribute-value->xml (car value) port) (attribute-value->xml (cdr value) port)) ((null? value) *unspecified*) ((string? value) (string->escaped-xml value port)) ((procedure? value) (with-output-to-port port value)) (else (string->escaped-xml (call-with-output-string (lambda (port) (display value port))) port)))) (define (attribute->xml attr value port) (check-name attr) (display attr port) (display "=\"" port) (attribute-value->xml value port) (display #\" port)) (define (element->xml tag attrs body port) (check-name tag) (display #\< port) (display tag port) (if attrs (let lp ((attrs attrs)) (if (pair? attrs) (let ((attr (car attrs))) (display #\space port) (if (pair? attr) (attribute->xml (car attr) (cdr attr) port) (error "bad attribute" tag attr)) (lp (cdr attrs))) (if (not (null? attrs)) (error "bad attributes" tag attrs))))) (if (pair? body) (begin (display #\> port) (let lp ((body body)) (cond ((pair? body) (sxml->xml (car body) port) (lp (cdr body))) ((null? body) (display "" port)) (else (error "bad element body" tag body))))) (display " />" port))) ;; FIXME: ensure name is valid (define (entity->xml name port) (display #\& port) (display name port) (display #\; port)) ;; FIXME: ensure tag and str are valid (define (pi->xml tag str port) (display "" port)) (define* (sxml->xml tree #:optional (port (current-output-port))) "Serialize the sxml tree @var{tree} as XML. The output will be written to the current output port, unless the optional argument @var{port} is present." (cond ((pair? tree) (if (symbol? (car tree)) ;; An element. (let ((tag (car tree))) (case tag ((*TOP*) (sxml->xml (cdr tree) port)) ((*ENTITY*) (if (and (list? (cdr tree)) (= (length (cdr tree)) 1)) (entity->xml (cadr tree) port) (error "bad *ENTITY* args" (cdr tree)))) ((*PI*) (if (and (list? (cdr tree)) (= (length (cdr tree)) 2)) (pi->xml (cadr tree) (caddr tree) port) (error "bad *PI* args" (cdr tree)))) (else (let* ((elems (cdr tree)) (attrs (and (pair? elems) (pair? (car elems)) (eq? '@ (caar elems)) (cdar elems)))) (element->xml tag attrs (if attrs (cdr elems) elems) port))))) ;; A nodelist. (for-each (lambda (x) (sxml->xml x port)) tree))) ((string? tree) (string->escaped-xml tree port)) ((null? tree) *unspecified*) ((not tree) *unspecified*) ((eqv? tree #t) *unspecified*) ((procedure? tree) (with-output-to-port port tree)) (else (string->escaped-xml (call-with-output-string (lambda (port) (display tree port))) port)))) (define (sxml->string sxml) "Detag an sxml tree @var{sxml} into a string. Does not perform any formatting." (string-concatenate-reverse (foldts (lambda (seed tree) ; fdown '()) (lambda (seed kid-seed tree) ; fup (append! kid-seed seed)) (lambda (seed tree) ; fhere (if (string? tree) (cons tree seed) seed)) '() sxml))) (define (make-char-quotator char-encoding) (let ((bad-chars (list->char-set (map car char-encoding)))) ;; Check to see if str contains one of the characters in charset, ;; from the position i onward. If so, return that character's index. ;; otherwise, return #f (define (index-cset str i charset) (string-index str charset i)) ;; The body of the function (lambda (str port) (let ((bad-pos (index-cset str 0 bad-chars))) (if (not bad-pos) (display str port) ; str had all good chars (let loop ((from 0) (to bad-pos)) (cond ((>= from (string-length str)) *unspecified*) ((not to) (display (substring str from (string-length str)) port)) (else (let ((quoted-char (cdr (assv (string-ref str to) char-encoding))) (new-to (index-cset str (+ 1 to) bad-chars))) (if (< from to) (display (substring str from to) port)) (display quoted-char port) (loop (1+ to) new-to)))))))))) ;; Given a string, check to make sure it does not contain characters ;; such as '<' or '&' that require encoding. Return either the original ;; string, or a list of string fragments with special characters ;; replaced by appropriate character entities. (define string->escaped-xml (make-char-quotator '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """)))) ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac ;;; simple.scm ends here