diff options
author | Andy Wingo <wingo@pobox.com> | 2013-01-27 21:56:07 +0100 |
---|---|---|
committer | Andy Wingo <wingo@pobox.com> | 2013-01-28 11:54:11 +0100 |
commit | 1488753a66d499cab55edee8ee7e2b2ea5a64717 (patch) | |
tree | a7c9f28a5015b416e9caa6441fde89bfdebc5bcc | |
parent | bb0615d0157facb67ee1489a9764866dcd97eb20 (diff) | |
download | guile-1488753a66d499cab55edee8ee7e2b2ea5a64717.tar.gz |
make (sxml simple)'s xml->sxml more capable
* module/sxml/simple.scm (xml->sxml): Add #:namespaces,
#:declare-namespaces?, #:entities, #:default-entity-handler, and
#:trim-whitespace? arguments.
* doc/ref/sxml.texi (Reading and Writing XML): Document the new
options.
-rw-r--r-- | doc/ref/sxml.texi | 110 | ||||
-rw-r--r-- | module/sxml/simple.scm | 151 |
2 files changed, 255 insertions, 6 deletions
diff --git a/doc/ref/sxml.texi b/doc/ref/sxml.texi index ab9881567..50c10ae63 100644 --- a/doc/ref/sxml.texi +++ b/doc/ref/sxml.texi @@ -55,11 +55,117 @@ to text. (use-modules (sxml simple)) @end example -@deffn {Scheme Procedure} xml->sxml [port] +@deffn {Scheme Procedure} xml->sxml [port] [#:namespaces='()] @ + [#:declare-namespaces?=#t] [#:trim-whitespace?=#f] @ + [#:entities='()] [#:default-entity-handler=#f] Use SSAX to parse an XML document into SXML. Takes one optional -argument, @var{port}, which defaults to the current input port. +argument, @var{port}, which defaults to the current input port. Returns +the resulting SXML document, and leaves @var{port} pointing at the next +available character in the port. @end deffn +As is normal in SXML, XML elements parse as tagged lists. Attributes, +if any, are placed after the tag, within an @code{@@} element. The root +of the resulting XML will be contained in a special tag, @code{*TOP*}. +This tag will contain the root element of the XML, but also any prior +processing instructions. + +@example +(xml->sxml (open-input-string "<foo/>")) +@result{} (*TOP* (foo)) +(xml->sxml (open-input-string "<foo>text</foo>")) +@result{} (*TOP* (foo "text")) +(xml->sxml (open-input-string "<foo kind=\"bar\">text</foo>")) +@result{} (*TOP* (foo (@@ (kind "bar")) "text")) +(xml->sxml (open-input-string "<?xml version=\"1.0\"?><foo/>")) +@result{} (*TOP* (*PI* xml "version=\"1.0\"") (foo)) +@end example + +All namespaces in the XML document must be declared, via @code{xmlns} +attributes. SXML elements built from non-default namespaces will have +their tags prefixed with their URI. Users can specify custom prefixes +for certain namespaces with the @code{#:namespaces} keyword argument to +@code{xml->sxml}. + +@example +(xml->sxml + (open-input-string + "<foo xmlns=\"http://example.org/ns1\">text</foo>")) +@result{} (*TOP* (http://example.org/ns1:foo "text")) +(xml->sxml + (open-input-string + "<foo xmlns=\"http://example.org/ns1\">text</foo>") + #:namespaces '((ns1 . "http://example.org/ns1"))) +@result{} (*TOP* (ns1:foo "text")) +(xml->sxml + (open-input-string + "<foo xmlns:bar=\"http://example.org/ns2\"><bar:baz/></foo>") + #:namespaces '((ns2 . "http://example.org/ns2"))) +@result{} (*TOP* (foo (ns2:baz))) +@end example + +Passing a true @code{#:declare-namespaces?} argument will cause the +user-given @code{#:namespaces} to be treated as if they were declared on +the root element. + +@example +(xml->sxml (open-input-string "<foo><ns2:baz/></foo>") + #:namespaces '((ns2 . "http://example.org/ns2"))) +@result{} error: undeclared namespace: `bar' +(xml->sxml (open-input-string "<foo><ns2:baz/></foo>") + #:namespaces '((ns2 . "http://example.org/ns2")) + #:declare-namespaces? #t) +@result{} (*TOP* (foo (ns2:baz))) +@end example + +By default, all whitespace in XML is significant. Passing the +@code{#:trim-whitespace?} keyword argument to @code{xml->sxml} will trim +whitespace in front, behind and between elements, treating it as +``unsignificant''. Whitespace in text fragments is left alone. + +@example +(xml->sxml (open-input-string + "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>")) +@result{} (*TOP* (foo "\n" (bar " Alfie the parrot! ") "\n") +(xml->sxml (open-input-string + "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>") + #:trim-whitespace? #t) +@result{} (*TOP* (foo (bar " Alfie the parrot! ")) +@end example + +Parsed entities may be declared with the @code{#:entities} keyword +argument, or handled with the @code{#:default-entity-handler}. By +default, only the standard @code{<}, @code{>}, @code{&}, +@code{'} and @code{"} entities are defined, as well as the +@code{&#@var{N};} and @code{&#x@var{N};} (decimal and hexadecimal) +numeric character entities. + +@example +(xml->sxml (open-input-string "<foo>&</foo>")) +@result{} (*TOP* (foo "&")) +(xml->sxml (open-input-string "<foo> </foo>")) +@result{} error: undefined entity: nbsp +(xml->sxml (open-input-string "<foo> </foo>")) +@result{} (*TOP* (foo "\xa0")) +(xml->sxml (open-input-string "<foo> </foo>") + #:entities '((nbsp . "\xa0"))) +@result{} (*TOP* (foo "\xa0")) +(xml->sxml (open-input-string "<foo> &foo;</foo>") + #:default-entity-handler + (lambda (port name) + (case name + ((nbsp) "\xa0") + (else + (format (current-warning-port) + "~a:~a:~a: undefined entitity: ~a\n" + (or (port-filename port) "<unknown file>") + (port-line port) (port-column port) + name) + (symbol->string name))))) +@print{} <unknown file>:0:17: undefined entitity: foo +@result{} (*TOP* (foo "\xa0 foo")) +@end example + @deffn {Scheme Procedure} sxml->xml tree [port] Serialize the SXML tree @var{tree} as XML. The output will be written to the current output port, unless the optional argument @var{port} is diff --git a/module/sxml/simple.scm b/module/sxml/simple.scm index dcef3b2cb..4d06ff687 100644 --- a/module/sxml/simple.scm +++ b/module/sxml/simple.scm @@ -1,6 +1,6 @@ ;;;; (sxml simple) -- a simple interface to the SSAX parser ;;;; -;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc. +;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc. ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm. ;;;; @@ -28,14 +28,157 @@ (define-module (sxml simple) #:use-module (sxml ssax) #:use-module (sxml transform) - #:use-module (ice-9 optargs) + #:use-module (ice-9 match) #:use-module (srfi srfi-13) #:export (xml->sxml sxml->xml sxml->string)) -(define* (xml->sxml #:optional (port (current-input-port))) +;; Helpers from upstream/SSAX.scm. +;; + +(define (ssax:warn port msg . args) + (format (current-ssax-error-port) + ";;; SSAX warning: ~a ~a\n" msg args)) + +; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS +; given the list of fragments (some of which are text strings) +; reverse the list and concatenate adjacent text strings. +; We can prove from the general case below that if LIST-OF-FRAGS +; has zero or one element, the result of the procedure is equal? +; to its argument. This fact justifies the shortcut evaluation below. +(define (ssax:reverse-collect-str fragments) + (cond + ((null? fragments) '()) ; a shortcut + ((null? (cdr fragments)) fragments) ; see the comment above + (else + (let loop ((fragments fragments) (result '()) (strs '())) + (cond + ((null? fragments) + (if (null? strs) result + (cons (string-concatenate/shared strs) result))) + ((string? (car fragments)) + (loop (cdr fragments) result (cons (car fragments) strs))) + (else + (loop (cdr fragments) + (cons + (car fragments) + (if (null? strs) result + (cons (string-concatenate/shared strs) result))) + '()))))))) + +;; Ideas for the future for this interface: +;; +;; * Allow doctypes to provide parsed entities +;; +;; * Allow validation (the ELEMENTS value from the DOCTYPE handler +;; below) +;; +;; * Parse internal DTDs +;; +;; * Parse external DTDs +;; +(define* (xml->sxml #:optional (port (current-input-port)) #:key + (namespaces '()) + (declare-namespaces? #t) + (trim-whitespace? #f) + (entities '()) + (default-entity-handler #f)) "Use SSAX to parse an XML document into SXML. Takes one optional argument, @var{port}, which defaults to the current input port." - (ssax:xml->sxml port '())) + ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix + ;; that the user wants on elements of a given namespace in the + ;; resulting SXML, regardless of the abbreviated namespaces defined in + ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true, + ;; these namespaces are treated as if they were declared in the DTD. + + ;; ENTITIES: alist of SYMBOL -> STRING. + + ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)). + ;; A DOC-PREFIX of #f indicates that it comes from the user. + ;; Otherwise, prefixes are symbols. + (define (user-namespaces) + (map (lambda (el) + (match el + ((prefix . uri-string) + (cons* (and declare-namespaces? prefix) + prefix + (ssax:uri-string->symbol uri-string))))) + namespaces)) + + (define (user-entities) + (if (and default-entity-handler + (not (assq '*DEFAULT* entities))) + (acons '*DEFAULT* default-entity-handler entities) + entities)) + + (define (name->sxml name) + (match name + ((prefix . local-part) + (symbol-append prefix (string->symbol ":") local-part)) + (_ name))) + + ;; The SEED in this parser is the SXML: initialized to '() at each new + ;; level by the fdown handlers; built in reverse by the fhere parsers; + ;; and reverse-collected by the fup handlers. + (define parser + (ssax:make-parser + NEW-LEVEL-SEED ; fdown + (lambda (elem-gi attributes namespaces expected-content seed) + '()) + + FINISH-ELEMENT ; fup + (lambda (elem-gi attributes namespaces parent-seed seed) + (let ((seed (if trim-whitespace? + (ssax:reverse-collect-str-drop-ws seed) + (ssax:reverse-collect-str seed))) + (attrs (attlist-fold + (lambda (attr accum) + (cons (list (name->sxml (car attr)) (cdr attr)) + accum)) + '() attributes))) + (acons (name->sxml elem-gi) + (if (null? attrs) + seed + (cons (cons '@ attrs) seed)) + parent-seed))) + + CHAR-DATA-HANDLER ; fhere + (lambda (string1 string2 seed) + (if (string-null? string2) + (cons string1 seed) + (cons* string2 string1 seed))) + + DOCTYPE + ;; -> ELEMS ENTITIES NAMESPACES SEED + ;; + ;; ELEMS is for validation and currently unused. + ;; + ;; ENTITIES is an alist of parsed entities (symbol -> string). + ;; + ;; NAMESPACES is as above. + ;; + ;; SEED builds up the content. + (lambda (port docname systemid internal-subset? seed) + (when internal-subset? + (ssax:warn port "Internal DTD subset is not currently handled ") + (ssax:skip-internal-dtd port)) + (ssax:warn port "DOCTYPE DECL " docname " " + systemid " found and skipped") + (values #f (user-entities) (user-namespaces) seed)) + + UNDECL-ROOT + ;; This is like the DOCTYPE handler, but for documents that do not + ;; have a <!DOCTYPE!> entry. + (lambda (elem-gi seed) + (values #f (user-entities) (user-namespaces) seed)) + + PI + ((*DEFAULT* + . (lambda (port pi-tag seed) + (cons + (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) + seed)))))) + + `(*TOP* ,@(reverse (parser port '())))) (define check-name (let ((*good-cache* (make-hash-table))) |