summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndy Wingo <wingo@pobox.com>2013-01-27 21:56:07 +0100
committerAndy Wingo <wingo@pobox.com>2013-01-28 11:54:11 +0100
commit1488753a66d499cab55edee8ee7e2b2ea5a64717 (patch)
treea7c9f28a5015b416e9caa6441fde89bfdebc5bcc
parentbb0615d0157facb67ee1489a9764866dcd97eb20 (diff)
downloadguile-1488753a66d499cab55edee8ee7e2b2ea5a64717.tar.gz
make (sxml simple)'s xml->sxml more capable
* module/sxml/simple.scm (xml->sxml): Add #:namespaces, #:declare-namespaces?, #:entities, #:default-entity-handler, and #:trim-whitespace? arguments. * doc/ref/sxml.texi (Reading and Writing XML): Document the new options.
-rw-r--r--doc/ref/sxml.texi110
-rw-r--r--module/sxml/simple.scm151
2 files changed, 255 insertions, 6 deletions
diff --git a/doc/ref/sxml.texi b/doc/ref/sxml.texi
index ab9881567..50c10ae63 100644
--- a/doc/ref/sxml.texi
+++ b/doc/ref/sxml.texi
@@ -55,11 +55,117 @@ to text.
(use-modules (sxml simple))
@end example
-@deffn {Scheme Procedure} xml->sxml [port]
+@deffn {Scheme Procedure} xml->sxml [port] [#:namespaces='()] @
+ [#:declare-namespaces?=#t] [#:trim-whitespace?=#f] @
+ [#:entities='()] [#:default-entity-handler=#f]
Use SSAX to parse an XML document into SXML. Takes one optional
-argument, @var{port}, which defaults to the current input port.
+argument, @var{port}, which defaults to the current input port. Returns
+the resulting SXML document, and leaves @var{port} pointing at the next
+available character in the port.
@end deffn
+As is normal in SXML, XML elements parse as tagged lists. Attributes,
+if any, are placed after the tag, within an @code{@@} element. The root
+of the resulting XML will be contained in a special tag, @code{*TOP*}.
+This tag will contain the root element of the XML, but also any prior
+processing instructions.
+
+@example
+(xml->sxml (open-input-string "<foo/>"))
+@result{} (*TOP* (foo))
+(xml->sxml (open-input-string "<foo>text</foo>"))
+@result{} (*TOP* (foo "text"))
+(xml->sxml (open-input-string "<foo kind=\"bar\">text</foo>"))
+@result{} (*TOP* (foo (@@ (kind "bar")) "text"))
+(xml->sxml (open-input-string "<?xml version=\"1.0\"?><foo/>"))
+@result{} (*TOP* (*PI* xml "version=\"1.0\"") (foo))
+@end example
+
+All namespaces in the XML document must be declared, via @code{xmlns}
+attributes. SXML elements built from non-default namespaces will have
+their tags prefixed with their URI. Users can specify custom prefixes
+for certain namespaces with the @code{#:namespaces} keyword argument to
+@code{xml->sxml}.
+
+@example
+(xml->sxml
+ (open-input-string
+ "<foo xmlns=\"http://example.org/ns1\">text</foo>"))
+@result{} (*TOP* (http://example.org/ns1:foo "text"))
+(xml->sxml
+ (open-input-string
+ "<foo xmlns=\"http://example.org/ns1\">text</foo>")
+ #:namespaces '((ns1 . "http://example.org/ns1")))
+@result{} (*TOP* (ns1:foo "text"))
+(xml->sxml
+ (open-input-string
+ "<foo xmlns:bar=\"http://example.org/ns2\"><bar:baz/></foo>")
+ #:namespaces '((ns2 . "http://example.org/ns2")))
+@result{} (*TOP* (foo (ns2:baz)))
+@end example
+
+Passing a true @code{#:declare-namespaces?} argument will cause the
+user-given @code{#:namespaces} to be treated as if they were declared on
+the root element.
+
+@example
+(xml->sxml (open-input-string "<foo><ns2:baz/></foo>")
+ #:namespaces '((ns2 . "http://example.org/ns2")))
+@result{} error: undeclared namespace: `bar'
+(xml->sxml (open-input-string "<foo><ns2:baz/></foo>")
+ #:namespaces '((ns2 . "http://example.org/ns2"))
+ #:declare-namespaces? #t)
+@result{} (*TOP* (foo (ns2:baz)))
+@end example
+
+By default, all whitespace in XML is significant. Passing the
+@code{#:trim-whitespace?} keyword argument to @code{xml->sxml} will trim
+whitespace in front, behind and between elements, treating it as
+``unsignificant''. Whitespace in text fragments is left alone.
+
+@example
+(xml->sxml (open-input-string
+ "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>"))
+@result{} (*TOP* (foo "\n" (bar " Alfie the parrot! ") "\n")
+(xml->sxml (open-input-string
+ "<foo>\n<bar> Alfie the parrot! </bar>\n</foo>")
+ #:trim-whitespace? #t)
+@result{} (*TOP* (foo (bar " Alfie the parrot! "))
+@end example
+
+Parsed entities may be declared with the @code{#:entities} keyword
+argument, or handled with the @code{#:default-entity-handler}. By
+default, only the standard @code{&lt;}, @code{&gt;}, @code{&amp;},
+@code{&apos;} and @code{&quot;} entities are defined, as well as the
+@code{&#@var{N};} and @code{&#x@var{N};} (decimal and hexadecimal)
+numeric character entities.
+
+@example
+(xml->sxml (open-input-string "<foo>&amp;</foo>"))
+@result{} (*TOP* (foo "&"))
+(xml->sxml (open-input-string "<foo>&nbsp;</foo>"))
+@result{} error: undefined entity: nbsp
+(xml->sxml (open-input-string "<foo>&#xA0;</foo>"))
+@result{} (*TOP* (foo "\xa0"))
+(xml->sxml (open-input-string "<foo>&nbsp;</foo>")
+ #:entities '((nbsp . "\xa0")))
+@result{} (*TOP* (foo "\xa0"))
+(xml->sxml (open-input-string "<foo>&nbsp; &foo;</foo>")
+ #:default-entity-handler
+ (lambda (port name)
+ (case name
+ ((nbsp) "\xa0")
+ (else
+ (format (current-warning-port)
+ "~a:~a:~a: undefined entitity: ~a\n"
+ (or (port-filename port) "<unknown file>")
+ (port-line port) (port-column port)
+ name)
+ (symbol->string name)))))
+@print{} <unknown file>:0:17: undefined entitity: foo
+@result{} (*TOP* (foo "\xa0 foo"))
+@end example
+
@deffn {Scheme Procedure} sxml->xml tree [port]
Serialize the SXML tree @var{tree} as XML. The output will be written to
the current output port, unless the optional argument @var{port} is
diff --git a/module/sxml/simple.scm b/module/sxml/simple.scm
index dcef3b2cb..4d06ff687 100644
--- a/module/sxml/simple.scm
+++ b/module/sxml/simple.scm
@@ -1,6 +1,6 @@
;;;; (sxml simple) -- a simple interface to the SSAX parser
;;;;
-;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc.
+;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
;;;;
@@ -28,14 +28,157 @@
(define-module (sxml simple)
#:use-module (sxml ssax)
#:use-module (sxml transform)
- #:use-module (ice-9 optargs)
+ #:use-module (ice-9 match)
#:use-module (srfi srfi-13)
#:export (xml->sxml sxml->xml sxml->string))
-(define* (xml->sxml #:optional (port (current-input-port)))
+;; Helpers from upstream/SSAX.scm.
+;;
+
+(define (ssax:warn port msg . args)
+ (format (current-ssax-error-port)
+ ";;; SSAX warning: ~a ~a\n" msg args))
+
+; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
+; given the list of fragments (some of which are text strings)
+; reverse the list and concatenate adjacent text strings.
+; We can prove from the general case below that if LIST-OF-FRAGS
+; has zero or one element, the result of the procedure is equal?
+; to its argument. This fact justifies the shortcut evaluation below.
+(define (ssax:reverse-collect-str fragments)
+ (cond
+ ((null? fragments) '()) ; a shortcut
+ ((null? (cdr fragments)) fragments) ; see the comment above
+ (else
+ (let loop ((fragments fragments) (result '()) (strs '()))
+ (cond
+ ((null? fragments)
+ (if (null? strs) result
+ (cons (string-concatenate/shared strs) result)))
+ ((string? (car fragments))
+ (loop (cdr fragments) result (cons (car fragments) strs)))
+ (else
+ (loop (cdr fragments)
+ (cons
+ (car fragments)
+ (if (null? strs) result
+ (cons (string-concatenate/shared strs) result)))
+ '())))))))
+
+;; Ideas for the future for this interface:
+;;
+;; * Allow doctypes to provide parsed entities
+;;
+;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
+;; below)
+;;
+;; * Parse internal DTDs
+;;
+;; * Parse external DTDs
+;;
+(define* (xml->sxml #:optional (port (current-input-port)) #:key
+ (namespaces '())
+ (declare-namespaces? #t)
+ (trim-whitespace? #f)
+ (entities '())
+ (default-entity-handler #f))
"Use SSAX to parse an XML document into SXML. Takes one optional
argument, @var{port}, which defaults to the current input port."
- (ssax:xml->sxml port '()))
+ ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
+ ;; that the user wants on elements of a given namespace in the
+ ;; resulting SXML, regardless of the abbreviated namespaces defined in
+ ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
+ ;; these namespaces are treated as if they were declared in the DTD.
+
+ ;; ENTITIES: alist of SYMBOL -> STRING.
+
+ ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
+ ;; A DOC-PREFIX of #f indicates that it comes from the user.
+ ;; Otherwise, prefixes are symbols.
+ (define (user-namespaces)
+ (map (lambda (el)
+ (match el
+ ((prefix . uri-string)
+ (cons* (and declare-namespaces? prefix)
+ prefix
+ (ssax:uri-string->symbol uri-string)))))
+ namespaces))
+
+ (define (user-entities)
+ (if (and default-entity-handler
+ (not (assq '*DEFAULT* entities)))
+ (acons '*DEFAULT* default-entity-handler entities)
+ entities))
+
+ (define (name->sxml name)
+ (match name
+ ((prefix . local-part)
+ (symbol-append prefix (string->symbol ":") local-part))
+ (_ name)))
+
+ ;; The SEED in this parser is the SXML: initialized to '() at each new
+ ;; level by the fdown handlers; built in reverse by the fhere parsers;
+ ;; and reverse-collected by the fup handlers.
+ (define parser
+ (ssax:make-parser
+ NEW-LEVEL-SEED ; fdown
+ (lambda (elem-gi attributes namespaces expected-content seed)
+ '())
+
+ FINISH-ELEMENT ; fup
+ (lambda (elem-gi attributes namespaces parent-seed seed)
+ (let ((seed (if trim-whitespace?
+ (ssax:reverse-collect-str-drop-ws seed)
+ (ssax:reverse-collect-str seed)))
+ (attrs (attlist-fold
+ (lambda (attr accum)
+ (cons (list (name->sxml (car attr)) (cdr attr))
+ accum))
+ '() attributes)))
+ (acons (name->sxml elem-gi)
+ (if (null? attrs)
+ seed
+ (cons (cons '@ attrs) seed))
+ parent-seed)))
+
+ CHAR-DATA-HANDLER ; fhere
+ (lambda (string1 string2 seed)
+ (if (string-null? string2)
+ (cons string1 seed)
+ (cons* string2 string1 seed)))
+
+ DOCTYPE
+ ;; -> ELEMS ENTITIES NAMESPACES SEED
+ ;;
+ ;; ELEMS is for validation and currently unused.
+ ;;
+ ;; ENTITIES is an alist of parsed entities (symbol -> string).
+ ;;
+ ;; NAMESPACES is as above.
+ ;;
+ ;; SEED builds up the content.
+ (lambda (port docname systemid internal-subset? seed)
+ (when internal-subset?
+ (ssax:warn port "Internal DTD subset is not currently handled ")
+ (ssax:skip-internal-dtd port))
+ (ssax:warn port "DOCTYPE DECL " docname " "
+ systemid " found and skipped")
+ (values #f (user-entities) (user-namespaces) seed))
+
+ UNDECL-ROOT
+ ;; This is like the DOCTYPE handler, but for documents that do not
+ ;; have a <!DOCTYPE!> entry.
+ (lambda (elem-gi seed)
+ (values #f (user-entities) (user-namespaces) seed))
+
+ PI
+ ((*DEFAULT*
+ . (lambda (port pi-tag seed)
+ (cons
+ (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
+ seed))))))
+
+ `(*TOP* ,@(reverse (parser port '()))))
(define check-name
(let ((*good-cache* (make-hash-table)))