summaryrefslogtreecommitdiff
path: root/module/sxml/ssax.scm
blob: f750c934a94e04413f44f57af81af7c85d824ffb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
;;;; (sxml ssax) -- the SSAX parser
;;;;
;;;; 	Copyright (C) 2009, 2010,2012,2013  Free Software Foundation, Inc.
;;;;    Modified 2004 by Andy Wingo <wingo at pobox dot com>.
;;;;    Written 2001,2002,2003,2004 by Oleg Kiselyov <oleg at pobox dot com> as SSAX.scm.
;;;; 
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;; 
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;;; Lesser General Public License for more details.
;;;; 
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;; 

;;; Commentary:
;;
;@subheading Functional XML parsing framework
;@subsubheading SAX/DOM and SXML parsers with support for XML Namespaces and validation
;
; This is a package of low-to-high level lexing and parsing procedures
; that can be combined to yield a SAX, a DOM, a validating parser, or
; a parser intended for a particular document type. The procedures in
; the package can be used separately to tokenize or parse various
; pieces of XML documents. The package supports XML Namespaces,
; internal and external parsed entities, user-controlled handling of
; whitespace, and validation. This module therefore is intended to be
; a framework, a set of "Lego blocks" you can use to build a parser
; following any discipline and performing validation to any degree. As
; an example of the parser construction, this file includes a
; semi-validating SXML parser.

; The present XML framework has a "sequential" feel of SAX yet a
; "functional style" of DOM. Like a SAX parser, the framework scans the
; document only once and permits incremental processing. An application
; that handles document elements in order can run as efficiently as
; possible. @emph{Unlike} a SAX parser, the framework does not require
; an application register stateful callbacks and surrender control to
; the parser. Rather, it is the application that can drive the framework
; -- calling its functions to get the current lexical or syntax element.
; These functions do not maintain or mutate any state save the input
; port. Therefore, the framework permits parsing of XML in a pure
; functional style, with the input port being a monad (or a linear,
; read-once parameter).

; Besides the @var{port}, there is another monad -- @var{seed}. Most of
; the middle- and high-level parsers are single-threaded through the
; @var{seed}. The functions of this framework do not process or affect
; the @var{seed} in any way: they simply pass it around as an instance
; of an opaque datatype. User functions, on the other hand, can use the
; seed to maintain user's state, to accumulate parsing results, etc. A
; user can freely mix his own functions with those of the framework. On
; the other hand, the user may wish to instantiate a high-level parser:
; @code{SSAX:make-elem-parser} or @code{SSAX:make-parser}. In the latter
; case, the user must provide functions of specific signatures, which
; are called at predictable moments during the parsing: to handle
; character data, element data, or processing instructions (PI). The
; functions are always given the @var{seed}, among other parameters, and
; must return the new @var{seed}.

; From a functional point of view, XML parsing is a combined
; pre-post-order traversal of a "tree" that is the XML document
; itself. This down-and-up traversal tells the user about an element
; when its start tag is encountered. The user is notified about the
; element once more, after all element's children have been
; handled. The process of XML parsing therefore is a fold over the
; raw XML document. Unlike a fold over trees defined in [1], the
; parser is necessarily single-threaded -- obviously as elements
; in a text XML document are laid down sequentially. The parser
; therefore is a tree fold that has been transformed to accept an
; accumulating parameter [1,2].

; Formally, the denotational semantics of the parser can be expressed
; as
;@smallexample
; parser:: (Start-tag -> Seed -> Seed) ->
;	   (Start-tag -> Seed -> Seed -> Seed) ->
;	   (Char-Data -> Seed -> Seed) ->
;	   XML-text-fragment -> Seed -> Seed
; parser fdown fup fchar "<elem attrs> content </elem>" seed
;  = fup "<elem attrs>" seed
;	(parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
;
; parser fdown fup fchar "char-data content" seed
;  = parser fdown fup fchar "content" (fchar "char-data" seed)
;
; parser fdown fup fchar "elem-content content" seed
;  = parser fdown fup fchar "content" (
;	parser fdown fup fchar "elem-content" seed)
;@end smallexample

; Compare the last two equations with the left fold
;@smallexample
; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
;@end smallexample

; The real parser created by @code{SSAX:make-parser} is slightly more
; complicated, to account for processing instructions, entity
; references, namespaces, processing of document type declaration, etc.


; The XML standard document referred to in this module is
;	@uref{http://www.w3.org/TR/1998/REC-xml-19980210.html}
;
; The present file also defines a procedure that parses the text of an
; XML document or of a separate element into SXML, an S-expression-based
; model of an XML Information Set. SXML is also an Abstract Syntax Tree
; of an XML document. SXML is similar but not identical to DOM; SXML is
; particularly suitable for Scheme-based XML/HTML authoring, SXPath
; queries, and tree transformations. See SXML.html for more details.
; SXML is a term implementation of evaluation of the XML document [3].
; The other implementation is context-passing.

; The present frameworks fully supports the XML Namespaces Recommendation:
;	@uref{http://www.w3.org/TR/REC-xml-names/}
; Other links:
;@table @asis
;@item [1]
; Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
; Proc. ICFP'98, 1998, pp. 273-279.
;@item [2]
; Richard S. Bird, The promotion and accumulation strategies in
; transformational programming, ACM Trans. Progr. Lang. Systems,
; 6(4):487-504, October 1984.
;@item [3]
; Ralf Hinze, "Deriving Backtracking Monad Transformers,"
; Functional Pearl. Proc ICFP'00, pp. 186-197.
;@end table
;;
;;; Code:

(define-module (sxml ssax)
  #:use-module (sxml ssax input-parse)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-13)

  #:export     (current-ssax-error-port
                with-ssax-error-to-port
                xml-token? xml-token-kind xml-token-head
                make-empty-attlist attlist-add
                attlist-null?
                attlist-remove-top
                attlist->alist attlist-fold
                define-parsed-entity!
                reset-parsed-entity-definitions!
                ssax:uri-string->symbol
                ssax:skip-internal-dtd
                ssax:read-pi-body-as-string
                ssax:reverse-collect-str-drop-ws
                ssax:read-markup-token
                ssax:read-cdata-body
                ssax:read-char-ref
                ssax:read-attributes
                ssax:complete-start-tag
                ssax:read-external-id
                ssax:read-char-data
                ssax:xml->sxml
                ssax:make-parser
                ssax:make-pi-parser
                ssax:make-elem-parser))

(define (parser-error port message . rest)
  (apply throw 'parser-error port message rest))
(define ascii->char integer->char)
(define char->ascii char->integer)

(define current-ssax-error-port
  (make-parameter (current-error-port)))

(define *current-ssax-error-port*
  (parameter-fluid current-ssax-error-port))

(define (with-ssax-error-to-port port thunk)
  (parameterize ((current-ssax-error-port port))
    (thunk)))

(define (ssax:warn port . args)
  (with-output-to-port (current-ssax-error-port)
    (lambda ()
      (display ";;; SSAX warning: ")
      (for-each display args)
      (newline))))

(define (ucscode->string codepoint)
  (string (integer->char codepoint)))

(define char-newline #\newline)
(define char-return #\return)
(define char-tab #\tab)
(define nl "\n")

;; This isn't a great API, but a more proper fix will involve hacking
;; SSAX.
(define (reset-parsed-entity-definitions!)
  "Restore the set of parsed entity definitions to its initial state."
  (set! ssax:predefined-parsed-entities
        '((amp . "&")
          (lt . "<")
          (gt . ">")
          (apos . "'")
          (quot . "\""))))

(define (define-parsed-entity! entity str)
  "Define a new parsed entity. @var{entity} should be a symbol.

Instances of &@var{entity}; in XML text will be replaced with the
string @var{str}, which will then be parsed."
  (set! ssax:predefined-parsed-entities
        (acons entity str ssax:predefined-parsed-entities)))

;; Execute a sequence of forms and return the result of the _first_ one.
;; Like PROG1 in Lisp. Typically used to evaluate one or more forms with
;; side effects and return a value that must be computed before some or
;; all of the side effects happen.
(define-syntax begin0
  (syntax-rules ()
    ((begin0 form form1 ... ) 
      (let ((val form)) form1 ... val))))

; Like let* but allowing for multiple-value bindings
(define-syntax let*-values
  (syntax-rules ()
    ((let*-values () . bodies) (begin . bodies))
    ((let*-values (((var) initializer) . rest) . bodies)
      (let ((var initializer))		; a single var optimization
	(let*-values rest . bodies)))
    ((let*-values ((vars initializer) . rest) . bodies)
      (call-with-values (lambda () initializer) ; the most generic case
	(lambda vars (let*-values rest . bodies))))))

;; needed for some dumb reason
(define inc 1+)
(define dec 1-)

(define-syntax include-from-path/filtered
  (lambda (x)
    (define (read-filtered accept-list file)
      (with-input-from-file (%search-load-path file)
        (lambda ()
          (let loop ((sexp (read)) (out '()))
            (cond
             ((eof-object? sexp) (reverse out))
             ((and (pair? sexp) (memq (car sexp) accept-list))
              (loop (read) (cons sexp out)))
             (else
              (loop (read) out)))))))
    (syntax-case x ()
      ((_ accept-list file)
       (with-syntax (((exp ...) (datum->syntax
                                 x 
                                 (read-filtered
                                  (syntax->datum #'accept-list)
                                  (syntax->datum #'file)))))
         #'(begin exp ...))))))

(include-from-path "sxml/upstream/assert.scm")
(include-from-path/filtered
 (define define-syntax ssax:define-labeled-arg-macro)
 "sxml/upstream/SSAX.scm")