summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndy Wingo <wingo@pobox.com>2009-12-21 00:01:13 +0100
committerAndy Wingo <wingo@pobox.com>2009-12-21 00:01:13 +0100
commit47f3ce525efcf2aa30abbae0374c19c9f8960789 (patch)
tree7bb9ee62708bd858c2f272511495f93db21c0f71
parentc66fe8a9a0a6b5106c94a4ce6eeda551275dc2bc (diff)
downloadguile-47f3ce525efcf2aa30abbae0374c19c9f8960789.tar.gz
import statprof, sxml, and texinfo from guile-lib
* module/Makefile.am (LIB_SOURCES): Add statprof, sxml, and texinfo to the build. (NOCOMP_SOURCES): Reindent, and add the upstream SSAX files. * module/statprof.scm: * module/sxml/apply-templates.scm: * module/sxml/fold.scm: * module/sxml/simple.scm: * module/sxml/ssax.scm: * module/sxml/ssax/input-parse.scm: * module/sxml/transform.scm: * module/sxml/upstream/COPYING.SSAX: * module/sxml/upstream/SSAX.scm: * module/sxml/upstream/SXML-tree-trans.scm: * module/sxml/upstream/SXPath-old.scm: * module/sxml/upstream/assert.scm: * module/sxml/upstream/input-parse.scm: * module/sxml/xpath.scm: * module/texinfo.scm: * module/texinfo/docbook.scm: * module/texinfo/html.scm: * module/texinfo/indexing.scm: * module/texinfo/plain-text.scm: * module/texinfo/reflection.scm: * module/texinfo/serialize.scm: * module/texinfo/string-utils.scm: Add files from guile-lib to Guile. It's only Richard, Andreas, Rob, and myself that have copyright on these, and we have all assigned to the FSF. SSAX itself is in the public domain.
-rw-r--r--module/Makefile.am54
-rw-r--r--module/statprof.scm688
-rw-r--r--module/sxml/apply-templates.scm102
-rw-r--r--module/sxml/fold.scm259
-rw-r--r--module/sxml/simple.scm169
-rw-r--r--module/sxml/ssax.scm246
-rw-r--r--module/sxml/ssax/input-parse.scm180
-rw-r--r--module/sxml/transform.scm298
-rw-r--r--module/sxml/upstream/COPYING.SSAX2
-rw-r--r--module/sxml/upstream/SSAX.scm3212
-rw-r--r--module/sxml/upstream/SXML-tree-trans.scm249
-rw-r--r--module/sxml/upstream/SXPath-old.scm1216
-rw-r--r--module/sxml/upstream/assert.scm35
-rw-r--r--module/sxml/upstream/input-parse.scm326
-rw-r--r--module/sxml/xpath.scm493
-rw-r--r--module/texinfo.scm1217
-rw-r--r--module/texinfo/docbook.scm233
-rw-r--r--module/texinfo/html.scm259
-rw-r--r--module/texinfo/indexing.scm78
-rw-r--r--module/texinfo/plain-text.scm319
-rw-r--r--module/texinfo/reflection.scm528
-rw-r--r--module/texinfo/serialize.scm263
-rw-r--r--module/texinfo/string-utils.scm400
23 files changed, 10811 insertions, 15 deletions
diff --git a/module/Makefile.am b/module/Makefile.am
index 21c36243a..54ceca917 100644
--- a/module/Makefile.am
+++ b/module/Makefile.am
@@ -63,7 +63,8 @@ SOURCES = \
$(SCRIPTS_SOURCES) \
$(ECMASCRIPT_LANG_SOURCES) \
$(ELISP_LANG_SOURCES) \
- $(BRAINFUCK_LANG_SOURCES)
+ $(BRAINFUCK_LANG_SOURCES) \
+ $(LIB_SOURCES)
## test.scm is not currently installed.
EXTRA_DIST += \
@@ -283,20 +284,43 @@ SYSTEM_SOURCES = \
system/repl/repl.scm system/repl/common.scm \
system/repl/command.scm
+LIB_SOURCES = \
+ statprof.scm \
+ sxml/apply-templates.scm \
+ sxml/fold.scm \
+ sxml/simple.scm \
+ sxml/ssax/input-parse.scm \
+ sxml/ssax.scm \
+ sxml/transform.scm \
+ sxml/xpath.scm \
+ texinfo.scm \
+ texinfo/docbook.scm \
+ texinfo/html.scm \
+ texinfo/indexing.scm \
+ texinfo/string-utils.scm \
+ texinfo/plain-text.scm \
+ texinfo/reflection.scm \
+ texinfo/serialize.scm
+
EXTRA_DIST += oop/ChangeLog-2008
NOCOMP_SOURCES = \
- ice-9/gds-client.scm \
- ice-9/psyntax.scm \
- ice-9/quasisyntax.scm \
- system/repl/describe.scm \
- ice-9/debugger/command-loop.scm \
- ice-9/debugger/commands.scm \
- ice-9/debugger/state.scm \
- ice-9/debugger/trc.scm \
- ice-9/debugger/utils.scm \
- ice-9/debugging/example-fns.scm \
- ice-9/debugging/steps.scm \
- ice-9/debugging/trace.scm \
- ice-9/debugging/traps.scm \
- ice-9/debugging/trc.scm
+ ice-9/gds-client.scm \
+ ice-9/psyntax.scm \
+ ice-9/quasisyntax.scm \
+ system/repl/describe.scm \
+ ice-9/debugger/command-loop.scm \
+ ice-9/debugger/commands.scm \
+ ice-9/debugger/state.scm \
+ ice-9/debugger/trc.scm \
+ ice-9/debugger/utils.scm \
+ ice-9/debugging/example-fns.scm \
+ ice-9/debugging/steps.scm \
+ ice-9/debugging/trace.scm \
+ ice-9/debugging/traps.scm \
+ ice-9/debugging/trc.scm \
+ sxml/upstream/SSAX.scm \
+ sxml/upstream/SXML-tree-trans.scm \
+ sxml/upstream/SXPath-old.scm \
+ sxml/upstream/assert.scm \
+ sxml/upstream/input-parse.scm
diff --git a/module/statprof.scm b/module/statprof.scm
new file mode 100644
index 000000000..f021778a9
--- /dev/null
+++ b/module/statprof.scm
@@ -0,0 +1,688 @@
+;;;; (statprof) -- a statistical profiler for Guile
+;;;; -*-scheme-*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com>
+;;;; Copyright (C) 2001 Rob Browning <rlb at defaultvalue dot org>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+
+;;; Commentary:
+;;
+;;@code{(statprof)} is intended to be a fairly simple
+;;statistical profiler for guile. It is in the early stages yet, so
+;;consider its output still suspect, and please report any bugs to
+;;@email{guile-devel at gnu.org}, or to me directly at @email{rlb at
+;;defaultvalue.org}.
+;;
+;;A simple use of statprof would look like this:
+;;
+;;@example
+;; (statprof-reset 0 50000 #t)
+;; (statprof-start)
+;; (do-something)
+;; (statprof-stop)
+;; (statprof-display)
+;;@end example
+;;
+;;This would reset statprof, clearing all accumulated statistics, then
+;;start profiling, run some code, stop profiling, and finally display a
+;;gprof flat-style table of statistics which will look something like
+;;this:
+;;
+;;@example
+;; % cumulative self self total
+;; time seconds seconds calls ms/call ms/call name
+;; 35.29 0.23 0.23 2002 0.11 0.11 -
+;; 23.53 0.15 0.15 2001 0.08 0.08 positive?
+;; 23.53 0.15 0.15 2000 0.08 0.08 +
+;; 11.76 0.23 0.08 2000 0.04 0.11 do-nothing
+;; 5.88 0.64 0.04 2001 0.02 0.32 loop
+;; 0.00 0.15 0.00 1 0.00 150.59 do-something
+;; ...
+;;@end example
+;;
+;;All of the numerical data with the exception of the calls column is
+;;statistically approximate. In the following column descriptions, and
+;;in all of statprof, "time" refers to execution time (both user and
+;;system), not wall clock time.
+;;
+;;@table @asis
+;;@item % time
+;;The percent of the time spent inside the procedure itself
+;;(not counting children).
+;;@item cumulative seconds
+;;The total number of seconds spent in the procedure, including
+;;children.
+;;@item self seconds
+;;The total number of seconds spent in the procedure itself (not counting
+;;children).
+;;@item calls
+;;The total number of times the procedure was called.
+;;@item self ms/call
+;;The average time taken by the procedure itself on each call, in ms.
+;;@item total ms/call
+;;The average time taken by each call to the procedure, including time
+;;spent in child functions.
+;;@item name
+;;The name of the procedure.
+;;@end table
+;;
+;;The profiler uses @code{eq?} and the procedure object itself to
+;;identify the procedures, so it won't confuse different procedures with
+;;the same name. They will show up as two different rows in the output.
+;;
+;;Right now the profiler is quite simplistic. I cannot provide
+;;call-graphs or other higher level information. What you see in the
+;;table is pretty much all there is. Patches are welcome :-)
+;;
+;;@section Implementation notes
+;;
+;;The profiler works by setting the unix profiling signal
+;;@code{ITIMER_PROF} to go off after the interval you define in the call
+;;to @code{statprof-reset}. When the signal fires, a sampling routine is
+;;run which looks at the current procedure that's executing, and then
+;;crawls up the stack, and for each procedure encountered, increments
+;;that procedure's sample count. Note that if a procedure is encountered
+;;multiple times on a given stack, it is only counted once. After the
+;;sampling is complete, the profiler resets profiling timer to fire
+;;again after the appropriate interval.
+;;
+;;Meanwhile, the profiler keeps track, via @code{get-internal-run-time},
+;;how much CPU time (system and user -- which is also what
+;;@code{ITIMER_PROF} tracks), has elapsed while code has been executing
+;;within a statprof-start/stop block.
+;;
+;;The profiler also tries to avoid counting or timing its own code as
+;;much as possible.
+;;
+;;; Code:
+
+;; When you add new features, please also add tests to ./tests/ if you
+;; have time, and then add the new files to ./run-tests. Also, if
+;; anyone's bored, there are a lot of existing API bits that don't
+;; have tests yet.
+
+;; TODO
+;;
+;; Check about profiling C functions -- does profiling primitives work?
+;; Also look into stealing code from qprof so we can sample the C stack
+;; Call graphs?
+
+(define-module (statprof)
+ #:use-module (srfi srfi-1)
+ #:autoload (ice-9 format) (format)
+ #:export (statprof-active?
+ statprof-start
+ statprof-stop
+ statprof-reset
+
+ statprof-accumulated-time
+ statprof-sample-count
+ statprof-fold-call-data
+ statprof-proc-call-data
+ statprof-call-data-name
+ statprof-call-data-calls
+ statprof-call-data-cum-samples
+ statprof-call-data-self-samples
+ statprof-call-data->stats
+
+ statprof-stats-proc-name
+ statprof-stats-%-time-in-proc
+ statprof-stats-cum-secs-in-proc
+ statprof-stats-self-secs-in-proc
+ statprof-stats-calls
+ statprof-stats-self-secs-per-call
+ statprof-stats-cum-secs-per-call
+
+ statprof-display
+ statprof-display-anomolies
+
+ statprof-fetch-stacks
+ statprof-fetch-call-tree
+
+ with-statprof))
+
+
+;; This profiler tracks two numbers for every function called while
+;; it's active. It tracks the total number of calls, and the number
+;; of times the function was active when the sampler fired.
+;;
+;; Globally the profiler tracks the total time elapsed and the number
+;; of times the sampler was fired.
+;;
+;; Right now, this profiler is not per-thread and is not thread safe.
+
+(define accumulated-time #f) ; total so far.
+(define last-start-time #f) ; start-time when timer is active.
+(define sample-count #f) ; total count of sampler calls.
+(define sampling-frequency #f) ; in (seconds . microseconds)
+(define remaining-prof-time #f) ; time remaining when prof suspended.
+(define profile-level 0) ; for user start/stop nesting.
+(define %count-calls? #t) ; whether to catch apply-frame.
+(define gc-time-taken 0) ; gc time between statprof-start and
+ ; statprof-stop.
+(define record-full-stacks? #f) ; if #t, stash away the stacks
+ ; for later analysis.
+(define stacks '())
+
+;; procedure-data will be a hash where the key is the function object
+;; itself and the value is the data. The data will be a vector like
+;; this: #(name call-count cum-sample-count self-sample-count)
+(define procedure-data #f)
+
+;; If you change the call-data data structure, you need to also change
+;; sample-uncount-frame.
+(define (make-call-data name call-count cum-sample-count self-sample-count)
+ (vector (or name (error "internal error (we don't count anonymous procs)"))
+ call-count cum-sample-count self-sample-count))
+(define (call-data-name cd) (vector-ref cd 0))
+(define (call-data-call-count cd) (vector-ref cd 1))
+(define (call-data-cum-sample-count cd) (vector-ref cd 2))
+(define (call-data-self-sample-count cd) (vector-ref cd 3))
+
+(define (set-call-data-name! cd name)
+ (vector-set! cd 0 name))
+(define (inc-call-data-call-count! cd)
+ (vector-set! cd 1 (1+ (vector-ref cd 1))))
+(define (inc-call-data-cum-sample-count! cd)
+ (vector-set! cd 2 (1+ (vector-ref cd 2))))
+(define (inc-call-data-self-sample-count! cd)
+ (vector-set! cd 3 (1+ (vector-ref cd 3))))
+
+(define-macro (accumulate-time stop-time)
+ `(set! accumulated-time
+ (+ accumulated-time 0.0 (- ,stop-time last-start-time))))
+
+(define (get-call-data proc)
+ (or (hashq-ref procedure-data proc)
+ (let ((call-data (make-call-data (procedure-name proc) 0 0 0)))
+ (hashq-set! procedure-data proc call-data)
+ call-data)))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; SIGPROF handler
+
+(define (sample-stack-procs stack)
+ (let ((stacklen (stack-length stack))
+ (hit-count-call? #f))
+
+ (if record-full-stacks?
+ (set! stacks (cons stack stacks)))
+
+ (set! sample-count (+ sample-count 1))
+ ;; Now accumulate stats for the whole stack.
+ (let loop ((frame (stack-ref stack 0))
+ (procs-seen (make-hash-table 13))
+ (self #f))
+ (cond
+ ((not frame)
+ (hash-fold
+ (lambda (proc val accum)
+ (inc-call-data-cum-sample-count!
+ (get-call-data proc)))
+ #f
+ procs-seen)
+ (and=> (and=> self get-call-data)
+ inc-call-data-self-sample-count!))
+ ((frame-procedure frame)
+ => (lambda (proc)
+ (cond
+ ((eq? proc count-call)
+ ;; We're not supposed to be sampling count-call and
+ ;; its sub-functions, so loop again with a clean
+ ;; slate.
+ (set! hit-count-call? #t)
+ (loop (frame-previous frame) (make-hash-table 13) #f))
+ ((procedure-name proc)
+ (hashq-set! procs-seen proc #t)
+ (loop (frame-previous frame)
+ procs-seen
+ (or self proc)))
+ (else
+ (loop (frame-previous frame) procs-seen self)))))
+ (else
+ (loop (frame-previous frame) procs-seen self))))
+ hit-count-call?))
+
+(define inside-profiler? #f)
+
+(define (profile-signal-handler sig)
+ (set! inside-profiler? #t)
+
+ ;; FIXME: with-statprof should be able to set an outer frame for the
+ ;; stack cut
+ (if (positive? profile-level)
+ (let* ((stop-time (get-internal-run-time))
+ ;; cut down to the signal handler, then we rely on
+ ;; knowledge of guile: it dispatches signal handlers
+ ;; through a thunk, so cut one more procedure
+ (stack (make-stack #t profile-signal-handler 0 1))
+ (inside-apply-trap? (sample-stack-procs stack)))
+
+ (if (not inside-apply-trap?)
+ (begin
+ ;; disabling here is just a little more efficient, but
+ ;; not necessary given inside-profiler?. We can't just
+ ;; disable unconditionally at the top of this function
+ ;; and eliminate inside-profiler? because it seems to
+ ;; confuse guile wrt re-enabling the trap when
+ ;; count-call finishes.
+ (if %count-calls? (trap-disable 'apply-frame))
+ (accumulate-time stop-time)))
+
+ (setitimer ITIMER_PROF
+ 0 0
+ (car sampling-frequency)
+ (cdr sampling-frequency))
+
+ (if (not inside-apply-trap?)
+ (begin
+ (set! last-start-time (get-internal-run-time))
+ (if %count-calls? (trap-enable 'apply-frame))))))
+
+ (set! inside-profiler? #f))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Count total calls.
+
+(define (count-call trap-name continuation tail)
+ (if (not inside-profiler?)
+ (begin
+ (accumulate-time (get-internal-run-time))
+
+ (and=> (frame-procedure (last-stack-frame continuation))
+ (lambda (proc)
+ (if (procedure-name proc)
+ (inc-call-data-call-count!
+ (get-call-data proc)))))
+
+ (set! last-start-time (get-internal-run-time)))))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (statprof-active?)
+ "Returns @code{#t} if @code{statprof-start} has been called more times
+than @code{statprof-stop}, @code{#f} otherwise."
+ (positive? profile-level))
+
+;; Do not call this from statprof internal functions -- user only.
+(define (statprof-start)
+ "Start the profiler.@code{}"
+ ;; After some head-scratching, I don't *think* I need to mask/unmask
+ ;; signals here, but if I'm wrong, please let me know.
+ (set! profile-level (+ profile-level 1))
+ (if (= profile-level 1)
+ (let* ((rpt remaining-prof-time)
+ (use-rpt? (and rpt
+ (or (positive? (car rpt))
+ (positive? (cdr rpt))))))
+ (set! remaining-prof-time #f)
+ (set! last-start-time (get-internal-run-time))
+ (set! gc-time-taken
+ (cdr (assq 'gc-time-taken (gc-stats))))
+ (if use-rpt?
+ (setitimer ITIMER_PROF 0 0 (car rpt) (cdr rpt))
+ (setitimer ITIMER_PROF
+ 0 0
+ (car sampling-frequency)
+ (cdr sampling-frequency)))
+ (trap-enable 'apply-frame)
+ #t)))
+
+;; Do not call this from statprof internal functions -- user only.
+(define (statprof-stop)
+ "Stop the profiler.@code{}"
+ ;; After some head-scratching, I don't *think* I need to mask/unmask
+ ;; signals here, but if I'm wrong, please let me know.
+ (set! profile-level (- profile-level 1))
+ (if (zero? profile-level)
+ (begin
+ (set! gc-time-taken
+ (- (cdr (assq 'gc-time-taken (gc-stats))) gc-time-taken))
+ (trap-disable 'apply-frame)
+ ;; I believe that we need to do this before getting the time
+ ;; (unless we want to make things even more complicated).
+ (set! remaining-prof-time (setitimer ITIMER_PROF 0 0 0 0))
+ (accumulate-time (get-internal-run-time))
+ (set! last-start-time #f))))
+
+(define (statprof-reset sample-seconds sample-microseconds count-calls?
+ . full-stacks?)
+ "Reset the statprof sampler interval to @var{sample-seconds} and
+@var{sample-microseconds}. If @var{count-calls?} is true, arrange to
+instrument procedure calls as well as collecting statistical profiling
+data. If @var{full-stacks?} is true, collect all sampled stacks into a
+list for later analysis.
+
+Enables traps and debugging as necessary."
+ (if (positive? profile-level)
+ (error "Can't reset profiler while profiler is running."))
+ (set! %count-calls? count-calls?)
+ (set! accumulated-time 0)
+ (set! last-start-time #f)
+ (set! sample-count 0)
+ (set! sampling-frequency (cons sample-seconds sample-microseconds))
+ (set! remaining-prof-time #f)
+ (set! procedure-data (make-hash-table 131))
+ (if %count-calls?
+ (begin
+ (trap-set! apply-frame-handler count-call)
+ (trap-enable 'traps)))
+ (set! record-full-stacks? (and (pair? full-stacks?) (car full-stacks?)))
+ (set! stacks '())
+ (debug-enable 'debug)
+ (sigaction SIGPROF profile-signal-handler)
+ #t)
+
+(define (statprof-fold-call-data proc init)
+ "Fold @var{proc} over the call-data accumulated by statprof. Cannot be
+called while statprof is active. @var{proc} should take two arguments,
+@code{(@var{call-data} @var{prior-result})}.
+
+Note that a given proc-name may appear multiple times, but if it does,
+it represents different functions with the same name."
+ (if (positive? profile-level)
+ (error "Can't call statprof-fold-called while profiler is running."))
+
+ (hash-fold
+ (lambda (key value prior-result)
+ (proc value prior-result))
+ init
+ procedure-data))
+
+(define (statprof-proc-call-data proc)
+ "Returns the call-data associated with @var{proc}, or @code{#f} if
+none is available."
+ (if (positive? profile-level)
+ (error "Can't call statprof-fold-called while profiler is running."))
+
+ (hashq-ref procedure-data proc))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Stats
+
+(define (statprof-call-data->stats call-data)
+ "Returns an object of type @code{statprof-stats}."
+ ;; returns (vector proc-name
+ ;; %-time-in-proc
+ ;; cum-seconds-in-proc
+ ;; self-seconds-in-proc
+ ;; num-calls
+ ;; self-secs-per-call
+ ;; total-secs-per-call)
+
+ (let* ((proc-name (call-data-name call-data))
+ (self-samples (call-data-self-sample-count call-data))
+ (cum-samples (call-data-cum-sample-count call-data))
+ (all-samples (statprof-sample-count))
+ (secs-per-sample (/ (statprof-accumulated-time)
+ (statprof-sample-count)))
+ (num-calls (and %count-calls? (statprof-call-data-calls call-data))))
+
+ (vector proc-name
+ (* (/ self-samples all-samples) 100.0)
+ (* cum-samples secs-per-sample 1.0)
+ (* self-samples secs-per-sample 1.0)
+ num-calls
+ (and num-calls ;; maybe we only sampled in children
+ (if (zero? self-samples) 0.0
+ (/ (* self-samples secs-per-sample) 1.0 num-calls)))
+ (and num-calls ;; cum-samples must be positive
+ (/ (* cum-samples secs-per-sample) 1.0 num-calls)))))
+
+(define (statprof-stats-proc-name stats) (vector-ref stats 0))
+(define (statprof-stats-%-time-in-proc stats) (vector-ref stats 1))
+(define (statprof-stats-cum-secs-in-proc stats) (vector-ref stats 2))
+(define (statprof-stats-self-secs-in-proc stats) (vector-ref stats 3))
+(define (statprof-stats-calls stats) (vector-ref stats 4))
+(define (statprof-stats-self-secs-per-call stats) (vector-ref stats 5))
+(define (statprof-stats-cum-secs-per-call stats) (vector-ref stats 6))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (stats-sorter x y)
+ (let ((diff (- (statprof-stats-self-secs-in-proc x)
+ (statprof-stats-self-secs-in-proc y))))
+ (positive?
+ (if (= diff 0)
+ (- (statprof-stats-cum-secs-in-proc x)
+ (statprof-stats-cum-secs-in-proc y))
+ diff))))
+
+(define (statprof-display . port)
+ "Displays a gprof-like summary of the statistics collected. Unless an
+optional @var{port} argument is passed, uses the current output port."
+ (if (null? port) (set! port (current-output-port)))
+
+ (cond
+ ((zero? (statprof-sample-count))
+ (format port "No samples recorded.\n"))
+ (else
+ (let* ((stats-list (statprof-fold-call-data
+ (lambda (data prior-value)
+ (cons (statprof-call-data->stats data)
+ prior-value))
+ '()))
+ (sorted-stats (sort stats-list stats-sorter)))
+
+ (define (display-stats-line stats)
+ (if %count-calls?
+ (format port "~6,2f ~9,2f ~9,2f ~8r ~8,2f ~8,2f "
+ (statprof-stats-%-time-in-proc stats)
+ (statprof-stats-cum-secs-in-proc stats)
+ (statprof-stats-self-secs-in-proc stats)
+ (statprof-stats-calls stats)
+ (* 1000 (statprof-stats-self-secs-per-call stats))
+ (* 1000 (statprof-stats-cum-secs-per-call stats)))
+ (format port "~6,2f ~9,2f ~9,2f "
+ (statprof-stats-%-time-in-proc stats)
+ (statprof-stats-cum-secs-in-proc stats)
+ (statprof-stats-self-secs-in-proc stats)))
+ (display (statprof-stats-proc-name stats) port)
+ (newline port))
+
+ (if %count-calls?
+ (begin
+ (format port "~5a ~10a ~7a ~8a ~8a ~8a ~8@a\n"
+ "% " "cumulative" "self" "" "self" "total" "")
+ (format port "~5a ~9a ~8a ~8a ~8a ~8a ~8@a\n"
+ "time" "seconds" "seconds" "calls" "ms/call" "ms/call" "name"))
+ (begin
+ (format port "~5a ~10a ~7a ~8@a\n"
+ "%" "cumulative" "self" "")
+ (format port "~5a ~10a ~7a ~8@a\n"
+ "time" "seconds" "seconds" "name")))
+
+ (for-each display-stats-line sorted-stats)
+
+ (display "---\n" port)
+ (simple-format #t "Sample count: ~A\n" (statprof-sample-count))
+ (simple-format #t "Total time: ~A seconds (~A seconds in GC)\n"
+ (statprof-accumulated-time)
+ (/ gc-time-taken internal-time-units-per-second))))))
+
+(define (statprof-display-anomolies)
+ "A sanity check that attempts to detect anomolies in statprof's
+statistics.@code{}"
+ (statprof-fold-call-data
+ (lambda (data prior-value)
+ (if (and %count-calls?
+ (zero? (call-data-call-count data))
+ (positive? (call-data-sample-count data)))
+ (simple-format #t
+ "==[~A ~A ~A]\n"
+ (call-data-name data)
+ (call-data-call-count data)
+ (call-data-sample-count data))))
+ #f)
+ (simple-format #t "Total time: ~A\n" (statprof-accumulated-time))
+ (simple-format #t "Sample count: ~A\n" (statprof-sample-count)))
+
+(define (statprof-accumulated-time)
+ "Returns the time accumulated during the last statprof run.@code{}"
+ (if (positive? profile-level)
+ (error "Can't get accumulated time while profiler is running."))
+ (/ accumulated-time internal-time-units-per-second))
+
+(define (statprof-sample-count)
+ "Returns the number of samples taken during the last statprof run.@code{}"
+ (if (positive? profile-level)
+ (error "Can't get accumulated time while profiler is running."))
+ sample-count)
+
+(define statprof-call-data-name call-data-name)
+(define statprof-call-data-calls call-data-call-count)
+(define statprof-call-data-cum-samples call-data-cum-sample-count)
+(define statprof-call-data-self-samples call-data-self-sample-count)
+
+(define (statprof-fetch-stacks)
+ "Returns a list of stacks, as they were captured since the last call
+to @code{statprof-reset}.
+
+Note that stacks are only collected if the @var{full-stacks?} argument
+to @code{statprof-reset} is true."
+ stacks)
+
+(define procedure=?
+ (if (false-if-exception (resolve-interface '(system base compile)))
+ (lambda (a b)
+ (cond
+ ((eq? a b))
+ ((and ((@ (system vm program) program?) a)
+ ((@ (system vm program) program?) b))
+ (eq? ((@ (system vm program) program-objcode) a)
+ ((@ (system vm program) program-objcode) b)))
+ ((and (closure? a) (closure? b)
+ (procedure-source a) (procedure-source b))
+ (and (eq? (procedure-name a) (procedure-name b))
+ (equal? (procedure-source a) (procedure-source b))))
+ (else
+ #f)))
+ (lambda (a b)
+ (cond
+ ((eq? a b))
+ ((and (closure? a) (closure? b)
+ (procedure-source a) (procedure-source b))
+ (and (eq? (procedure-name a) (procedure-name b))
+ (equal? (procedure-source a) (procedure-source b))))
+ (else
+ #f)))))
+
+;; tree ::= (car n . tree*)
+
+(define (lists->trees lists equal?)
+ (let lp ((in lists) (n-terminal 0) (tails '()))
+ (cond
+ ((null? in)
+ (let ((trees (map (lambda (tail)
+ (cons (car tail)
+ (lists->trees (cdr tail) equal?)))
+ tails)))
+ (cons (apply + n-terminal (map cadr trees))
+ (sort trees
+ (lambda (a b) (> (cadr a) (cadr b)))))))
+ ((null? (car in))
+ (lp (cdr in) (1+ n-terminal) tails))
+ ((find (lambda (x) (equal? (car x) (caar in)))
+ tails)
+ => (lambda (tail)
+ (lp (cdr in)
+ n-terminal
+ (assq-set! tails
+ (car tail)
+ (cons (cdar in) (cdr tail))))))
+ (else
+ (lp (cdr in)
+ n-terminal
+ (acons (caar in) (list (cdar in)) tails))))))
+
+(define (stack->procedures stack)
+ (filter identity
+ (unfold-right (lambda (x) (not x))
+ frame-procedure
+ frame-previous
+ (stack-ref stack 0))))
+
+(define (statprof-fetch-call-tree)
+ "Return a call tree for the previous statprof run.
+
+The return value is a list of nodes, each of which is of the type:
+@code
+ node ::= (@var{proc} @var{count} . @var{nodes})
+@end code"
+ (cons #t (lists->trees (map stack->procedures stacks) procedure=?)))
+
+(define-macro (with-statprof . args)
+ "Profiles the expressions in its body.
+
+Keyword arguments:
+
+@table @code
+@item #:loop
+Execute the body @var{loop} number of times, or @code{#f} for no looping
+
+default: @code{#f}
+@item #:hz
+Sampling rate
+
+default: @code{20}
+@item #:count-calls?
+Whether to instrument each function call (expensive)
+
+default: @code{#f}
+@item #:full-stacks?
+Whether to collect away all sampled stacks into a list
+
+default: @code{#f}
+@end table"
+ (define (kw-arg-ref kw args def)
+ (cond
+ ((null? args) (error "Invalid macro body"))
+ ((keyword? (car args))
+ (if (eq? (car args) kw)
+ (cadr args)
+ (kw-arg-ref kw (cddr args) def)))
+ ((eq? kw #f def) ;; asking for the body
+ args)
+ (else def))) ;; kw not found
+ (let ((loop (kw-arg-ref #:loop args #f))
+ (hz (kw-arg-ref #:hz args 20))
+ (count-calls? (kw-arg-ref #:count-calls? args #f))
+ (full-stacks? (kw-arg-ref #:full-stacks? args #f))
+ (body (kw-arg-ref #f args #f)))
+ `(dynamic-wind
+ (lambda ()
+ (statprof-reset (inexact->exact (floor (/ 1 ,hz)))
+ (inexact->exact (* 1e6 (- (/ 1 ,hz)
+ (floor (/ 1 ,hz)))))
+ ,count-calls?
+ ,full-stacks?)
+ (statprof-start))
+ (lambda ()
+ ,(if loop
+ (let ((lp (gensym "statprof ")) (x (gensym)))
+ `(let ,lp ((,x ,loop))
+ (if (not (zero? ,x))
+ (begin ,@body (,lp (1- ,x))))))
+ `(begin ,@body)))
+ (lambda ()
+ (statprof-stop)
+ (statprof-display)
+ (set! (@@ (statprof) procedure-data) #f)))))
+
+;;; arch-tag: 83969178-b576-4c52-a31c-6a9c2be85d10
diff --git a/module/sxml/apply-templates.scm b/module/sxml/apply-templates.scm
new file mode 100644
index 000000000..0ee27477c
--- /dev/null
+++ b/module/sxml/apply-templates.scm
@@ -0,0 +1,102 @@
+;;;; (sxml apply-templates) -- xslt-like transformation for sxml
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright 2004 by Andy Wingo <wingo at pobox dot com>.
+;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as apply-templates.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;; Pre-order traversal of a tree and creation of a new tree:
+;;
+;;@smallexample
+;; apply-templates:: tree x <templates> -> <new-tree>
+;;@end smallexample
+;; where
+;;@smallexample
+;; <templates> ::= (<template> ...)
+;; <template> ::= (<node-test> <node-test> ... <node-test> . <handler>)
+;; <node-test> ::= an argument to node-typeof? above
+;; <handler> ::= <tree> -> <new-tree>
+;;@end smallexample
+;;
+;; This procedure does a @emph{normal}, pre-order traversal of an SXML
+;; tree. It walks the tree, checking at each node against the list of
+;; matching templates.
+;;
+;; If the match is found (which must be unique, i.e., unambiguous), the
+;; corresponding handler is invoked and given the current node as an
+;; argument. The result from the handler, which must be a @code{<tree>},
+;; takes place of the current node in the resulting tree.
+;;
+;; The name of the function is not accidental: it resembles rather
+;; closely an @code{apply-templates} function of XSLT.
+;;
+;;; Code:
+
+(define-module (sxml apply-templates)
+ #:use-module (sxml ssax)
+ #:use-module ((sxml xpath) :hide (filter))
+
+ #:export (apply-templates))
+
+(define (apply-templates tree templates)
+
+ ; Filter the list of templates. If a template does not
+ ; contradict the given node (that is, its head matches
+ ; the type of the node), chop off the head and keep the
+ ; rest as the result. All contradicting templates are removed.
+ (define (filter-templates node templates)
+ (cond
+ ((null? templates) templates)
+ ((not (pair? (car templates))) ; A good template must be a list
+ (filter-templates node (cdr templates)))
+ (((node-typeof? (caar templates)) node)
+ (cons (cdar templates) (filter-templates node (cdr templates))))
+ (else
+ (filter-templates node (cdr templates)))))
+
+ ; Here <templates> ::= [<template> | <handler>]
+ ; If there is a <handler> in the above list, it must
+ ; be only one. If found, return it; otherwise, return #f
+ (define (find-handler templates)
+ (and (pair? templates)
+ (cond
+ ((procedure? (car templates))
+ (if (find-handler (cdr templates))
+ (error "ambiguous template match"))
+ (car templates))
+ (else (find-handler (cdr templates))))))
+
+ (let loop ((tree tree) (active-templates '()))
+ ;(cout "active-templates: " active-templates nl "tree: " tree nl)
+ (if (nodeset? tree)
+ (map-union (lambda (a-tree) (loop a-tree active-templates)) tree)
+ (let ((still-active-templates
+ (append
+ (filter-templates tree active-templates)
+ (filter-templates tree templates))))
+ (cond
+ ;((null? still-active-templates) '())
+ ((find-handler still-active-templates) =>
+ (lambda (handler) (handler tree)))
+ ((not (pair? tree)) '())
+ (else
+ (loop (cdr tree) still-active-templates)))))))
+
+;;; arch-tag: 88cd87de-8825-4ab3-9721-cf99694fb787
+;;; templates.scm ends here
diff --git a/module/sxml/fold.scm b/module/sxml/fold.scm
new file mode 100644
index 000000000..4a39da050
--- /dev/null
+++ b/module/sxml/fold.scm
@@ -0,0 +1,259 @@
+;;;; (sxml fold) -- transformation of sxml via fold operations
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Written 2007 by Andy Wingo <wingo at pobox dot com>.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;; @code{(sxml fold)} defines a number of variants of the @dfn{fold}
+;; algorithm for use in transforming SXML trees. Additionally it defines
+;; the layout operator, @code{fold-layout}, which might be described as
+;; a context-passing variant of SSAX's @code{pre-post-order}.
+;;
+;;; Code:
+
+(define-module (sxml fold)
+ #:export (foldt
+ fold
+ foldts
+ foldts*
+ fold-values
+ foldts*-values
+ fold-layout))
+
+(define (atom? x)
+ (not (pair? x)))
+
+(define (foldt fup fhere tree)
+ "The standard multithreaded tree fold.
+
+@var{fup} is of type [a] -> a. @var{fhere} is of type object -> a.
+"
+ (if (atom? tree)
+ (fhere tree)
+ (fup (map (lambda (kid)
+ (foldt fup fhere kid))
+ tree))))
+
+(define (fold proc seed list)
+ "The standard list fold.
+
+@var{proc} is of type a -> b -> b. @var{seed} is of type b. @var{list}
+is of type [a]."
+ (if (null? list)
+ seed
+ (fold proc (proc (car list) seed) (cdr list))))
+
+(define (foldts fdown fup fhere seed tree)
+ "The single-threaded tree fold originally defined in SSAX.
+@xref{sxml ssax,,(sxml ssax)}, for more information."
+ (if (atom? tree)
+ (fhere seed tree)
+ (fup seed
+ (fold (lambda (kid kseed)
+ (foldts fdown fup fhere kseed kid))
+ (fdown seed tree)
+ tree)
+ tree)))
+
+(define (foldts* fdown fup fhere seed tree)
+ "A variant of @ref{sxml fold foldts,,foldts} that allows pre-order
+tree rewrites. Originally defined in Andy Wingo's 2007 paper,
+@emph{Applications of fold to XML transformation}."
+ (if (atom? tree)
+ (fhere seed tree)
+ (call-with-values
+ (lambda () (fdown seed tree))
+ (lambda (kseed tree)
+ (fup seed
+ (fold (lambda (kid kseed)
+ (foldts* fdown fup fhere
+ kseed kid))
+ kseed
+ tree)
+ tree)))))
+
+(define (fold-values proc list . seeds)
+ "A variant of @ref{sxml fold fold,,fold} that allows multi-valued
+seeds. Note that the order of the arguments differs from that of
+@code{fold}."
+ (if (null? list)
+ (apply values seeds)
+ (call-with-values
+ (lambda () (apply proc (car list) seeds))
+ (lambda seeds
+ (apply fold-values proc (cdr list) seeds)))))
+
+(define (foldts*-values fdown fup fhere tree . seeds)
+ "A variant of @ref{sxml fold foldts*,,foldts*} that allows
+multi-valued seeds. Originally defined in Andy Wingo's 2007 paper,
+@emph{Applications of fold to XML transformation}."
+ (if (atom? tree)
+ (apply fhere tree seeds)
+ (call-with-values
+ (lambda () (apply fdown tree seeds))
+ (lambda (tree . kseeds)
+ (call-with-values
+ (lambda ()
+ (apply fold-values
+ (lambda (tree . seeds)
+ (apply foldts*-values
+ fdown fup fhere tree seeds))
+ tree kseeds))
+ (lambda kseeds
+ (apply fup tree (append seeds kseeds))))))))
+
+(define (assq-ref alist key default)
+ (cond ((assq key alist) => cdr)
+ (else default)))
+
+(define (fold-layout tree bindings params layout stylesheet)
+ "A traversal combinator in the spirit of SSAX's @ref{sxml transform
+pre-post-order,,pre-post-order}.
+
+@code{fold-layout} was originally presented in Andy Wingo's 2007 paper,
+@emph{Applications of fold to XML transformation}.
+
+@example
+bindings := (<binding>...)
+binding := (<tag> <bandler-pair>...)
+ | (*default* . <post-handler>)
+ | (*text* . <text-handler>)
+tag := <symbol>
+handler-pair := (pre-layout . <pre-layout-handler>)
+ | (post . <post-handler>)
+ | (bindings . <bindings>)
+ | (pre . <pre-handler>)
+ | (macro . <macro-handler>)
+@end example
+
+@table @var
+@item pre-layout-handler
+A function of three arguments:
+
+@table @var
+@item kids
+the kids of the current node, before traversal
+@item params
+the params of the current node
+@item layout
+the layout coming into this node
+@end table
+
+@var{pre-layout-handler} is expected to use this information to return a
+layout to pass to the kids. The default implementation returns the
+layout given in the arguments.
+
+@item post-handler
+A function of five arguments:
+@table @var
+@item tag
+the current tag being processed
+@item params
+the params of the current node
+@item layout
+the layout coming into the current node, before any kids were processed
+@item klayout
+the layout after processing all of the children
+@item kids
+the already-processed child nodes
+@end table
+
+@var{post-handler} should return two values, the layout to pass to the
+next node and the final tree.
+
+@item text-handler
+@var{text-handler} is a function of three arguments:
+@table @var
+@item text
+the string
+@item params
+the current params
+@item layout
+the current layout
+@end table
+
+@var{text-handler} should return two values, the layout to pass to the
+next node and the value to which the string should transform.
+@end table
+"
+ (define (err . args)
+ (error "no binding available" args))
+ (define (fdown tree bindings pcont params layout ret)
+ (define (fdown-helper new-bindings new-layout cont)
+ (let ((cont-with-tag (lambda args
+ (apply cont (car tree) args)))
+ (bindings (if new-bindings
+ (append new-bindings bindings)
+ bindings))
+ (style-params (assq-ref stylesheet (car tree) '())))
+ (cond
+ ((null? (cdr tree))
+ (values
+ '() bindings cont-with-tag (cons style-params params) new-layout '()))
+ ((and (pair? (cadr tree)) (eq? (caadr tree) '@))
+ (let ((params (cons (append (cdadr tree) style-params) params)))
+ (values
+ (cddr tree) bindings cont-with-tag params new-layout '())))
+ (else
+ (values
+ (cdr tree) bindings cont-with-tag (cons style-params params) new-layout '())))))
+ (define (no-bindings)
+ (fdown-helper #f layout (assq-ref bindings '*default* err)))
+ (define (macro macro-handler)
+ (fdown (apply macro-handler tree)
+ bindings pcont params layout ret))
+ (define (pre pre-handler)
+ (values '() bindings
+ (lambda (params layout old-layout kids)
+ (values layout (reverse kids)))
+ params layout (apply pre-handler tree)))
+ (define (have-bindings tag-bindings)
+ (fdown-helper
+ (assq-ref tag-bindings 'bindings #f)
+ ((assq-ref tag-bindings 'pre-layout
+ (lambda (tag params layout)
+ layout))
+ tree params layout)
+ (assq-ref tag-bindings 'post
+ (assq-ref bindings '*default* err))))
+ (let ((tag-bindings (assq-ref bindings (car tree) #f)))
+ (cond
+ ((not tag-bindings) (no-bindings))
+ ((assq-ref tag-bindings 'macro #f) => macro)
+ ((assq-ref tag-bindings 'pre #f) => pre)
+ (else (have-bindings tag-bindings)))))
+ (define (fup tree bindings cont params layout ret
+ kbindings kcont kparams klayout kret)
+ (call-with-values
+ (lambda ()
+ (kcont kparams layout klayout (reverse kret)))
+ (lambda (klayout kret)
+ (values bindings cont params klayout (cons kret ret)))))
+ (define (fhere tree bindings cont params layout ret)
+ (call-with-values
+ (lambda ()
+ ((assq-ref bindings '*text* err) tree params layout))
+ (lambda (tlayout tret)
+ (values bindings cont params tlayout (cons tret ret)))))
+ (call-with-values
+ (lambda ()
+ (foldts*-values
+ fdown fup fhere tree bindings #f (cons params '()) layout '()))
+ (lambda (bindings cont params layout ret)
+ (values (car ret) layout))))
diff --git a/module/sxml/simple.scm b/module/sxml/simple.scm
new file mode 100644
index 000000000..a1b485420
--- /dev/null
+++ b/module/sxml/simple.scm
@@ -0,0 +1,169 @@
+;;;; (sxml simple) -- a simple interface to the SSAX parser
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
+;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;A simple interface to XML parsing and serialization.
+;;
+;;; Code:
+
+(define-module (sxml simple)
+ #:use-module (sxml ssax)
+ #:use-module (sxml transform)
+ #:use-module (ice-9 optargs)
+ #:use-module (srfi srfi-13)
+ #:export (xml->sxml sxml->xml sxml->string universal-sxslt-rules))
+
+(define* (xml->sxml #:optional (port (current-input-port)))
+ "Use SSAX to parse an XML document into SXML. Takes one optional
+argument, @var{port}, which defaults to the current input port."
+ (ssax:xml->sxml port '()))
+
+;; Universal transformation rules. Works for all XML.
+(define universal-sxslt-rules
+ #;
+ "A set of @code{pre-post-order} rules that transform any SXML tree
+into a form suitable for XML serialization by @code{(sxml transform)}'s
+@code{SRV:send-reply}. Used internally by @code{sxml->xml}."
+ `((@
+ ((*default* . ,(lambda (attr-key . value) ((enattr attr-key) value))))
+ . ,(lambda (trigger . value) (list '@ value)))
+ (*ENTITY* . ,(lambda (tag name) (list "&" name ";")))
+ (*PI* . ,(lambda (pi tag str) (list "<?" tag " " str "?>")))
+ ;; Is this right for entities? I don't have a reference for
+ ;; public-id/system-id at the moment...
+ (*default* . ,(lambda (tag . elems) (apply (entag tag) elems)))
+ (*text* . ,(lambda (trigger str)
+ (if (string? str) (string->escaped-xml str) str)))))
+
+(define* (sxml->xml tree #:optional (port (current-output-port)))
+ "Serialize the sxml tree @var{tree} as XML. The output will be written
+to the current output port, unless the optional argument @var{port} is
+present."
+ (with-output-to-port port
+ (lambda ()
+ (SRV:send-reply
+ (post-order
+ tree
+ universal-sxslt-rules)))))
+
+(define (sxml->string sxml)
+ "Detag an sxml tree @var{sxml} into a string. Does not perform any
+formatting."
+ (string-concatenate-reverse
+ (foldts
+ (lambda (seed tree) ; fdown
+ '())
+ (lambda (seed kid-seed tree) ; fup
+ (append! kid-seed seed))
+ (lambda (seed tree) ; fhere
+ (if (string? tree) (cons tree seed) seed))
+ '()
+ sxml)))
+
+;; The following two functions serialize tags and attributes. They are
+;; being used in the node handlers for the post-order function, see
+;; above.
+
+(define (check-name name)
+ (let* ((str (symbol->string name))
+ (i (string-index str #\:))
+ (head (or (and i (substring str 0 i)) str))
+ (tail (and i (substring str (1+ i)))))
+ (and i (string-index (substring str (1+ i)) #\:)
+ (error "Invalid QName: more than one colon" name))
+ (for-each
+ (lambda (s)
+ (and s
+ (or (char-alphabetic? (string-ref s 0))
+ (eq? (string-ref s 0) #\_)
+ (error "Invalid name starting character" s name))
+ (string-for-each
+ (lambda (c)
+ (or (char-alphabetic? c) (string-index "0123456789.-_" c)
+ (error "Invalid name character" c s name)))
+ s)))
+ (list head tail))))
+
+(define (entag tag)
+ (check-name tag)
+ (lambda elems
+ (if (and (pair? elems) (pair? (car elems)) (eq? '@ (caar elems)))
+ (list #\< tag (cdar elems)
+ (if (pair? (cdr elems))
+ (list #\> (cdr elems) "</" tag #\>)
+ " />"))
+ (list #\< tag
+ (if (pair? elems)
+ (list #\> elems "</" tag #\>)
+ " />")))))
+
+(define (enattr attr-key)
+ (check-name attr-key)
+ (let ((attr-str (symbol->string attr-key)))
+ (lambda (value)
+ (list #\space attr-str
+ "=\"" (and (not (null? value)) value) #\"))))
+
+(define (make-char-quotator char-encoding)
+ (let ((bad-chars (map car char-encoding)))
+
+ ;; Check to see if str contains one of the characters in charset,
+ ;; from the position i onward. If so, return that character's index.
+ ;; otherwise, return #f
+ (define (index-cset str i charset)
+ (let loop ((i i))
+ (and (< i (string-length str))
+ (if (memv (string-ref str i) charset) i
+ (loop (+ 1 i))))))
+
+ ;; The body of the function
+ (lambda (str)
+ (let ((bad-pos (index-cset str 0 bad-chars)))
+ (if (not bad-pos) str ; str had all good chars
+ (string-concatenate-reverse
+ (let loop ((from 0) (to bad-pos) (out '()))
+ (cond
+ ((>= from (string-length str)) out)
+ ((not to)
+ (cons (substring str from (string-length str)) out))
+ (else
+ (let ((quoted-char
+ (cdr (assv (string-ref str to) char-encoding)))
+ (new-to
+ (index-cset str (+ 1 to) bad-chars)))
+ (loop (1+ to) new-to
+ (if (< from to)
+ (cons* quoted-char (substring str from to) out)
+ (cons quoted-char out)))))))))))))
+
+;; Given a string, check to make sure it does not contain characters
+;; such as '<' or '&' that require encoding. Return either the original
+;; string, or a list of string fragments with special characters
+;; replaced by appropriate character entities.
+
+(define string->escaped-xml
+ (make-char-quotator
+ '((#\< . "&lt;") (#\> . "&gt;") (#\& . "&amp;") (#\" . "&quot;"))))
+
+;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
+;;; simple.scm ends here
+
diff --git a/module/sxml/ssax.scm b/module/sxml/ssax.scm
new file mode 100644
index 000000000..8794927e9
--- /dev/null
+++ b/module/sxml/ssax.scm
@@ -0,0 +1,246 @@
+;;;; (sxml ssax) -- the SSAX parser
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
+;;;; Written 2001,2002,2003,2004 by Oleg Kiselyov <oleg at pobox dot com> as SSAX.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;@subheading Functional XML parsing framework
+;@subsubheading SAX/DOM and SXML parsers with support for XML Namespaces and validation
+;
+; This is a package of low-to-high level lexing and parsing procedures
+; that can be combined to yield a SAX, a DOM, a validating parser, or
+; a parser intended for a particular document type. The procedures in
+; the package can be used separately to tokenize or parse various
+; pieces of XML documents. The package supports XML Namespaces,
+; internal and external parsed entities, user-controlled handling of
+; whitespace, and validation. This module therefore is intended to be
+; a framework, a set of "Lego blocks" you can use to build a parser
+; following any discipline and performing validation to any degree. As
+; an example of the parser construction, this file includes a
+; semi-validating SXML parser.
+
+; The present XML framework has a "sequential" feel of SAX yet a
+; "functional style" of DOM. Like a SAX parser, the framework scans the
+; document only once and permits incremental processing. An application
+; that handles document elements in order can run as efficiently as
+; possible. @emph{Unlike} a SAX parser, the framework does not require
+; an application register stateful callbacks and surrender control to
+; the parser. Rather, it is the application that can drive the framework
+; -- calling its functions to get the current lexical or syntax element.
+; These functions do not maintain or mutate any state save the input
+; port. Therefore, the framework permits parsing of XML in a pure
+; functional style, with the input port being a monad (or a linear,
+; read-once parameter).
+
+; Besides the @var{port}, there is another monad -- @var{seed}. Most of
+; the middle- and high-level parsers are single-threaded through the
+; @var{seed}. The functions of this framework do not process or affect
+; the @var{seed} in any way: they simply pass it around as an instance
+; of an opaque datatype. User functions, on the other hand, can use the
+; seed to maintain user's state, to accumulate parsing results, etc. A
+; user can freely mix his own functions with those of the framework. On
+; the other hand, the user may wish to instantiate a high-level parser:
+; @code{SSAX:make-elem-parser} or @code{SSAX:make-parser}. In the latter
+; case, the user must provide functions of specific signatures, which
+; are called at predictable moments during the parsing: to handle
+; character data, element data, or processing instructions (PI). The
+; functions are always given the @var{seed}, among other parameters, and
+; must return the new @var{seed}.
+
+; From a functional point of view, XML parsing is a combined
+; pre-post-order traversal of a "tree" that is the XML document
+; itself. This down-and-up traversal tells the user about an element
+; when its start tag is encountered. The user is notified about the
+; element once more, after all element's children have been
+; handled. The process of XML parsing therefore is a fold over the
+; raw XML document. Unlike a fold over trees defined in [1], the
+; parser is necessarily single-threaded -- obviously as elements
+; in a text XML document are laid down sequentially. The parser
+; therefore is a tree fold that has been transformed to accept an
+; accumulating parameter [1,2].
+
+; Formally, the denotational semantics of the parser can be expressed
+; as
+;@smallexample
+; parser:: (Start-tag -> Seed -> Seed) ->
+; (Start-tag -> Seed -> Seed -> Seed) ->
+; (Char-Data -> Seed -> Seed) ->
+; XML-text-fragment -> Seed -> Seed
+; parser fdown fup fchar "<elem attrs> content </elem>" seed
+; = fup "<elem attrs>" seed
+; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
+;
+; parser fdown fup fchar "char-data content" seed
+; = parser fdown fup fchar "content" (fchar "char-data" seed)
+;
+; parser fdown fup fchar "elem-content content" seed
+; = parser fdown fup fchar "content" (
+; parser fdown fup fchar "elem-content" seed)
+;@end smallexample
+
+; Compare the last two equations with the left fold
+;@smallexample
+; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
+;@end smallexample
+
+; The real parser created by @code{SSAX:make-parser} is slightly more
+; complicated, to account for processing instructions, entity
+; references, namespaces, processing of document type declaration, etc.
+
+
+; The XML standard document referred to in this module is
+; @uref{http://www.w3.org/TR/1998/REC-xml-19980210.html}
+;
+; The present file also defines a procedure that parses the text of an
+; XML document or of a separate element into SXML, an S-expression-based
+; model of an XML Information Set. SXML is also an Abstract Syntax Tree
+; of an XML document. SXML is similar but not identical to DOM; SXML is
+; particularly suitable for Scheme-based XML/HTML authoring, SXPath
+; queries, and tree transformations. See SXML.html for more details.
+; SXML is a term implementation of evaluation of the XML document [3].
+; The other implementation is context-passing.
+
+; The present frameworks fully supports the XML Namespaces Recommendation:
+; @uref{http://www.w3.org/TR/REC-xml-names/}
+; Other links:
+;@table @asis
+;@item [1]
+; Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
+; Proc. ICFP'98, 1998, pp. 273-279.
+;@item [2]
+; Richard S. Bird, The promotion and accumulation strategies in
+; transformational programming, ACM Trans. Progr. Lang. Systems,
+; 6(4):487-504, October 1984.
+;@item [3]
+; Ralf Hinze, "Deriving Backtracking Monad Transformers,"
+; Functional Pearl. Proc ICFP'00, pp. 186-197.
+;@end table
+;;
+;;; Code:
+
+(define-module (sxml ssax)
+ #:use-module (sxml ssax input-parse)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-13)
+
+ #:export (current-ssax-error-port
+ with-ssax-error-to-port
+ xml-token? xml-token-kind xml-token-head
+ make-empty-attlist attlist-add
+ attlist-null?
+ attlist-remove-top
+ attlist->alist attlist-fold
+ ssax:uri-string->symbol
+ ssax:skip-internal-dtd
+ ssax:read-pi-body-as-string
+ ssax:reverse-collect-str-drop-ws
+ ssax:read-markup-token
+ ssax:read-cdata-body
+ ssax:read-char-ref
+ ssax:read-attributes
+ ssax:complete-start-tag
+ ssax:read-external-id
+ ssax:read-char-data
+ ssax:xml->sxml
+ ssax:make-parser
+ ssax:make-pi-parser
+ ssax:make-elem-parser))
+
+(define (parser-error port message . rest)
+ (apply throw 'parser-error port message rest))
+(define ascii->char integer->char)
+(define char->ascii char->integer)
+
+(define *current-ssax-error-port* (make-fluid))
+(define (current-ssax-error-port)
+ (fluid-ref *current-ssax-error-port*))
+
+(define (with-ssax-error-to-port port thunk)
+ (with-fluids ((*current-ssax-error-port* port))
+ (thunk)))
+
+(define (ssax:warn port msg . args)
+ (format (current-ssax-error-port)
+ ";;; SSAX warning: ~a ~a\n" msg args))
+
+(define (ucscode->string codepoint)
+ (string (integer->char codepoint)))
+
+(define char-newline #\newline)
+(define char-return #\return)
+(define char-tab #\tab)
+(define nl "\n")
+
+;; if condition is true, execute stmts in turn and return the result of
+;; the last statement otherwise, return #f
+(define-syntax when
+ (syntax-rules ()
+ ((when condition . stmts)
+ (and condition (begin . stmts)))))
+
+;; Execute a sequence of forms and return the result of the _first_ one.
+;; Like PROG1 in Lisp. Typically used to evaluate one or more forms with
+;; side effects and return a value that must be computed before some or
+;; all of the side effects happen.
+(define-syntax begin0
+ (syntax-rules ()
+ ((begin0 form form1 ... )
+ (let ((val form)) form1 ... val))))
+
+; Like let* but allowing for multiple-value bindings
+(define-syntax let*-values
+ (syntax-rules ()
+ ((let*-values () . bodies) (begin . bodies))
+ ((let*-values (((var) initializer) . rest) . bodies)
+ (let ((var initializer)) ; a single var optimization
+ (let*-values rest . bodies)))
+ ((let*-values ((vars initializer) . rest) . bodies)
+ (call-with-values (lambda () initializer) ; the most generic case
+ (lambda vars (let*-values rest . bodies))))))
+
+;; needed for some dumb reason
+(define inc 1+)
+(define dec 1-)
+
+(define-syntax include-from-path/filtered
+ (lambda (x)
+ (define (read-filtered accept-list file)
+ (with-input-from-file (%search-load-path file)
+ (lambda ()
+ (let loop ((sexp (read)) (out '()))
+ (cond
+ ((eof-object? sexp) (reverse out))
+ ((and (pair? sexp) (memq (car sexp) accept-list))
+ (loop (read) (cons sexp out)))
+ (else
+ (loop (read) out)))))))
+ (syntax-case x ()
+ ((_ accept-list file)
+ (with-syntax (((exp ...) (datum->syntax
+ x
+ (read-filtered
+ (syntax->datum #'accept-list)
+ (syntax->datum #'file)))))
+ #'(begin exp ...))))))
+
+(include-from-path "sxml/upstream/assert.scm")
+(include-from-path/filtered
+ (define define-syntax ssax:define-labeled-arg-macro)
+ "sxml/upstream/SSAX.scm")
diff --git a/module/sxml/ssax/input-parse.scm b/module/sxml/ssax/input-parse.scm
new file mode 100644
index 000000000..6e845eef8
--- /dev/null
+++ b/module/sxml/ssax/input-parse.scm
@@ -0,0 +1,180 @@
+;;;; (sxml ssax input-parse) -- a simple lexer
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
+;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as input-parse.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;; A simple lexer.
+;;
+;; The procedures in this module surprisingly often suffice to parse an
+;; input stream. They either skip, or build and return tokens, according
+;; to inclusion or delimiting semantics. The list of characters to
+;; expect, include, or to break at may vary from one invocation of a
+;; function to another. This allows the functions to easily parse even
+;; context-sensitive languages.
+;;
+;; EOF is generally frowned on, and thrown up upon if encountered.
+;; Exceptions are mentioned specifically. The list of expected
+;; characters (characters to skip until, or break-characters) may
+;; include an EOF "character", which is to be coded as the symbol,
+;; @code{*eof*}.
+;;
+;; The input stream to parse is specified as a @dfn{port}, which is
+;; usually the last (and optional) argument. It defaults to the current
+;; input port if omitted.
+;;
+;; If the parser encounters an error, it will throw an exception to the
+;; key @code{parser-error}. The arguments will be of the form
+;; @code{(@var{port} @var{message} @var{specialising-msg}*)}.
+;;
+;; The first argument is a port, which typically points to the offending
+;; character or its neighborhood. You can then use @code{port-column}
+;; and @code{port-line} to query the current position. @var{message} is
+;; the description of the error. Other arguments supply more details
+;; about the problem.
+;;
+;;; Code:
+
+(define-module (sxml ssax input-parse)
+ #:use-module (ice-9 rdelim)
+ #:export (peek-next-char
+ assert-curr-char
+ skip-until
+ skip-while
+ next-token
+ next-token-of
+ read-text-line
+ read-string
+ find-string-from-port?))
+
+(define ascii->char integer->char)
+(define char->ascii char->integer)
+(define char-newline #\newline)
+(define char-return #\return)
+(define inc 1+)
+(define dec 1-)
+
+;; rewrite oleg's define-opt into define* style
+(define-macro (define-opt bindings body . body-rest)
+ (let* ((rev-bindings (reverse bindings))
+ (opt-bindings
+ (and (pair? rev-bindings) (pair? (car rev-bindings))
+ (eq? 'optional (caar rev-bindings))
+ (cdar rev-bindings))))
+ (if opt-bindings
+ `(define* ,(append (reverse (cons #:optional (cdr rev-bindings)))
+ opt-bindings)
+ ,body ,@body-rest)
+ `(define* ,bindings ,body ,@body-rest))))
+
+(define (parser-error port message . rest)
+ (apply throw 'parser-error port message rest))
+
+(include-from-path "sxml/upstream/input-parse.scm")
+
+;; This version for guile is quite speedy, due to read-delimited (which
+;; is implemented in C).
+(define-opt (next-token prefix-skipped-chars break-chars
+ (optional (comment "") (port (current-input-port))) )
+ (let ((delims (list->string (delete '*eof* break-chars))))
+ (if (eof-object? (if (null? prefix-skipped-chars)
+ (peek-char port)
+ (skip-while prefix-skipped-chars port)))
+ (if (memq '*eof* break-chars)
+ ""
+ (parser-error port "EOF while reading a token " comment))
+ (let ((token (read-delimited delims port 'peek)))
+ (if (and (eof-object? (peek-char port))
+ (not (memq '*eof* break-chars)))
+ (parser-error port "EOF while reading a token " comment)
+ token)))))
+
+(define-opt (read-text-line (optional (port (current-input-port))) )
+ (read-line port))
+
+;; Written 1995, 1996 by Oleg Kiselyov (oleg@acm.org)
+;; Modified 1996, 1997, 1998, 2001 by A. Jaffer (agj@alum.mit.edu)
+;; Modified 2003 by Steve VanDevender (stevev@hexadecimal.uoregon.edu)
+;; Modified 2004 Andy Wingo <wingo at pobox dot com>
+;; This function is from SLIB's strsrch.scm, and is in the public domain.
+(define (find-string-from-port? str <input-port> . max-no-char)
+ "Looks for @var{str} in @var{<input-port>}, optionally within the
+first @var{max-no-char} characters."
+ (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
+ (letrec
+ ((no-chars-read 0)
+ (peeked? #f)
+ (my-peek-char ; Return a peeked char or #f
+ (lambda () (and (or (not (number? max-no-char))
+ (< no-chars-read max-no-char))
+ (let ((c (peek-char <input-port>)))
+ (cond (peeked? c)
+ ((eof-object? c) #f)
+ ((procedure? max-no-char)
+ (set! peeked? #t)
+ (if (max-no-char c) #f c))
+ ((eqv? max-no-char c) #f)
+ (else c))))))
+ (next-char (lambda () (set! peeked? #f) (read-char <input-port>)
+ (set! no-chars-read (+ 1 no-chars-read))))
+ (match-1st-char ; of the string str
+ (lambda ()
+ (let ((c (my-peek-char)))
+ (and c
+ (begin (next-char)
+ (if (char=? c (string-ref str 0))
+ (match-other-chars 1)
+ (match-1st-char)))))))
+ ;; There has been a partial match, up to the point pos-to-match
+ ;; (for example, str[0] has been found in the stream)
+ ;; Now look to see if str[pos-to-match] for would be found, too
+ (match-other-chars
+ (lambda (pos-to-match)
+ (if (>= pos-to-match (string-length str))
+ no-chars-read ; the entire string has matched
+ (let ((c (my-peek-char)))
+ (and c
+ (if (not (char=? c (string-ref str pos-to-match)))
+ (backtrack 1 pos-to-match)
+ (begin (next-char)
+ (match-other-chars (+ 1 pos-to-match)))))))))
+
+ ;; There had been a partial match, but then a wrong char showed up.
+ ;; Before discarding previously read (and matched) characters, we check
+ ;; to see if there was some smaller partial match. Note, characters read
+ ;; so far (which matter) are those of str[0..matched-substr-len - 1]
+ ;; In other words, we will check to see if there is such i>0 that
+ ;; substr(str,0,j) = substr(str,i,matched-substr-len)
+ ;; where j=matched-substr-len - i
+ (backtrack
+ (lambda (i matched-substr-len)
+ (let ((j (- matched-substr-len i)))
+ (if (<= j 0)
+ ;; backed off completely to the begining of str
+ (match-1st-char)
+ (let loop ((k 0))
+ (if (>= k j)
+ (match-other-chars j) ; there was indeed a shorter match
+ (if (char=? (string-ref str k)
+ (string-ref str (+ i k)))
+ (loop (+ 1 k))
+ (backtrack (+ 1 i) matched-substr-len))))))))
+ )
+ (match-1st-char)))
diff --git a/module/sxml/transform.scm b/module/sxml/transform.scm
new file mode 100644
index 000000000..c905456ba
--- /dev/null
+++ b/module/sxml/transform.scm
@@ -0,0 +1,298 @@
+;;;; (sxml transform) -- pre- and post-order sxml transformation
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
+;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as SXML-tree-trans.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;@heading SXML expression tree transformers
+;
+;@subheading Pre-Post-order traversal of a tree and creation of a new tree
+;@smallexample
+;pre-post-order:: <tree> x <bindings> -> <new-tree>
+;@end smallexample
+; where
+;@smallexample
+; <bindings> ::= (<binding> ...)
+; <binding> ::= (<trigger-symbol> *preorder* . <handler>) |
+; (<trigger-symbol> *macro* . <handler>) |
+; (<trigger-symbol> <new-bindings> . <handler>) |
+; (<trigger-symbol> . <handler>)
+; <trigger-symbol> ::= XMLname | *text* | *default*
+; <handler> :: <trigger-symbol> x [<tree>] -> <new-tree>
+;@end smallexample
+;
+; The pre-post-order function visits the nodes and nodelists
+; pre-post-order (depth-first). For each @code{<Node>} of the form
+; @code{(@var{name} <Node> ...)}, it looks up an association with the
+; given @var{name} among its @var{<bindings>}. If failed,
+; @code{pre-post-order} tries to locate a @code{*default*} binding. It's
+; an error if the latter attempt fails as well. Having found a binding,
+; the @code{pre-post-order} function first checks to see if the binding
+; is of the form
+;@smallexample
+; (<trigger-symbol> *preorder* . <handler>)
+;@end smallexample
+;
+; If it is, the handler is 'applied' to the current node. Otherwise, the
+; pre-post-order function first calls itself recursively for each child
+; of the current node, with @var{<new-bindings>} prepended to the
+; @var{<bindings>} in effect. The result of these calls is passed to the
+; @var{<handler>} (along with the head of the current @var{<Node>}). To
+; be more precise, the handler is _applied_ to the head of the current
+; node and its processed children. The result of the handler, which
+; should also be a @code{<tree>}, replaces the current @var{<Node>}. If
+; the current @var{<Node>} is a text string or other atom, a special
+; binding with a symbol @code{*text*} is looked up.
+;
+; A binding can also be of a form
+;@smallexample
+; (<trigger-symbol> *macro* . <handler>)
+;@end smallexample
+; This is equivalent to @code{*preorder*} described above. However, the
+; result is re-processed again, with the current stylesheet.
+;;
+;;; Code:
+
+(define-module (sxml transform)
+ #:export (SRV:send-reply
+ foldts
+ post-order
+ pre-post-order
+ replace-range))
+
+;; Upstream version:
+; $Id: SXML-tree-trans.scm,v 1.8 2003/04/24 19:39:53 oleg Exp oleg $
+
+; Like let* but allowing for multiple-value bindings
+(define-macro (let*-values bindings . body)
+ (if (null? bindings) (cons 'begin body)
+ (apply
+ (lambda (vars initializer)
+ (let ((cont
+ (cons 'let*-values
+ (cons (cdr bindings) body))))
+ (cond
+ ((not (pair? vars)) ; regular let case, a single var
+ `(let ((,vars ,initializer)) ,cont))
+ ((null? (cdr vars)) ; single var, see the prev case
+ `(let ((,(car vars) ,initializer)) ,cont))
+ (else ; the most generic case
+ `(call-with-values (lambda () ,initializer)
+ (lambda ,vars ,cont))))))
+ (car bindings))))
+
+(define (SRV:send-reply . fragments)
+ "Output the @var{fragments} to the current output port.
+
+The fragments are a list of strings, characters, numbers, thunks,
+@code{#f}, @code{#t} -- and other fragments. The function traverses the
+tree depth-first, writes out strings and characters, executes thunks,
+and ignores @code{#f} and @code{'()}. The function returns @code{#t} if
+anything was written at all; otherwise the result is @code{#f} If
+@code{#t} occurs among the fragments, it is not written out but causes
+the result of @code{SRV:send-reply} to be @code{#t}."
+ (let loop ((fragments fragments) (result #f))
+ (cond
+ ((null? fragments) result)
+ ((not (car fragments)) (loop (cdr fragments) result))
+ ((null? (car fragments)) (loop (cdr fragments) result))
+ ((eq? #t (car fragments)) (loop (cdr fragments) #t))
+ ((pair? (car fragments))
+ (loop (cdr fragments) (loop (car fragments) result)))
+ ((procedure? (car fragments))
+ ((car fragments))
+ (loop (cdr fragments) #t))
+ (else
+ (display (car fragments))
+ (loop (cdr fragments) #t)))))
+
+
+
+;------------------------------------------------------------------------
+; Traversal of an SXML tree or a grove:
+; a <Node> or a <Nodelist>
+;
+; A <Node> and a <Nodelist> are mutually-recursive datatypes that
+; underlie the SXML tree:
+; <Node> ::= (name . <Nodelist>) | "text string"
+; An (ordered) set of nodes is just a list of the constituent nodes:
+; <Nodelist> ::= (<Node> ...)
+; Nodelists, and Nodes other than text strings are both lists. A
+; <Nodelist> however is either an empty list, or a list whose head is
+; not a symbol (an atom in general). A symbol at the head of a node is
+; either an XML name (in which case it's a tag of an XML element), or
+; an administrative name such as '@'.
+; See SXPath.scm and SSAX.scm for more information on SXML.
+
+
+;; see the commentary for docs
+(define (pre-post-order tree bindings)
+ (let* ((default-binding (assq '*default* bindings))
+ (text-binding (or (assq '*text* bindings) default-binding))
+ (text-handler ; Cache default and text bindings
+ (and text-binding
+ (if (procedure? (cdr text-binding))
+ (cdr text-binding) (cddr text-binding)))))
+ (let loop ((tree tree))
+ (cond
+ ((null? tree) '())
+ ((not (pair? tree))
+ (let ((trigger '*text*))
+ (if text-handler (text-handler trigger tree)
+ (error "Unknown binding for " trigger " and no default"))))
+ ((not (symbol? (car tree))) (map loop tree)) ; tree is a nodelist
+ (else ; tree is an SXML node
+ (let* ((trigger (car tree))
+ (binding (or (assq trigger bindings) default-binding)))
+ (cond
+ ((not binding)
+ (error "Unknown binding for " trigger " and no default"))
+ ((not (pair? (cdr binding))) ; must be a procedure: handler
+ (apply (cdr binding) trigger (map loop (cdr tree))))
+ ((eq? '*preorder* (cadr binding))
+ (apply (cddr binding) tree))
+ ((eq? '*macro* (cadr binding))
+ (loop (apply (cddr binding) tree)))
+ (else ; (cadr binding) is a local binding
+ (apply (cddr binding) trigger
+ (pre-post-order (cdr tree) (append (cadr binding) bindings)))
+ ))))))))
+
+; post-order is a strict subset of pre-post-order without *preorder*
+; (let alone *macro*) traversals.
+; Now pre-post-order is actually faster than the old post-order.
+; The function post-order is deprecated and is aliased below for
+; backward compatibility.
+(define post-order pre-post-order)
+
+;------------------------------------------------------------------------
+; Extended tree fold
+; tree = atom | (node-name tree ...)
+;
+; foldts fdown fup fhere seed (Leaf str) = fhere seed str
+; foldts fdown fup fhere seed (Nd kids) =
+; fup seed $ foldl (foldts fdown fup fhere) (fdown seed) kids
+
+; procedure fhere: seed -> atom -> seed
+; procedure fdown: seed -> node -> seed
+; procedure fup: parent-seed -> last-kid-seed -> node -> seed
+; foldts returns the final seed
+
+(define (foldts fdown fup fhere seed tree)
+ (cond
+ ((null? tree) seed)
+ ((not (pair? tree)) ; An atom
+ (fhere seed tree))
+ (else
+ (let loop ((kid-seed (fdown seed tree)) (kids (cdr tree)))
+ (if (null? kids)
+ (fup seed kid-seed tree)
+ (loop (foldts fdown fup fhere kid-seed (car kids))
+ (cdr kids)))))))
+
+;------------------------------------------------------------------------
+; Traverse a forest depth-first and cut/replace ranges of nodes.
+;
+; The nodes that define a range don't have to have the same immediate
+; parent, don't have to be on the same level, and the end node of a
+; range doesn't even have to exist. A replace-range procedure removes
+; nodes from the beginning node of the range up to (but not including)
+; the end node of the range. In addition, the beginning node of the
+; range can be replaced by a node or a list of nodes. The range of
+; nodes is cut while depth-first traversing the forest. If all
+; branches of the node are cut a node is cut as well. The procedure
+; can cut several non-overlapping ranges from a forest.
+
+; replace-range:: BEG-PRED x END-PRED x FOREST -> FOREST
+; where
+; type FOREST = (NODE ...)
+; type NODE = Atom | (Name . FOREST) | FOREST
+;
+; The range of nodes is specified by two predicates, beg-pred and end-pred.
+; beg-pred:: NODE -> #f | FOREST
+; end-pred:: NODE -> #f | FOREST
+; The beg-pred predicate decides on the beginning of the range. The node
+; for which the predicate yields non-#f marks the beginning of the range
+; The non-#f value of the predicate replaces the node. The value can be a
+; list of nodes. The replace-range procedure then traverses the tree and skips
+; all the nodes, until the end-pred yields non-#f. The value of the end-pred
+; replaces the end-range node. The new end node and its brothers will be
+; re-scanned.
+; The predicates are evaluated pre-order. We do not descend into a node that
+; is marked as the beginning of the range.
+
+(define (replace-range beg-pred end-pred forest)
+
+ ; loop forest keep? new-forest
+ ; forest is the forest to traverse
+ ; new-forest accumulates the nodes we will keep, in the reverse
+ ; order
+ ; If keep? is #t, keep the curr node if atomic. If the node is not atomic,
+ ; traverse its children and keep those that are not in the skip range.
+ ; If keep? is #f, skip the current node if atomic. Otherwise,
+ ; traverse its children. If all children are skipped, skip the node
+ ; as well.
+
+ (define (loop forest keep? new-forest)
+ (if (null? forest) (values (reverse new-forest) keep?)
+ (let ((node (car forest)))
+ (if keep?
+ (cond ; accumulate mode
+ ((beg-pred node) => ; see if the node starts the skip range
+ (lambda (repl-branches) ; if so, skip/replace the node
+ (loop (cdr forest) #f
+ (append (reverse repl-branches) new-forest))))
+ ((not (pair? node)) ; it's an atom, keep it
+ (loop (cdr forest) keep? (cons node new-forest)))
+ (else
+ (let*-values
+ (((node?) (symbol? (car node))) ; or is it a nodelist?
+ ((new-kids keep?) ; traverse its children
+ (loop (if node? (cdr node) node) #t '())))
+ (loop (cdr forest) keep?
+ (cons
+ (if node? (cons (car node) new-kids) new-kids)
+ new-forest)))))
+ ; skip mode
+ (cond
+ ((end-pred node) => ; end the skip range
+ (lambda (repl-branches) ; repl-branches will be re-scanned
+ (loop (append repl-branches (cdr forest)) #t
+ new-forest)))
+ ((not (pair? node)) ; it's an atom, skip it
+ (loop (cdr forest) keep? new-forest))
+ (else
+ (let*-values
+ (((node?) (symbol? (car node))) ; or is it a nodelist?
+ ((new-kids keep?) ; traverse its children
+ (loop (if node? (cdr node) node) #f '())))
+ (loop (cdr forest) keep?
+ (if (or keep? (pair? new-kids))
+ (cons
+ (if node? (cons (car node) new-kids) new-kids)
+ new-forest)
+ new-forest) ; if all kids are skipped
+ )))))))) ; skip the node too
+
+ (let*-values (((new-forest keep?) (loop forest #t '())))
+ new-forest))
+
+;;; arch-tag: 6c814f4b-38f7-42c1-b8ef-ce3447edefc7
+;;; transform.scm ends here
diff --git a/module/sxml/upstream/COPYING.SSAX b/module/sxml/upstream/COPYING.SSAX
new file mode 100644
index 000000000..9dc72b8ea
--- /dev/null
+++ b/module/sxml/upstream/COPYING.SSAX
@@ -0,0 +1,2 @@
+The files in this directory are imported directly from upstream SSAX,
+and are in the public domain.
diff --git a/module/sxml/upstream/SSAX.scm b/module/sxml/upstream/SSAX.scm
new file mode 100644
index 000000000..776e3119e
--- /dev/null
+++ b/module/sxml/upstream/SSAX.scm
@@ -0,0 +1,3212 @@
+; Functional XML parsing framework: SAX/DOM and SXML parsers
+; with support for XML Namespaces and validation
+;
+; This is a package of low-to-high level lexing and parsing procedures
+; that can be combined to yield a SAX, a DOM, a validating parsers, or
+; a parser intended for a particular document type. The procedures in
+; the package can be used separately to tokenize or parse various
+; pieces of XML documents. The package supports XML Namespaces,
+; internal and external parsed entities, user-controlled handling of
+; whitespace, and validation. This module therefore is intended to be
+; a framework, a set of "Lego blocks" you can use to build a parser
+; following any discipline and performing validation to any degree. As
+; an example of the parser construction, this file includes a
+; semi-validating SXML parser.
+
+; The present XML framework has a "sequential" feel of SAX yet a
+; "functional style" of DOM. Like a SAX parser, the framework scans
+; the document only once and permits incremental processing. An
+; application that handles document elements in order can run as
+; efficiently as possible. _Unlike_ a SAX parser, the framework does
+; not require an application register stateful callbacks and surrender
+; control to the parser. Rather, it is the application that can drive
+; the framework -- calling its functions to get the current lexical or
+; syntax element. These functions do not maintain or mutate any state
+; save the input port. Therefore, the framework permits parsing of XML
+; in a pure functional style, with the input port being a monad (or a
+; linear, read-once parameter).
+
+; Besides the PORT, there is another monad -- SEED. Most of the
+; middle- and high-level parsers are single-threaded through the
+; seed. The functions of this framework do not process or affect the
+; SEED in any way: they simply pass it around as an instance of an
+; opaque datatype. User functions, on the other hand, can use the
+; seed to maintain user's state, to accumulate parsing results, etc. A
+; user can freely mix his own functions with those of the
+; framework. On the other hand, the user may wish to instantiate a
+; high-level parser: ssax:make-elem-parser or ssax:make-parser. In
+; the latter case, the user must provide functions of specific
+; signatures, which are called at predictable moments during the
+; parsing: to handle character data, element data, or processing
+; instructions (PI). The functions are always given the SEED, among
+; other parameters, and must return the new SEED.
+
+; From a functional point of view, XML parsing is a combined
+; pre-post-order traversal of a "tree" that is the XML document
+; itself. This down-and-up traversal tells the user about an element
+; when its start tag is encountered. The user is notified about the
+; element once more, after all element's children have been
+; handled. The process of XML parsing therefore is a fold over the
+; raw XML document. Unlike a fold over trees defined in [1], the
+; parser is necessarily single-threaded -- obviously as elements
+; in a text XML document are laid down sequentially. The parser
+; therefore is a tree fold that has been transformed to accept an
+; accumulating parameter [1,2].
+
+; Formally, the denotational semantics of the parser can be expressed
+; as
+; parser:: (Start-tag -> Seed -> Seed) ->
+; (Start-tag -> Seed -> Seed -> Seed) ->
+; (Char-Data -> Seed -> Seed) ->
+; XML-text-fragment -> Seed -> Seed
+; parser fdown fup fchar "<elem attrs> content </elem>" seed
+; = fup "<elem attrs>" seed
+; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
+;
+; parser fdown fup fchar "char-data content" seed
+; = parser fdown fup fchar "content" (fchar "char-data" seed)
+;
+; parser fdown fup fchar "elem-content content" seed
+; = parser fdown fup fchar "content" (
+; parser fdown fup fchar "elem-content" seed)
+
+; Compare the last two equations with the left fold
+; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
+
+; The real parser created my ssax:make-parser is slightly more complicated,
+; to account for processing instructions, entity references, namespaces,
+; processing of document type declaration, etc.
+
+
+; The XML standard document referred to in this module is
+; http://www.w3.org/TR/1998/REC-xml-19980210.html
+;
+; The present file also defines a procedure that parses the text of an
+; XML document or of a separate element into SXML, an
+; S-expression-based model of an XML Information Set. SXML is also an
+; Abstract Syntax Tree of an XML document. SXML is similar
+; but not identical to DOM; SXML is particularly suitable for
+; Scheme-based XML/HTML authoring, SXPath queries, and tree
+; transformations. See SXML.html for more details.
+; SXML is a term implementation of evaluation of the XML document [3].
+; The other implementation is context-passing.
+
+; The present frameworks fully supports the XML Namespaces Recommendation:
+; http://www.w3.org/TR/REC-xml-names/
+; Other links:
+; [1] Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
+; Proc. ICFP'98, 1998, pp. 273-279.
+; [2] Richard S. Bird, The promotion and accumulation strategies in
+; transformational programming, ACM Trans. Progr. Lang. Systems,
+; 6(4):487-504, October 1984.
+; [3] Ralf Hinze, "Deriving Backtracking Monad Transformers,"
+; Functional Pearl. Proc ICFP'00, pp. 186-197.
+
+; IMPORT
+; parser-error ssax:warn, see Handling of errors, below
+; functions declared in files util.scm, input-parse.scm and look-for-str.scm
+; char-encoding.scm for various platform-specific character-encoding functions.
+; From SRFI-13: string-concatenate/shared and string-concatenate-reverse/shared
+; If a particular implementation lacks SRFI-13 support, please
+; include the file srfi-13-local.scm
+
+; Handling of errors
+; This package relies on a function parser-error, which must be defined
+; by a user of the package. The function has the following signature:
+; parser-error PORT MESSAGE SPECIALISING-MSG*
+; Many procedures of this package call 'parser-error' whenever a
+; parsing, well-formedness or validation error is encountered. The
+; first argument is a port, which typically points to the offending
+; character or its neighborhood. Most of the Scheme systems let the
+; user query a PORT for the current position. The MESSAGE argument
+; indicates a failed XML production or a failed XML constraint. The
+; latter is referred to by its anchor name in the XML Recommendation
+; or XML Namespaces Recommendation. The parsing library (e.g.,
+; next-token, assert-curr-char) invoke 'parser-error' as well, in
+; exactly the same way. See input-parse.scm for more details.
+; See
+; http://pair.com/lisovsky/download/parse-error.scm
+; for an excellent example of such a redefined parser-error function.
+;
+; In addition, the present code invokes a function ssax:warn
+; ssax:warn PORT MESSAGE SPECIALISING-MSG*
+; to notify the user about warnings that are NOT errors but still
+; may alert the user.
+;
+; Again, parser-error and ssax:warn are supposed to be defined by the
+; user. However, if a run-test macro below is set to include
+; self-tests, this present code does provide the definitions for these
+; functions to allow tests to run.
+
+; Misc notes
+; It seems it is highly desirable to separate tests out in a dedicated
+; file.
+;
+; Jim Bender wrote on Mon, 9 Sep 2002 20:03:42 EDT on the SSAX-SXML
+; mailing list (message A fine-grained "lego")
+; The task was to record precise source location information, as PLT
+; does with its current XML parser. That parser records the start and
+; end location (filepos, line#, column#) for pi, elements, attributes,
+; chuncks of "pcdata".
+; As suggested above, though, in some cases I needed to be able force
+; open an interface that did not yet exist. For instance, I added an
+; "end-char-data-hook", which would be called at the end of char-data
+; fragment. This returns a function of type (seed -> seed) which is
+; invoked on the current seed only if read-char-data has indeed reached
+; the end of a block of char data (after reading a new token.
+; But the deepest interface that I needed to expose was that of reading
+; attributes. In the official distribution, this is not even a separate
+; function. Instead, it is embedded within SSAX:read-attributes. This
+; required some small re-structuring as well.
+; This definitely will not be to everyone's taste (nor needed by most).
+; Certainly, the existing make-parser interface addresses most custom
+; needs. And likely 80-90 lines of a "link specification" to create a
+; parser from many tiny little lego blocks may please only a few, while
+; appalling others.
+; The code is available at http://celtic.benderweb.net/ssax-lego.plt or
+; http://celtic.benderweb.net/ssax-lego.tar.gz
+; In the examples directory, I provide:
+; - a unit version of the make-parser interface,
+; - a simple SXML parser using that interface,
+; - an SXML parser which directly uses the "new lego",
+; - a pseudo-SXML parser, which records source location information
+; - and lastly a parser which returns the structures used in PLT's xml
+; collection, with source location information
+
+; $Id: SSAX.scm,v 5.1 2004/07/07 16:02:30 sperber Exp $
+;^^^^^^^^^
+
+
+ ; See the Makefile in the ../tests directory
+ ; (in particular, the rule vSSAX) for an example of how
+ ; to run this code on various Scheme systems.
+ ; See SSAX examples for many samples of using this code,
+ ; again, on a variety of Scheme systems.
+ ; See http://ssax.sf.net/
+
+
+; The following macro runs built-in test cases -- or does not run,
+; depending on which of the two cases below you commented out
+; Case 1: no tests:
+;(define-macro run-test (lambda body '(begin #f)))
+;(define-syntax run-test (syntax-rules () ((run-test . args) (begin #f))))
+
+; Case 2: with tests.
+; The following macro could've been defined just as
+; (define-macro run-test (lambda body `(begin (display "\n-->Test\n") ,@body)))
+;
+; Instead, it's more involved, to make up for case-insensitivity of
+; symbols on some Scheme systems. In Gambit, symbols are case
+; sensitive: (eq? 'A 'a) is #f and (eq? 'Aa (string->symbol "Aa")) is
+; #t. On some systems, symbols are case-insensitive and just the
+; opposite is true. Therefore, we introduce a notation '"ASymbol" (a
+; quoted string) that stands for a case-_sensitive_ ASymbol -- on any
+; R5RS Scheme system. This notation is valid only within the body of
+; run-test.
+; The notation is implemented by scanning the run-test's
+; body and replacing every occurrence of (quote "str") with the result
+; of (string->symbol "str"). We can do such a replacement at macro-expand
+; time (rather than at run time).
+
+; Here's the previous version of run-test, implemented as a low-level
+; macro.
+; (define-macro run-test
+; (lambda body
+; (define (re-write body)
+; (cond
+; ((vector? body)
+; (list->vector (re-write (vector->list body))))
+; ((not (pair? body)) body)
+; ((and (eq? 'quote (car body)) (pair? (cdr body))
+; (string? (cadr body)))
+; (string->symbol (cadr body)))
+; (else (cons (re-write (car body)) (re-write (cdr body))))))
+; (cons 'begin (re-write body))))
+;
+; For portability, it is re-written as syntax-rules. The syntax-rules
+; version is less powerful: for example, it can't handle
+; (case x (('"Foo") (do-on-Foo))) whereas the low-level macro
+; could correctly place a case-sensitive symbol at the right place.
+; We also do not scan vectors (because we don't use them here).
+; Twice-deep quasiquotes aren't handled either.
+; Still, the syntax-rules version satisfies our immediate needs.
+; Incidentally, I originally didn't believe that the macro below
+; was at all possible.
+;
+; The macro is written in a continuation-passing style. A continuation
+; typically has the following structure: (k-head ! . args)
+; When the continuation is invoked, we expand into
+; (k-head <computed-result> . arg). That is, the dedicated symbol !
+; is the placeholder for the result.
+;
+; It seems that the most modular way to write the run-test macro would
+; be the following
+;
+; (define-syntax run-test
+; (syntax-rules ()
+; ((run-test . ?body)
+; (letrec-syntax
+; ((scan-exp ; (scan-exp body k)
+; (syntax-rules (quote quasiquote !)
+; ((scan-exp (quote (hd . tl)) k)
+; (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k)))
+; ((scan-exp (quote x) (k-head ! . args))
+; (k-head
+; (if (string? (quote x)) (string->symbol (quote x)) (quote x))
+; . args))
+; ((scan-exp (hd . tl) k)
+; (scan-exp hd (do-tl ! scan-exp tl k)))
+; ((scan-exp x (k-head ! . args))
+; (k-head x . args))))
+; (do-tl
+; (syntax-rules (!)
+; ((do-tl processed-hd fn () (k-head ! . args))
+; (k-head (processed-hd) . args))
+; ((do-tl processed-hd fn old-tl k)
+; (fn old-tl (do-cons ! processed-hd k)))))
+; ...
+; (do-finish
+; (syntax-rules ()
+; ((do-finish (new-body)) new-body)
+; ((do-finish new-body) (begin . new-body))))
+; ...
+; (scan-exp ?body (do-finish !))
+; ))))
+;
+; Alas, that doesn't work on all systems. We hit yet another dark
+; corner of the R5RS macros. The reason is that run-test is used in
+; the code below to introduce definitions. For example:
+; (run-test
+; (define (ssax:warn port msg . other-msg)
+; (apply cerr (cons* nl "Warning: " msg other-msg)))
+; )
+; This code expands to
+; (begin
+; (define (ssax:warn port msg . other-msg) ...))
+; so the definition gets spliced in into the top level. Right?
+; Well, On Petite Chez Scheme it is so. However, many other systems
+; don't like this approach. The reason is that the invocation of
+; (run-test (define (ssax:warn port msg . other-msg) ...))
+; first expands into
+; (letrec-syntax (...)
+; (scan-exp ((define (ssax:warn port msg . other-msg) ...)) ...))
+; because of the presence of (letrec-syntax ...), the begin form that
+; is generated eventually is no longer at the top level! The begin
+; form in Scheme is an overloading of two distinct forms: top-level
+; begin and the other begin. The forms have different rules: for example,
+; (begin (define x 1)) is OK for a top-level begin but not OK for
+; the other begin. Some Scheme systems see the that the macro
+; (run-test ...) expands into (letrec-syntax ...) and decide right there
+; that any further (begin ...) forms are NOT top-level begin forms.
+; The only way out is to make sure all our macros are top-level.
+; The best approach <sigh> seems to be to make run-test one huge
+; top-level macro.
+
+
+(define-syntax run-test
+ (syntax-rules (define)
+ ((run-test "scan-exp" (define vars body))
+ (define vars (run-test "scan-exp" body)))
+ ((run-test "scan-exp" ?body)
+ (letrec-syntax
+ ((scan-exp ; (scan-exp body k)
+ (syntax-rules (quote quasiquote !)
+ ((scan-exp '() (k-head ! . args))
+ (k-head '() . args))
+ ((scan-exp (quote (hd . tl)) k)
+ (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k)))
+ ((scan-exp (quasiquote (hd . tl)) k)
+ (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k)))
+ ((scan-exp (quote x) (k-head ! . args))
+ (k-head
+ (if (string? (quote x)) (string->symbol (quote x)) (quote x))
+ . args))
+ ((scan-exp (hd . tl) k)
+ (scan-exp hd (do-tl ! scan-exp tl k)))
+ ((scan-exp x (k-head ! . args))
+ (k-head x . args))))
+ (do-tl
+ (syntax-rules (!)
+ ((do-tl processed-hd fn () (k-head ! . args))
+ (k-head (processed-hd) . args))
+ ((do-tl processed-hd fn old-tl k)
+ (fn old-tl (do-cons ! processed-hd k)))))
+ (do-cons
+ (syntax-rules (!)
+ ((do-cons processed-tl processed-hd (k-head ! . args))
+ (k-head (processed-hd . processed-tl) . args))))
+ (do-wrap
+ (syntax-rules (!)
+ ((do-wrap val fn (k-head ! . args))
+ (k-head (fn val) . args))))
+ (do-finish
+ (syntax-rules ()
+ ((do-finish new-body) new-body)))
+
+ (scan-lit-lst ; scan literal list
+ (syntax-rules (quote unquote unquote-splicing !)
+ ((scan-lit-lst '() (k-head ! . args))
+ (k-head '() . args))
+ ((scan-lit-lst (quote (hd . tl)) k)
+ (do-tl quote scan-lit-lst ((hd . tl)) k))
+ ((scan-lit-lst (unquote x) k)
+ (scan-exp x (do-wrap ! unquote k)))
+ ((scan-lit-lst (unquote-splicing x) k)
+ (scan-exp x (do-wrap ! unquote-splicing k)))
+ ((scan-lit-lst (quote x) (k-head ! . args))
+ (k-head
+ ,(if (string? (quote x)) (string->symbol (quote x)) (quote x))
+ . args))
+ ((scan-lit-lst (hd . tl) k)
+ (scan-lit-lst hd (do-tl ! scan-lit-lst tl k)))
+ ((scan-lit-lst x (k-head ! . args))
+ (k-head x . args))))
+ )
+ (scan-exp ?body (do-finish !))))
+ ((run-test body ...)
+ (begin
+ (run-test "scan-exp" body) ...))
+))
+
+;========================================================================
+; Data Types
+
+; TAG-KIND
+; a symbol 'START, 'END, 'PI, 'DECL, 'COMMENT, 'CDSECT
+; or 'ENTITY-REF that identifies a markup token
+
+; UNRES-NAME
+; a name (called GI in the XML Recommendation) as given in an xml
+; document for a markup token: start-tag, PI target, attribute name.
+; If a GI is an NCName, UNRES-NAME is this NCName converted into
+; a Scheme symbol. If a GI is a QName, UNRES-NAME is a pair of
+; symbols: (PREFIX . LOCALPART)
+
+; RES-NAME
+; An expanded name, a resolved version of an UNRES-NAME.
+; For an element or an attribute name with a non-empty namespace URI,
+; RES-NAME is a pair of symbols, (URI-SYMB . LOCALPART).
+; Otherwise, it's a single symbol.
+
+; ELEM-CONTENT-MODEL
+; A symbol:
+; ANY - anything goes, expect an END tag.
+; EMPTY-TAG - no content, and no END-tag is coming
+; EMPTY - no content, expect the END-tag as the next token
+; PCDATA - expect character data only, and no children elements
+; MIXED
+; ELEM-CONTENT
+
+; URI-SYMB
+; A symbol representing a namespace URI -- or other symbol chosen
+; by the user to represent URI. In the former case,
+; URI-SYMB is created by %-quoting of bad URI characters and
+; converting the resulting string into a symbol.
+
+; NAMESPACES
+; A list representing namespaces in effect. An element of the list
+; has one of the following forms:
+; (PREFIX URI-SYMB . URI-SYMB) or
+; (PREFIX USER-PREFIX . URI-SYMB)
+; USER-PREFIX is a symbol chosen by the user
+; to represent the URI.
+; (#f USER-PREFIX . URI-SYMB)
+; Specification of the user-chosen prefix and a URI-SYMBOL.
+; (*DEFAULT* USER-PREFIX . URI-SYMB)
+; Declaration of the default namespace
+; (*DEFAULT* #f . #f)
+; Un-declaration of the default namespace. This notation
+; represents overriding of the previous declaration
+; A NAMESPACES list may contain several elements for the same PREFIX.
+; The one closest to the beginning of the list takes effect.
+
+; ATTLIST
+; An ordered collection of (NAME . VALUE) pairs, where NAME is
+; a RES-NAME or an UNRES-NAME. The collection is an ADT
+
+; STR-HANDLER
+; A procedure of three arguments: STRING1 STRING2 SEED
+; returning a new SEED
+; The procedure is supposed to handle a chunk of character data
+; STRING1 followed by a chunk of character data STRING2.
+; STRING2 is a short string, often "\n" and even ""
+
+; ENTITIES
+; An assoc list of pairs:
+; (named-entity-name . named-entity-body)
+; where named-entity-name is a symbol under which the entity was
+; declared, named-entity-body is either a string, or
+; (for an external entity) a thunk that will return an
+; input port (from which the entity can be read).
+; named-entity-body may also be #f. This is an indication that a
+; named-entity-name is currently being expanded. A reference to
+; this named-entity-name will be an error: violation of the
+; WFC nonrecursion.
+
+; XML-TOKEN -- a record
+
+; In Gambit, you can use the following declaration:
+; (define-structure xml-token kind head)
+; The following declaration is "standard" as it follows SRFI-9:
+;;(define-record-type xml-token (make-xml-token kind head) xml-token?
+;; (kind xml-token-kind)
+;; (head xml-token-head) )
+; No field mutators are declared as SSAX is a pure functional parser
+;
+; But to make the code more portable, we define xml-token simply as
+; a pair. It suffices for us. Furthermore, xml-token-kind and xml-token-head
+; can be defined as simple procedures. However, they are declared as
+; macros below for efficiency.
+
+(define (make-xml-token kind head) (cons kind head))
+(define xml-token? pair?)
+(define-syntax xml-token-kind
+ (syntax-rules () ((xml-token-kind token) (car token))))
+(define-syntax xml-token-head
+ (syntax-rules () ((xml-token-head token) (cdr token))))
+
+; (define-macro xml-token-kind (lambda (token) `(car ,token)))
+; (define-macro xml-token-head (lambda (token) `(cdr ,token)))
+
+; This record represents a markup, which is, according to the XML
+; Recommendation, "takes the form of start-tags, end-tags, empty-element tags,
+; entity references, character references, comments, CDATA section delimiters,
+; document type declarations, and processing instructions."
+;
+; kind -- a TAG-KIND
+; head -- an UNRES-NAME. For xml-tokens of kinds 'COMMENT and
+; 'CDSECT, the head is #f
+;
+; For example,
+; <P> => kind='START, head='P
+; </P> => kind='END, head='P
+; <BR/> => kind='EMPTY-EL, head='BR
+; <!DOCTYPE OMF ...> => kind='DECL, head='DOCTYPE
+; <?xml version="1.0"?> => kind='PI, head='xml
+; &my-ent; => kind = 'ENTITY-REF, head='my-ent
+;
+; Character references are not represented by xml-tokens as these references
+; are transparently resolved into the corresponding characters.
+;
+
+
+
+; XML-DECL -- a record
+
+; The following is Gambit-specific, see below for a portable declaration
+;(define-structure xml-decl elems entities notations)
+
+; The record represents a datatype of an XML document: the list of
+; declared elements and their attributes, declared notations, list of
+; replacement strings or loading procedures for parsed general
+; entities, etc. Normally an xml-decl record is created from a DTD or
+; an XML Schema, although it can be created and filled in in many other
+; ways (e.g., loaded from a file).
+;
+; elems: an (assoc) list of decl-elem or #f. The latter instructs
+; the parser to do no validation of elements and attributes.
+;
+; decl-elem: declaration of one element:
+; (elem-name elem-content decl-attrs)
+; elem-name is an UNRES-NAME for the element.
+; elem-content is an ELEM-CONTENT-MODEL.
+; decl-attrs is an ATTLIST, of (ATTR-NAME . VALUE) associations
+; !!!This element can declare a user procedure to handle parsing of an
+; element (e.g., to do a custom validation, or to build a hash of
+; IDs as they're encountered).
+;
+; decl-attr: an element of an ATTLIST, declaration of one attribute
+; (attr-name content-type use-type default-value)
+; attr-name is an UNRES-NAME for the declared attribute
+; content-type is a symbol: CDATA, NMTOKEN, NMTOKENS, ...
+; or a list of strings for the enumerated type.
+; use-type is a symbol: REQUIRED, IMPLIED, FIXED
+; default-value is a string for the default value, or #f if not given.
+;
+;
+
+; see a function make-empty-xml-decl to make a XML declaration entry
+; suitable for a non-validating parsing.
+
+
+;-------------------------
+; Utilities
+
+; ssax:warn PORT MESSAGE SPECIALISING-MSG*
+; to notify the user about warnings that are NOT errors but still
+; may alert the user.
+; Result is unspecified.
+; We need to define the function to allow the self-tests to run.
+; Normally the definition of ssax:warn is to be provided by the user.
+(run-test
+ (define (ssax:warn port msg . other-msg)
+ (apply cerr (cons* nl "Warning: " msg other-msg)))
+)
+
+
+; parser-error PORT MESSAGE SPECIALISING-MSG*
+; to let the user know of a syntax error or a violation of a
+; well-formedness or validation constraint.
+; Result is unspecified.
+; We need to define the function to allow the self-tests to run.
+; Normally the definition of parser-error is to be provided by the user.
+(run-test
+ (define (parser-error port msg . specializing-msgs)
+ (apply error (cons msg specializing-msgs)))
+)
+
+; The following is a function that is often used in validation tests,
+; to make sure that the computed result matches the expected one.
+; This function is a standard equal? predicate with one exception.
+; On Scheme systems where (string->symbol "A") and a symbol A
+; are the same, equal_? is precisely equal?
+; On other Scheme systems, we compare symbols disregarding their case.
+; Since this function is used only in tests, we don't have to
+; strive to make it efficient.
+(run-test
+ (define (equal_? e1 e2)
+ (if (eq? 'A (string->symbol "A")) (equal? e1 e2)
+ (cond
+ ((symbol? e1)
+ (and (symbol? e2)
+ (string-ci=? (symbol->string e1) (symbol->string e2))))
+ ((pair? e1)
+ (and (pair? e2)
+ (equal_? (car e1) (car e2)) (equal_? (cdr e1) (cdr e2))))
+ ((vector? e1)
+ (and (vector? e2) (equal_? (vector->list e1) (vector->list e2))))
+ (else
+ (equal? e1 e2)))))
+)
+
+; The following function, which is often used in validation tests,
+; lets us conveniently enter newline, CR and tab characters in a character
+; string.
+; unesc-string: ESC-STRING -> STRING
+; where ESC-STRING is a character string that may contain
+; %n -- for #\newline
+; %r -- for #\return
+; %t -- for #\tab
+; %% -- for #\%
+;
+; The result of unesc-string is a character string with all %-combinations
+; above replaced with their character equivalents
+
+(run-test
+ (define (unesc-string str)
+ (call-with-input-string str
+ (lambda (port)
+ (let loop ((frags '()))
+ (let* ((token (next-token '() '(#\% *eof*) "unesc-string" port))
+ (cterm (read-char port))
+ (frags (cons token frags)))
+ (if (eof-object? cterm) (string-concatenate-reverse/shared frags)
+ (let ((cchar (read-char port))) ; char after #\%
+ (if (eof-object? cchar)
+ (error "unexpected EOF after reading % in unesc-string:" str)
+ (loop
+ (cons
+ (case cchar
+ ((#\n) (string #\newline))
+ ((#\r) (string char-return))
+ ((#\t) (string char-tab))
+ ((#\%) "%")
+ (else (error "bad %-char in unesc-string:" cchar)))
+ frags))))))))))
+)
+
+
+; Test if a string is made of only whitespace
+; An empty string is considered made of whitespace as well
+(define (string-whitespace? str)
+ (let ((len (string-length str)))
+ (cond
+ ((zero? len) #t)
+ ((= 1 len) (char-whitespace? (string-ref str 0)))
+ ((= 2 len) (and (char-whitespace? (string-ref str 0))
+ (char-whitespace? (string-ref str 1))))
+ (else
+ (let loop ((i 0))
+ (or (>= i len)
+ (and (char-whitespace? (string-ref str i))
+ (loop (inc i)))))))))
+
+; Find val in alist
+; Return (values found-el remaining-alist) or
+; (values #f alist)
+
+(define (assq-values val alist)
+ (let loop ((alist alist) (scanned '()))
+ (cond
+ ((null? alist) (values #f scanned))
+ ((equal? val (caar alist))
+ (values (car alist) (append scanned (cdr alist))))
+ (else
+ (loop (cdr alist) (cons (car alist) scanned))))))
+
+; From SRFI-1
+(define (fold-right kons knil lis1)
+ (let recur ((lis lis1))
+ (if (null? lis) knil
+ (let ((head (car lis)))
+ (kons head (recur (cdr lis)))))))
+
+; Left fold combinator for a single list
+(define (fold kons knil lis1)
+ (let lp ((lis lis1) (ans knil))
+ (if (null? lis) ans
+ (lp (cdr lis) (kons (car lis) ans)))))
+
+
+
+;========================================================================
+; Lower-level parsers and scanners
+;
+; They deal with primitive lexical units (Names, whitespaces, tags)
+; and with pieces of more generic productions. Most of these parsers
+; must be called in appropriate context. For example, ssax:complete-start-tag
+; must be called only when the start-tag has been detected and its GI
+; has been read.
+
+;------------------------------------------------------------------------
+; Low-level parsing code
+
+; Skip the S (whitespace) production as defined by
+; [3] S ::= (#x20 | #x9 | #xD | #xA)
+; The procedure returns the first not-whitespace character it
+; encounters while scanning the PORT. This character is left
+; on the input stream.
+
+(define ssax:S-chars (map ascii->char '(32 10 9 13)))
+
+(define (ssax:skip-S port)
+ (skip-while ssax:S-chars port))
+
+
+; Read a Name lexem and return it as string
+; [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
+; | CombiningChar | Extender
+; [5] Name ::= (Letter | '_' | ':') (NameChar)*
+;
+; This code supports the XML Namespace Recommendation REC-xml-names,
+; which modifies the above productions as follows:
+;
+; [4] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
+; | CombiningChar | Extender
+; [5] NCName ::= (Letter | '_') (NCNameChar)*
+; As the Rec-xml-names says,
+; "An XML document conforms to this specification if all other tokens
+; [other than element types and attribute names] in the document which
+; are required, for XML conformance, to match the XML production for
+; Name, match this specification's production for NCName."
+; Element types and attribute names must match the production QName,
+; defined below.
+
+; Check to see if a-char may start a NCName
+(define (ssax:ncname-starting-char? a-char)
+ (and (char? a-char)
+ (or
+ (char-alphabetic? a-char)
+ (char=? #\_ a-char))))
+
+
+; Read a NCName starting from the current position in the PORT and
+; return it as a symbol.
+(define (ssax:read-NCName port)
+ (let ((first-char (peek-char port)))
+ (or (ssax:ncname-starting-char? first-char)
+ (parser-error port "XMLNS [4] for '" first-char "'")))
+ (string->symbol
+ (next-token-of
+ (lambda (c)
+ (cond
+ ((eof-object? c) #f)
+ ((char-alphabetic? c) c)
+ ((string-index "0123456789.-_" c) c)
+ (else #f)))
+ port)))
+
+; Read a (namespace-) Qualified Name, QName, from the current
+; position in the PORT.
+; From REC-xml-names:
+; [6] QName ::= (Prefix ':')? LocalPart
+; [7] Prefix ::= NCName
+; [8] LocalPart ::= NCName
+; Return: an UNRES-NAME
+(define (ssax:read-QName port)
+ (let ((prefix-or-localpart (ssax:read-NCName port)))
+ (case (peek-char port)
+ ((#\:) ; prefix was given after all
+ (read-char port) ; consume the colon
+ (cons prefix-or-localpart (ssax:read-NCName port)))
+ (else prefix-or-localpart) ; Prefix was omitted
+ )))
+
+; The prefix of the pre-defined XML namespace
+(define ssax:Prefix-XML (string->symbol "xml"))
+
+(run-test
+ (assert (eq? '_
+ (call-with-input-string "_" ssax:read-NCName)))
+ (assert (eq? '_
+ (call-with-input-string "_" ssax:read-QName)))
+ (assert (eq? (string->symbol "_abc_")
+ (call-with-input-string "_abc_;" ssax:read-NCName)))
+ (assert (eq? (string->symbol "_abc_")
+ (call-with-input-string "_abc_;" ssax:read-QName)))
+ (assert (eq? (string->symbol "_a.b")
+ (call-with-input-string "_a.b " ssax:read-QName)))
+ (assert (equal? (cons (string->symbol "_a.b") (string->symbol "d.1-ef-"))
+ (call-with-input-string "_a.b:d.1-ef-;" ssax:read-QName)))
+ (assert (equal? (cons (string->symbol "a") (string->symbol "b"))
+ (call-with-input-string "a:b:c" ssax:read-QName)))
+
+ (assert (failed? (call-with-input-string ":abc" ssax:read-NCName)))
+ (assert (failed? (call-with-input-string "1:bc" ssax:read-NCName)))
+)
+
+; Compare one RES-NAME or an UNRES-NAME with the other.
+; Return a symbol '<, '>, or '= depending on the result of
+; the comparison.
+; Names without PREFIX are always smaller than those with the PREFIX.
+(define name-compare
+ (letrec ((symbol-compare
+ (lambda (symb1 symb2)
+ (cond
+ ((eq? symb1 symb2) '=)
+ ((string<? (symbol->string symb1) (symbol->string symb2))
+ '<)
+ (else '>)))))
+ (lambda (name1 name2)
+ (cond
+ ((symbol? name1) (if (symbol? name2) (symbol-compare name1 name2)
+ '<))
+ ((symbol? name2) '>)
+ ((eq? name2 ssax:largest-unres-name) '<)
+ ((eq? name1 ssax:largest-unres-name) '>)
+ ((eq? (car name1) (car name2)) ; prefixes the same
+ (symbol-compare (cdr name1) (cdr name2)))
+ (else (symbol-compare (car name1) (car name2)))))))
+
+; An UNRES-NAME that is postulated to be larger than anything that can occur in
+; a well-formed XML document.
+; name-compare enforces this postulate.
+(define ssax:largest-unres-name (cons
+ (string->symbol "#LARGEST-SYMBOL")
+ (string->symbol "#LARGEST-SYMBOL")))
+
+(run-test
+ (assert (eq? '= (name-compare 'ABC 'ABC)))
+ (assert (eq? '< (name-compare 'ABC 'ABCD)))
+ (assert (eq? '> (name-compare 'XB 'ABCD)))
+ (assert (eq? '> (name-compare '(HTML . PRE) 'PRE)))
+ (assert (eq? '< (name-compare 'HTML '(HTML . PRE))))
+ (assert (eq? '= (name-compare '(HTML . PRE) '(HTML . PRE))))
+ (assert (eq? '< (name-compare '(HTML . PRE) '(XML . PRE))))
+ (assert (eq? '> (name-compare '(HTML . PRE) '(HTML . P))))
+ (assert (eq? '< (name-compare '(HTML . PRE) ssax:largest-unres-name)))
+ (assert (eq? '< (name-compare '(ZZZZ . ZZZ) ssax:largest-unres-name)))
+ (assert (eq? '> (name-compare ssax:largest-unres-name '(ZZZZ . ZZZ) )))
+)
+
+
+
+; procedure: ssax:read-markup-token PORT
+; This procedure starts parsing of a markup token. The current position
+; in the stream must be #\<. This procedure scans enough of the input stream
+; to figure out what kind of a markup token it is seeing. The procedure returns
+; an xml-token structure describing the token. Note, generally reading
+; of the current markup is not finished! In particular, no attributes of
+; the start-tag token are scanned.
+;
+; Here's a detailed break out of the return values and the position in the PORT
+; when that particular value is returned:
+; PI-token: only PI-target is read.
+; To finish the Processing Instruction and disregard it,
+; call ssax:skip-pi. ssax:read-attributes may be useful
+; as well (for PIs whose content is attribute-value
+; pairs)
+; END-token: The end tag is read completely; the current position
+; is right after the terminating #\> character.
+; COMMENT is read and skipped completely. The current position
+; is right after "-->" that terminates the comment.
+; CDSECT The current position is right after "<!CDATA["
+; Use ssax:read-cdata-body to read the rest.
+; DECL We have read the keyword (the one that follows "<!")
+; identifying this declaration markup. The current
+; position is after the keyword (usually a
+; whitespace character)
+;
+; START-token We have read the keyword (GI) of this start tag.
+; No attributes are scanned yet. We don't know if this
+; tag has an empty content either.
+; Use ssax:complete-start-tag to finish parsing of
+; the token.
+
+(define ssax:read-markup-token ; procedure ssax:read-markup-token port
+ (let ()
+ ; we have read "<!-". Skip through the rest of the comment
+ ; Return the 'COMMENT token as an indication we saw a comment
+ ; and skipped it.
+ (define (skip-comment port)
+ (assert-curr-char '(#\-) "XML [15], second dash" port)
+ (if (not (find-string-from-port? "-->" port))
+ (parser-error port "XML [15], no -->"))
+ (make-xml-token 'COMMENT #f))
+
+ ; we have read "<![" that must begin a CDATA section
+ (define (read-cdata port)
+ (assert (string=? "CDATA[" (read-string 6 port)))
+ (make-xml-token 'CDSECT #f))
+
+ (lambda (port)
+ (assert-curr-char '(#\<) "start of the token" port)
+ (case (peek-char port)
+ ((#\/) (read-char port)
+ (begin0 (make-xml-token 'END (ssax:read-QName port))
+ (ssax:skip-S port)
+ (assert-curr-char '(#\>) "XML [42]" port)))
+ ((#\?) (read-char port) (make-xml-token 'PI (ssax:read-NCName port)))
+ ((#\!)
+ (case (peek-next-char port)
+ ((#\-) (read-char port) (skip-comment port))
+ ((#\[) (read-char port) (read-cdata port))
+ (else (make-xml-token 'DECL (ssax:read-NCName port)))))
+ (else (make-xml-token 'START (ssax:read-QName port)))))
+))
+
+
+; The current position is inside a PI. Skip till the rest of the PI
+(define (ssax:skip-pi port)
+ (if (not (find-string-from-port? "?>" port))
+ (parser-error port "Failed to find ?> terminating the PI")))
+
+
+; The current position is right after reading the PITarget. We read the
+; body of PI and return is as a string. The port will point to the
+; character right after '?>' combination that terminates PI.
+; [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
+
+(define (ssax:read-pi-body-as-string port)
+ (ssax:skip-S port) ; skip WS after the PI target name
+ (string-concatenate/shared
+ (let loop ()
+ (let ((pi-fragment
+ (next-token '() '(#\?) "reading PI content" port)))
+ (if (eqv? #\> (peek-next-char port))
+ (begin
+ (read-char port)
+ (cons pi-fragment '()))
+ (cons* pi-fragment "?" (loop)))))))
+
+(run-test
+ (assert (equal? "p1 content "
+ (call-with-input-string "<?pi1 p1 content ?>"
+ (lambda (port)
+ (ssax:read-markup-token port)
+ (ssax:read-pi-body-as-string port)))))
+ (assert (equal? "pi2? content? ?"
+ (call-with-input-string "<?pi2 pi2? content? ??>"
+ (lambda (port)
+ (ssax:read-markup-token port)
+ (ssax:read-pi-body-as-string port)))))
+)
+
+;(define (ssax:read-pi-body-as-name-values port)
+
+; The current pos in the port is inside an internal DTD subset
+; (e.g., after reading #\[ that begins an internal DTD subset)
+; Skip until the "]>" combination that terminates this DTD
+(define (ssax:skip-internal-dtd port)
+ (if (not (find-string-from-port? "]>" port))
+ (parser-error port
+ "Failed to find ]> terminating the internal DTD subset")))
+
+
+; procedure+: ssax:read-cdata-body PORT STR-HANDLER SEED
+;
+; This procedure must be called after we have read a string "<![CDATA["
+; that begins a CDATA section. The current position must be the first
+; position of the CDATA body. This function reads _lines_ of the CDATA
+; body and passes them to a STR-HANDLER, a character data consumer.
+;
+; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED.
+; The first STRING1 argument to STR-HANDLER never contains a newline.
+; The second STRING2 argument often will. On the first invocation of
+; the STR-HANDLER, the seed is the one passed to ssax:read-cdata-body
+; as the third argument. The result of this first invocation will be
+; passed as the seed argument to the second invocation of the line
+; consumer, and so on. The result of the last invocation of the
+; STR-HANDLER is returned by the ssax:read-cdata-body. Note a
+; similarity to the fundamental 'fold' iterator.
+;
+; Within a CDATA section all characters are taken at their face value,
+; with only three exceptions:
+; CR, LF, and CRLF are treated as line delimiters, and passed
+; as a single #\newline to the STR-HANDLER
+; "]]>" combination is the end of the CDATA section.
+; &gt; is treated as an embedded #\> character
+; Note, &lt; and &amp; are not specially recognized (and are not expanded)!
+
+(define ssax:read-cdata-body
+ (let ((cdata-delimiters (list char-return #\newline #\] #\&)))
+
+ (lambda (port str-handler seed)
+ (let loop ((seed seed))
+ (let ((fragment (next-token '() cdata-delimiters
+ "reading CDATA" port)))
+ ; that is, we're reading the char after the 'fragment'
+ (case (read-char port)
+ ((#\newline) (loop (str-handler fragment nl seed)))
+ ((#\])
+ (if (not (eqv? (peek-char port) #\]))
+ (loop (str-handler fragment "]" seed))
+ (let check-after-second-braket
+ ((seed (if (string-null? fragment) seed
+ (str-handler fragment "" seed))))
+ (case (peek-next-char port) ; after the second bracket
+ ((#\>) (read-char port) seed) ; we have read "]]>"
+ ((#\]) (check-after-second-braket
+ (str-handler "]" "" seed)))
+ (else (loop (str-handler "]]" "" seed)))))))
+ ((#\&) ; Note that #\& within CDATA may stand for itself
+ (let ((ent-ref ; it does not have to start an entity ref
+ (next-token-of (lambda (c)
+ (and (not (eof-object? c)) (char-alphabetic? c) c)) port)))
+ (cond ; "&gt;" is to be replaced with #\>
+ ((and (string=? "gt" ent-ref) (eqv? (peek-char port) #\;))
+ (read-char port)
+ (loop (str-handler fragment ">" seed)))
+ (else
+ (loop
+ (str-handler ent-ref ""
+ (str-handler fragment "&" seed)))))))
+ (else ; Must be CR: if the next char is #\newline, skip it
+ (if (eqv? (peek-char port) #\newline) (read-char port))
+ (loop (str-handler fragment nl seed)))
+ ))))))
+
+; a few lines of validation code
+(run-test (letrec
+ ((consumer (lambda (fragment foll-fragment seed)
+ (cons* (if (equal? foll-fragment (string #\newline))
+ " NL" foll-fragment) fragment seed)))
+ (test (lambda (str expected-result)
+ (newline) (display "body: ") (write str)
+ (newline) (display "Result: ")
+ (let ((result
+ (reverse
+ (call-with-input-string (unesc-string str)
+ (lambda (port) (ssax:read-cdata-body port consumer '()))
+ ))))
+ (write result)
+ (assert (equal? result expected-result)))))
+ )
+ (test "]]>" '())
+ (test "abcd]]>" '("abcd" ""))
+ (test "abcd]]]>" '("abcd" "" "]" ""))
+ (test "abcd]]]]>" '("abcd" "" "]" "" "]" ""))
+ (test "abcd]]]]]>" '("abcd" "" "]" "" "]" "" "]" ""))
+ (test "abcd]]]a]]>" '("abcd" "" "]" "" "]]" "" "a" ""))
+ (test "abc%r%ndef%n]]>" '("abc" " NL" "def" " NL"))
+ (test "%r%n%r%n]]>" '("" " NL" "" " NL"))
+ (test "%r%n%r%na]]>" '("" " NL" "" " NL" "a" ""))
+ (test "%r%r%r%na]]>" '("" " NL" "" " NL" "" " NL" "a" ""))
+ (test "abc&!!!]]>" '("abc" "&" "" "" "!!!" ""))
+ (test "abc]]&gt;&gt&amp;]]]&gt;and]]>"
+ '("abc" "" "]]" "" "" ">" "" "&" "gt" "" "" "&" "amp" "" ";" "" "]" ""
+ "]]" "" "" ">" "and" ""))
+))
+
+
+; procedure+: ssax:read-char-ref PORT
+;
+; [66] CharRef ::= '&#' [0-9]+ ';'
+; | '&#x' [0-9a-fA-F]+ ';'
+;
+; This procedure must be called after we we have read "&#"
+; that introduces a char reference.
+; The procedure reads this reference and returns the corresponding char
+; The current position in PORT will be after ";" that terminates
+; the char reference
+; Faults detected:
+; WFC: XML-Spec.html#wf-Legalchar
+;
+; According to Section "4.1 Character and Entity References"
+; of the XML Recommendation:
+; "[Definition: A character reference refers to a specific character
+; in the ISO/IEC 10646 character set, for example one not directly
+; accessible from available input devices.]"
+; Therefore, we use a ucscode->string function to convert a character
+; code into the character -- *regardless* of the current character
+; encoding of the input stream.
+
+(define (ssax:read-char-ref port)
+ (let* ((base
+ (cond ((eqv? (peek-char port) #\x) (read-char port) 16)
+ (else 10)))
+ (name (next-token '() '(#\;) "XML [66]" port))
+ (char-code (string->number name base)))
+ (read-char port) ; read the terminating #\; char
+ (if (integer? char-code) (ucscode->string char-code)
+ (parser-error port "[wf-Legalchar] broken for '" name "'"))))
+
+
+; procedure+: ssax:handle-parsed-entity PORT NAME ENTITIES
+; CONTENT-HANDLER STR-HANDLER SEED
+;
+; Expand and handle a parsed-entity reference
+; port - a PORT
+; name - the name of the parsed entity to expand, a symbol
+; entities - see ENTITIES
+; content-handler -- procedure PORT ENTITIES SEED
+; that is supposed to return a SEED
+; str-handler - a STR-HANDLER. It is called if the entity in question
+; turns out to be a pre-declared entity
+;
+; The result is the one returned by CONTENT-HANDLER or STR-HANDLER
+; Faults detected:
+; WFC: XML-Spec.html#wf-entdeclared
+; WFC: XML-Spec.html#norecursion
+
+(define ssax:predefined-parsed-entities
+ `(
+ (,(string->symbol "amp") . "&")
+ (,(string->symbol "lt") . "<")
+ (,(string->symbol "gt") . ">")
+ (,(string->symbol "apos") . "'")
+ (,(string->symbol "quot") . "\"")))
+
+(define (ssax:handle-parsed-entity port name entities
+ content-handler str-handler seed)
+ (cond ; First we check the list of the declared entities
+ ((assq name entities) =>
+ (lambda (decl-entity)
+ (let ((ent-body (cdr decl-entity)) ; mark the list to prevent recursion
+ (new-entities (cons (cons name #f) entities)))
+ (cond
+ ((string? ent-body)
+ (call-with-input-string ent-body
+ (lambda (port) (content-handler port new-entities seed))))
+ ((procedure? ent-body)
+ (let ((port (ent-body)))
+ (begin0
+ (content-handler port new-entities seed)
+ (close-input-port port))))
+ (else
+ (parser-error port "[norecursion] broken for " name))))))
+ ((assq name ssax:predefined-parsed-entities)
+ => (lambda (decl-entity)
+ (str-handler (cdr decl-entity) "" seed)))
+ (else (parser-error port "[wf-entdeclared] broken for " name))))
+
+
+
+; The ATTLIST Abstract Data Type
+; Currently is implemented as an assoc list sorted in the ascending
+; order of NAMES.
+
+(define (make-empty-attlist) '())
+
+; Add a name-value pair to the existing attlist preserving the order
+; Return the new list, in the sorted ascending order.
+; Return #f if a pair with the same name already exists in the attlist
+
+(define (attlist-add attlist name-value)
+ (if (null? attlist) (cons name-value attlist)
+ (case (name-compare (car name-value) (caar attlist))
+ ((=) #f)
+ ((<) (cons name-value attlist))
+ (else (cons (car attlist) (attlist-add (cdr attlist) name-value)))
+ )))
+
+(define attlist-null? null?)
+
+; Given an non-null attlist, return a pair of values: the top and the rest
+(define (attlist-remove-top attlist)
+ (values (car attlist) (cdr attlist)))
+
+(define (attlist->alist attlist) attlist)
+(define attlist-fold fold)
+
+; procedure+: ssax:read-attributes PORT ENTITIES
+;
+; This procedure reads and parses a production Attribute*
+; [41] Attribute ::= Name Eq AttValue
+; [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
+; | "'" ([^<&'] | Reference)* "'"
+; [25] Eq ::= S? '=' S?
+;
+;
+; The procedure returns an ATTLIST, of Name (as UNRES-NAME), Value (as string)
+; pairs. The current character on the PORT is a non-whitespace character
+; that is not an ncname-starting character.
+;
+; Note the following rules to keep in mind when reading an 'AttValue'
+; "Before the value of an attribute is passed to the application
+; or checked for validity, the XML processor must normalize it as follows:
+; - a character reference is processed by appending the referenced
+; character to the attribute value
+; - an entity reference is processed by recursively processing the
+; replacement text of the entity [see ENTITIES]
+; [named entities amp lt gt quot apos are assumed pre-declared]
+; - a whitespace character (#x20, #xD, #xA, #x9) is processed by appending #x20
+; to the normalized value, except that only a single #x20 is appended for a
+; "#xD#xA" sequence that is part of an external parsed entity or the
+; literal entity value of an internal parsed entity
+; - other characters are processed by appending them to the normalized value "
+;
+;
+; Faults detected:
+; WFC: XML-Spec.html#CleanAttrVals
+; WFC: XML-Spec.html#uniqattspec
+
+(define ssax:read-attributes ; ssax:read-attributes port entities
+ (let ((value-delimeters (append ssax:S-chars '(#\< #\&))))
+ ; Read the AttValue from the PORT up to the delimiter
+ ; (which can be a single or double-quote character,
+ ; or even a symbol *eof*)
+ ; 'prev-fragments' is the list of string fragments, accumulated
+ ; so far, in reverse order.
+ ; Return the list of fragments with newly read fragments
+ ; prepended.
+ (define (read-attrib-value delimiter port entities prev-fragments)
+ (let* ((new-fragments
+ (cons (next-token '() (cons delimiter value-delimeters)
+ "XML [10]" port)
+ prev-fragments))
+ (cterm (read-char port)))
+ (cond
+ ((or (eof-object? cterm) (eqv? cterm delimiter))
+ new-fragments)
+ ((eqv? cterm char-return) ; treat a CR and CRLF as a LF
+ (if (eqv? (peek-char port) #\newline) (read-char port))
+ (read-attrib-value delimiter port entities
+ (cons " " new-fragments)))
+ ((memv cterm ssax:S-chars)
+ (read-attrib-value delimiter port entities
+ (cons " " new-fragments)))
+ ((eqv? cterm #\&)
+ (cond
+ ((eqv? (peek-char port) #\#)
+ (read-char port)
+ (read-attrib-value delimiter port entities
+ (cons (ssax:read-char-ref port) new-fragments)))
+ (else
+ (read-attrib-value delimiter port entities
+ (read-named-entity port entities new-fragments)))))
+ (else (parser-error port "[CleanAttrVals] broken")))))
+
+ ; we have read "&" that introduces a named entity reference.
+ ; read this reference and return the result of
+ ; normalizing of the corresponding string
+ ; (that is, read-attrib-value is applied to the replacement
+ ; text of the entity)
+ ; The current position will be after ";" that terminates
+ ; the entity reference
+ (define (read-named-entity port entities fragments)
+ (let ((name (ssax:read-NCName port)))
+ (assert-curr-char '(#\;) "XML [68]" port)
+ (ssax:handle-parsed-entity port name entities
+ (lambda (port entities fragments)
+ (read-attrib-value '*eof* port entities fragments))
+ (lambda (str1 str2 fragments)
+ (if (equal? "" str2) (cons str1 fragments)
+ (cons* str2 str1 fragments)))
+ fragments)))
+
+ (lambda (port entities)
+ (let loop ((attr-list (make-empty-attlist)))
+ (if (not (ssax:ncname-starting-char? (ssax:skip-S port))) attr-list
+ (let ((name (ssax:read-QName port)))
+ (ssax:skip-S port)
+ (assert-curr-char '(#\=) "XML [25]" port)
+ (ssax:skip-S port)
+ (let ((delimiter
+ (assert-curr-char '(#\' #\" ) "XML [10]" port)))
+ (loop
+ (or (attlist-add attr-list
+ (cons name
+ (string-concatenate-reverse/shared
+ (read-attrib-value delimiter port entities
+ '()))))
+ (parser-error port "[uniqattspec] broken for " name))))))))
+))
+
+; a few lines of validation code
+(run-test (letrec
+ ((test (lambda (str decl-entities expected-res)
+ (newline) (display "input: ") (write str)
+ (newline) (display "Result: ")
+ (let ((result
+ (call-with-input-string (unesc-string str)
+ (lambda (port)
+ (ssax:read-attributes port decl-entities)))))
+ (write result) (newline)
+ (assert (equal? result expected-res))))))
+ (test "" '() '())
+ (test "href='http://a%tb%r%n%r%n%nc'" '()
+ `((,(string->symbol "href") . "http://a b c")))
+ (test "href='http://a%tb%r%r%n%rc'" '()
+ `((,(string->symbol "href") . "http://a b c")))
+ (test "_1 ='12&amp;' _2= \"%r%n%t12&#10;3\">" '()
+ `((_1 . "12&") (_2 . ,(unesc-string " 12%n3"))))
+ (test "%tAbc='&lt;&amp;&gt;&#x0A;'%nNext='12&ent;34' />"
+ '((ent . "&lt;xx&gt;"))
+ `((,(string->symbol "Abc") . ,(unesc-string "<&>%n"))
+ (,(string->symbol "Next") . "12<xx>34")))
+ (test "%tAbc='&lt;&amp;&gt;&#x0d;'%nNext='12&ent;34' />"
+ '((ent . "&lt;xx&gt;"))
+ `((,(string->symbol "Abc") . ,(unesc-string "<&>%r"))
+ (,(string->symbol "Next") . "12<xx>34")))
+ (test "%tAbc='&lt;&amp;&gt;&#x0A;'%nNext='12&en;34' />"
+ `((en . ,(lambda () (open-input-string "&quot;xx&apos;"))))
+ `((,(string->symbol "Abc") . ,(unesc-string "<&>%n"))
+ (,(string->symbol "Next") . "12\"xx'34")))
+ (test "%tAbc='&lt;&amp;&gt;&#x0A;'%nNext='12&ent;34' />"
+ '((ent . "&lt;&ent1;T;&gt;") (ent1 . "&amp;"))
+ `((,(string->symbol "Abc") . ,(unesc-string "<&>%n"))
+ (,(string->symbol "Next") . "12<&T;>34")))
+ (assert (failed?
+ (test "%tAbc='&lt;&amp;&gt;&#x0A;'%nNext='12&ent;34' />"
+ '((ent . "<&ent1;T;&gt;") (ent1 . "&amp;")) '())))
+ (assert (failed?
+ (test "%tAbc='&lt;&amp;&gt;&#x0A;'%nNext='12&ent;34' />"
+ '((ent . "&lt;&ent;T;&gt;") (ent1 . "&amp;")) '())))
+ (assert (failed?
+ (test "%tAbc='&lt;&amp;&gt;&#x0A;'%nNext='12&ent;34' />"
+ '((ent . "&lt;&ent1;T;&gt;") (ent1 . "&ent;")) '())))
+ (test "html:href='http://a%tb%r%n%r%n%nc'" '()
+ `(((,(string->symbol "html") . ,(string->symbol "href"))
+ . "http://a b c")))
+ (test "html:href='ref1' html:src='ref2'" '()
+ `(((,(string->symbol "html") . ,(string->symbol "href"))
+ . "ref1")
+ ((,(string->symbol "html") . ,(string->symbol "src"))
+ . "ref2")))
+ (test "html:href='ref1' xml:html='ref2'" '()
+ `(((,(string->symbol "html") . ,(string->symbol "href"))
+ . "ref1")
+ ((,ssax:Prefix-XML . ,(string->symbol "html"))
+ . "ref2")))
+ (assert (failed? (test "html:href='ref1' html:href='ref2'" '() '())))
+ (assert (failed? (test "html:href='<' html:href='ref2'" '() '())))
+ (assert (failed? (test "html:href='ref1' html:href='&ref2;'" '() '())))
+))
+
+; ssax:resolve-name PORT UNRES-NAME NAMESPACES apply-default-ns?
+;
+; Convert an UNRES-NAME to a RES-NAME given the appropriate NAMESPACES
+; declarations.
+; the last parameter apply-default-ns? determines if the default
+; namespace applies (for instance, it does not for attribute names)
+;
+; Per REC-xml-names/#nsc-NSDeclared, "xml" prefix is considered pre-declared
+; and bound to the namespace name "http://www.w3.org/XML/1998/namespace".
+;
+; This procedure tests for the namespace constraints:
+; http://www.w3.org/TR/REC-xml-names/#nsc-NSDeclared
+
+(define (ssax:resolve-name port unres-name namespaces apply-default-ns?)
+ (cond
+ ((pair? unres-name) ; it's a QNAME
+ (cons
+ (cond
+ ((assq (car unres-name) namespaces) => cadr)
+ ((eq? (car unres-name) ssax:Prefix-XML) ssax:Prefix-XML)
+ (else
+ (parser-error port "[nsc-NSDeclared] broken; prefix " (car unres-name))))
+ (cdr unres-name)))
+ (apply-default-ns? ; Do apply the default namespace, if any
+ (let ((default-ns (assq '*DEFAULT* namespaces)))
+ (if (and default-ns (cadr default-ns))
+ (cons (cadr default-ns) unres-name)
+ unres-name))) ; no default namespace declared
+ (else unres-name))) ; no prefix, don't apply the default-ns
+
+
+(run-test
+ (let* ((namespaces
+ '((HTML UHTML . URN-HTML)
+ (HTML UHTML-1 . URN-HTML)
+ (A UHTML . URN-HTML)))
+ (namespaces-def
+ (cons
+ '(*DEFAULT* DEF . URN-DEF) namespaces))
+ (namespaces-undef
+ (cons
+ '(*DEFAULT* #f . #f) namespaces-def))
+ (port (current-input-port)))
+
+ (assert (equal? 'ABC
+ (ssax:resolve-name port 'ABC namespaces #t)))
+ (assert (equal? '(DEF . ABC)
+ (ssax:resolve-name port 'ABC namespaces-def #t)))
+ (assert (equal? 'ABC
+ (ssax:resolve-name port 'ABC namespaces-def #f)))
+ (assert (equal? 'ABC
+ (ssax:resolve-name port 'ABC namespaces-undef #t)))
+ (assert (equal? '(UHTML . ABC)
+ (ssax:resolve-name port '(HTML . ABC) namespaces-def #t)))
+ (assert (equal? '(UHTML . ABC)
+ (ssax:resolve-name port '(HTML . ABC) namespaces-def #f)))
+ (assert (equal? `(,ssax:Prefix-XML . space)
+ (ssax:resolve-name port
+ `(,(string->symbol "xml") . space) namespaces-def #f)))
+ (assert (failed?
+ (ssax:resolve-name port '(XXX . ABC) namespaces-def #f)))
+))
+
+
+; procedure+: ssax:uri-string->symbol URI-STR
+; Convert a URI-STR to an appropriate symbol
+(define (ssax:uri-string->symbol uri-str)
+ (string->symbol uri-str))
+
+; procedure+: ssax:complete-start-tag TAG PORT ELEMS ENTITIES NAMESPACES
+;
+; This procedure is to complete parsing of a start-tag markup. The
+; procedure must be called after the start tag token has been
+; read. TAG is an UNRES-NAME. ELEMS is an instance of xml-decl::elems;
+; it can be #f to tell the function to do _no_ validation of elements
+; and their attributes.
+;
+; This procedure returns several values:
+; ELEM-GI: a RES-NAME.
+; ATTRIBUTES: element's attributes, an ATTLIST of (RES-NAME . STRING)
+; pairs. The list does NOT include xmlns attributes.
+; NAMESPACES: the input list of namespaces amended with namespace
+; (re-)declarations contained within the start-tag under parsing
+; ELEM-CONTENT-MODEL
+
+; On exit, the current position in PORT will be the first character after
+; #\> that terminates the start-tag markup.
+;
+; Faults detected:
+; VC: XML-Spec.html#enum
+; VC: XML-Spec.html#RequiredAttr
+; VC: XML-Spec.html#FixedAttr
+; VC: XML-Spec.html#ValueType
+; WFC: XML-Spec.html#uniqattspec (after namespaces prefixes are resolved)
+; VC: XML-Spec.html#elementvalid
+; WFC: REC-xml-names/#dt-NSName
+
+; Note, although XML Recommendation does not explicitly say it,
+; xmlns and xmlns: attributes don't have to be declared (although they
+; can be declared, to specify their default value)
+
+; Procedure: ssax:complete-start-tag tag-head port elems entities namespaces
+(define ssax:complete-start-tag
+
+ (let ((xmlns (string->symbol "xmlns"))
+ (largest-dummy-decl-attr (list ssax:largest-unres-name #f #f #f)))
+
+ ; Scan through the attlist and validate it, against decl-attrs
+ ; Return an assoc list with added fixed or implied attrs.
+ ; Note that both attlist and decl-attrs are ATTLISTs, and therefore,
+ ; sorted
+ (define (validate-attrs port attlist decl-attrs)
+
+ ; Check to see decl-attr is not of use type REQUIRED. Add
+ ; the association with the default value, if any declared
+ (define (add-default-decl decl-attr result)
+ (let*-values
+ (((attr-name content-type use-type default-value)
+ (apply values decl-attr)))
+ (and (eq? use-type 'REQUIRED)
+ (parser-error port "[RequiredAttr] broken for" attr-name))
+ (if default-value
+ (cons (cons attr-name default-value) result)
+ result)))
+
+ (let loop ((attlist attlist) (decl-attrs decl-attrs) (result '()))
+ (if (attlist-null? attlist)
+ (attlist-fold add-default-decl result decl-attrs)
+ (let*-values
+ (((attr attr-others)
+ (attlist-remove-top attlist))
+ ((decl-attr other-decls)
+ (if (attlist-null? decl-attrs)
+ (values largest-dummy-decl-attr decl-attrs)
+ (attlist-remove-top decl-attrs)))
+ )
+ (case (name-compare (car attr) (car decl-attr))
+ ((<)
+ (if (or (eq? xmlns (car attr))
+ (and (pair? (car attr)) (eq? xmlns (caar attr))))
+ (loop attr-others decl-attrs (cons attr result))
+ (parser-error port "[ValueType] broken for " attr)))
+ ((>)
+ (loop attlist other-decls
+ (add-default-decl decl-attr result)))
+ (else ; matched occurrence of an attr with its declaration
+ (let*-values
+ (((attr-name content-type use-type default-value)
+ (apply values decl-attr)))
+ ; Run some tests on the content of the attribute
+ (cond
+ ((eq? use-type 'FIXED)
+ (or (equal? (cdr attr) default-value)
+ (parser-error port "[FixedAttr] broken for " attr-name)))
+ ((eq? content-type 'CDATA) #t) ; everything goes
+ ((pair? content-type)
+ (or (member (cdr attr) content-type)
+ (parser-error port "[enum] broken for " attr-name "="
+ (cdr attr))))
+ (else
+ (ssax:warn port "declared content type " content-type
+ " not verified yet")))
+ (loop attr-others other-decls (cons attr result)))))
+ ))))
+
+
+ ; Add a new namespace declaration to namespaces.
+ ; First we convert the uri-str to a uri-symbol and search namespaces for
+ ; an association (_ user-prefix . uri-symbol).
+ ; If found, we return the argument namespaces with an association
+ ; (prefix user-prefix . uri-symbol) prepended.
+ ; Otherwise, we prepend (prefix uri-symbol . uri-symbol)
+ (define (add-ns port prefix uri-str namespaces)
+ (and (equal? "" uri-str)
+ (parser-error port "[dt-NSName] broken for " prefix))
+ (let ((uri-symbol (ssax:uri-string->symbol uri-str)))
+ (let loop ((nss namespaces))
+ (cond
+ ((null? nss)
+ (cons (cons* prefix uri-symbol uri-symbol) namespaces))
+ ((eq? uri-symbol (cddar nss))
+ (cons (cons* prefix (cadar nss) uri-symbol) namespaces))
+ (else (loop (cdr nss)))))))
+
+ ; partition attrs into proper attrs and new namespace declarations
+ ; return two values: proper attrs and the updated namespace declarations
+ (define (adjust-namespace-decl port attrs namespaces)
+ (let loop ((attrs attrs) (proper-attrs '()) (namespaces namespaces))
+ (cond
+ ((null? attrs) (values proper-attrs namespaces))
+ ((eq? xmlns (caar attrs)) ; re-decl of the default namespace
+ (loop (cdr attrs) proper-attrs
+ (if (equal? "" (cdar attrs)) ; un-decl of the default ns
+ (cons (cons* '*DEFAULT* #f #f) namespaces)
+ (add-ns port '*DEFAULT* (cdar attrs) namespaces))))
+ ((and (pair? (caar attrs)) (eq? xmlns (caaar attrs)))
+ (loop (cdr attrs) proper-attrs
+ (add-ns port (cdaar attrs) (cdar attrs) namespaces)))
+ (else
+ (loop (cdr attrs) (cons (car attrs) proper-attrs) namespaces)))))
+
+ ; The body of the function
+ (lambda (tag-head port elems entities namespaces)
+ (let*-values
+ (((attlist) (ssax:read-attributes port entities))
+ ((empty-el-tag?)
+ (begin
+ (ssax:skip-S port)
+ (and
+ (eqv? #\/
+ (assert-curr-char '(#\> #\/) "XML [40], XML [44], no '>'" port))
+ (assert-curr-char '(#\>) "XML [44], no '>'" port))))
+ ((elem-content decl-attrs) ; see xml-decl for their type
+ (if elems ; elements declared: validate!
+ (cond
+ ((assoc tag-head elems) =>
+ (lambda (decl-elem) ; of type xml-decl::decl-elem
+ (values
+ (if empty-el-tag? 'EMPTY-TAG (cadr decl-elem))
+ (caddr decl-elem))))
+ (else
+ (parser-error port "[elementvalid] broken, no decl for " tag-head)))
+ (values ; non-validating parsing
+ (if empty-el-tag? 'EMPTY-TAG 'ANY)
+ #f) ; no attributes declared
+ ))
+ ((merged-attrs) (if decl-attrs (validate-attrs port attlist decl-attrs)
+ (attlist->alist attlist)))
+ ((proper-attrs namespaces)
+ (adjust-namespace-decl port merged-attrs namespaces))
+ )
+ ;(cerr "proper attrs: " proper-attrs nl)
+ ; build the return value
+ (values
+ (ssax:resolve-name port tag-head namespaces #t)
+ (fold-right
+ (lambda (name-value attlist)
+ (or
+ (attlist-add attlist
+ (cons (ssax:resolve-name port (car name-value) namespaces #f)
+ (cdr name-value)))
+ (parser-error port "[uniqattspec] after NS expansion broken for "
+ name-value)))
+ (make-empty-attlist)
+ proper-attrs)
+ namespaces
+ elem-content)))))
+
+(run-test
+ (let* ((urn-a (string->symbol "urn:a"))
+ (urn-b (string->symbol "urn:b"))
+ (urn-html (string->symbol "http://w3c.org/html"))
+ (namespaces
+ `((#f '"UHTML" . ,urn-html)
+ ('"A" '"UA" . ,urn-a)))
+ (test
+ (lambda (tag-head-name elems str)
+ (call-with-input-string str
+ (lambda (port)
+ (call-with-values
+ (lambda ()
+ (ssax:complete-start-tag
+ (call-with-input-string tag-head-name
+ (lambda (port) (ssax:read-QName port)))
+ port
+ elems '() namespaces))
+ list))))))
+
+ ; First test with no validation of elements
+ ;(test "TAG1" #f "")
+ (assert (equal? `('"TAG1" () ,namespaces ANY)
+ (test "TAG1" #f ">")))
+ (assert (equal? `('"TAG1" () ,namespaces EMPTY-TAG)
+ (test "TAG1" #f "/>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "a")) ,namespaces EMPTY-TAG)
+ (test "TAG1" #f "HREF='a'/>")))
+ (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a"))
+ ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY)
+ (test "TAG1" #f "HREF='a' xmlns='urn:a'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "a"))
+ ,(cons '(*DEFAULT* #f . #f) namespaces) ANY)
+ (test "TAG1" #f "HREF='a' xmlns=''>")))
+ (assert (failed? (test "UA:TAG1" #f "HREF='a' xmlns=''/>")))
+ (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a"))
+ ,(cons '(*DEFAULT* #f . #f) namespaces) ANY)
+ (test "A:TAG1" #f "A:HREF='a' xmlns=''>")))
+ (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a"))
+ ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) ANY)
+ (test "A:TAG1" #f "A:HREF='a' xmlns='urn:b'>")))
+ (assert (failed? (test "B:TAG1" #f "A:HREF='a' xmlns:b=''/>")))
+ (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a"))
+ ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
+ (test "B:TAG1" #f "A:HREF='a' xmlns:B='urn:b'>")))
+ (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")
+ ((,urn-b . '"SRC") . "b"))
+ ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
+ (test "B:TAG1" #f
+ "B:SRC='b' A:HREF='a' xmlns:B='urn:b'>")))
+ (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")
+ ((,urn-b . '"HREF") . "b"))
+ ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY)
+ (test "B:TAG1" #f
+ "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:b'>")))
+ ; must be an error! Duplicate attr
+ (assert (failed? (test "B:TAG1" #f
+ "HREF=\"b\" HREF='a' xmlns:B='urn:a'/>")))
+ ; must be an error! Duplicate attr after ns expansion
+ (assert (failed? (test "B:TAG1" #f
+ "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:a'/>")))
+ (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a")
+ (('"UA" . '"HREF") . "b"))
+ ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY)
+ (test "TAG1" #f
+ "A:HREF=\"b\" HREF='a' xmlns='urn:a'>")))
+ (assert (equal? `('"TAG1" ((('"UHTML" . '"HREF") . "a")
+ ((,urn-b . '"HREF") . "b"))
+ ,(append `(
+ ('"HTML" '"UHTML" . ,urn-html)
+ ('"B" ,urn-b . ,urn-b))
+ namespaces) ANY)
+ (test "TAG1" #f
+ "B:HREF=\"b\" xmlns:B='urn:b' xmlns:HTML='http://w3c.org/html' HTML:HREF='a' >")))
+
+ ; Now test the validating parsing
+ ; No decl for tag1
+ (assert (failed? (test "TAG1" '((TAG2 ANY ()))
+ "B:HREF='b' xmlns:B='urn:b'>")))
+ ; No decl for HREF elem
+;; (cond-expand
+;; ((not (or scm mit-scheme)) ; Regretfully, SCM treats '() as #f
+;; (assert (failed?
+;; (test "TAG1" '(('"TAG1" ANY ()))
+;; "B:HREF='b' xmlns:B='urn:b'>"))))
+;; (else #t))
+ ; No decl for HREF elem
+ (assert (failed?
+ (test "TAG1" '(('"TAG1" ANY (('"HREF1" CDATA IMPLIED #f))))
+ "B:HREF='b' xmlns:B='urn:b'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces EMPTY-TAG)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
+ "HREF='b'/>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
+ "HREF='b'>")))
+ ; Req'd attribute not given error
+ (assert (failed?
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f))))
+ ">")))
+ ; Wrong content-type of the attribute
+ (assert (failed?
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c") REQUIRED #f))))
+ "HREF='b'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c" "b") IMPLIED #f))))
+ "HREF='b'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "c"))))
+ "HREF='b'>")))
+ ; Bad fixed attribute
+ (assert (failed?
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "c"))))
+ "HREF='b'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b"))))
+ "HREF='b'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b")))) ">")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "b")))) ">")))
+ (assert (equal? `('"TAG1" () ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED #f)))) ">")))
+ ; Undeclared attr
+ (assert (failed?
+ (test "TAG1"
+ '(('"TAG1" PCDATA ((('"A" . '"HREF") CDATA IMPLIED "c"))))
+ "HREF='b'>")))
+ (assert (equal? `('"TAG1" (('"HREF" . "b") (('"UA" . '"HREF") . "c"))
+ ,namespaces PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
+ (('"A" . '"HREF") CDATA IMPLIED "c"))))
+ "HREF='b'>")))
+ (assert (equal? `(('"UA" . '"TAG1")
+ (('"HREF" . "b") (('"UA" . '"HREF") . "c"))
+ ,namespaces PCDATA)
+ (test "A:TAG1" '((('"A" . '"TAG1") PCDATA
+ (('"HREF" NMTOKEN REQUIRED #f)
+ (('"A" . '"HREF") CDATA IMPLIED "c"))))
+ "HREF='b'>")))
+ (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
+ ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
+ (test "B:TAG1" '((('"B" . '"TAG1") PCDATA (('"HREF" CDATA REQUIRED #f)
+ (('"xmlns" . '"B") CDATA IMPLIED "urn:b"))))
+ "HREF='b'>")))
+ (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b"))
+ ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
+ (test "B:TAG1" '((('"B" . '"TAG1") PCDATA
+ ((('"B" . '"HREF") CDATA REQUIRED #f)
+ (('"xmlns" . '"B") CDATA IMPLIED "urn:b"))))
+ "B:HREF='b'>")))
+ (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
+ ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
+ ('"xmlns" CDATA IMPLIED "urn:b"))))
+ "HREF='b'>")))
+ ; xmlns not declared
+ (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b"))
+ ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA)
+ (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)
+ )))
+ "HREF='b' xmlns='urn:b'>")))
+ ; xmlns:B not declared
+ (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b"))
+ ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA)
+ (test "B:TAG1" '((('"B" . '"TAG1") PCDATA
+ ((('"B" . '"HREF") CDATA REQUIRED #f)
+ )))
+ "B:HREF='b' xmlns:B='urn:b'>")))
+))
+
+; procedure+: ssax:read-external-id PORT
+;
+; This procedure parses an ExternalID production:
+; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
+; | 'PUBLIC' S PubidLiteral S SystemLiteral
+; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
+; [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
+; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9]
+; | [-'()+,./:=?;!*#@$_%]
+;
+; This procedure is supposed to be called when an ExternalID is expected;
+; that is, the current character must be either #\S or #\P that start
+; correspondingly a SYSTEM or PUBLIC token. This procedure returns the
+; SystemLiteral as a string. A PubidLiteral is disregarded if present.
+
+(define (ssax:read-external-id port)
+ (let ((discriminator (ssax:read-NCName port)))
+ (assert-curr-char ssax:S-chars "space after SYSTEM or PUBLIC" port)
+ (ssax:skip-S port)
+ (let ((delimiter
+ (assert-curr-char '(#\' #\" ) "XML [11], XML [12]" port)))
+ (cond
+ ((eq? discriminator (string->symbol "SYSTEM"))
+ (begin0
+ (next-token '() (list delimiter) "XML [11]" port)
+ (read-char port) ; reading the closing delim
+ ))
+ ((eq? discriminator (string->symbol "PUBLIC"))
+ (skip-until (list delimiter) port)
+ (assert-curr-char ssax:S-chars "space after PubidLiteral" port)
+ (ssax:skip-S port)
+ (let* ((delimiter
+ (assert-curr-char '(#\' #\" ) "XML [11]" port))
+ (systemid
+ (next-token '() (list delimiter) "XML [11]" port)))
+ (read-char port) ; reading the closing delim
+ systemid))
+ (else
+ (parser-error port "XML [75], " discriminator
+ " rather than SYSTEM or PUBLIC"))))))
+
+
+;-----------------------------------------------------------------------------
+; Higher-level parsers and scanners
+;
+; They parse productions corresponding to the whole (document) entity
+; or its higher-level pieces (prolog, root element, etc).
+
+
+; Scan the Misc production in the context
+; [1] document ::= prolog element Misc*
+; [22] prolog ::= XMLDecl? Misc* (doctypedec l Misc*)?
+; [27] Misc ::= Comment | PI | S
+;
+; The following function should be called in the prolog or epilog contexts.
+; In these contexts, whitespaces are completely ignored.
+; The return value from ssax:scan-Misc is either a PI-token,
+; a DECL-token, a START token, or EOF.
+; Comments are ignored and not reported.
+
+(define (ssax:scan-Misc port)
+ (let loop ((c (ssax:skip-S port)))
+ (cond
+ ((eof-object? c) c)
+ ((not (char=? c #\<))
+ (parser-error port "XML [22], char '" c "' unexpected"))
+ (else
+ (let ((token (ssax:read-markup-token port)))
+ (case (xml-token-kind token)
+ ((COMMENT) (loop (ssax:skip-S port)))
+ ((PI DECL START) token)
+ (else
+ (parser-error port "XML [22], unexpected token of kind "
+ (xml-token-kind token)
+ ))))))))
+
+; procedure+: ssax:read-char-data PORT EXPECT-EOF? STR-HANDLER SEED
+;
+; This procedure is to read the character content of an XML document
+; or an XML element.
+; [43] content ::=
+; (element | CharData | Reference | CDSect | PI
+; | Comment)*
+; To be more precise, the procedure reads CharData, expands CDSect
+; and character entities, and skips comments. The procedure stops
+; at a named reference, EOF, at the beginning of a PI or a start/end tag.
+;
+; port
+; a PORT to read
+; expect-eof?
+; a boolean indicating if EOF is normal, i.e., the character
+; data may be terminated by the EOF. EOF is normal
+; while processing a parsed entity.
+; str-handler
+; a STR-HANDLER
+; seed
+; an argument passed to the first invocation of STR-HANDLER.
+;
+; The procedure returns two results: SEED and TOKEN.
+; The SEED is the result of the last invocation of STR-HANDLER, or the
+; original seed if STR-HANDLER was never called.
+;
+; TOKEN can be either an eof-object (this can happen only if
+; expect-eof? was #t), or:
+; - an xml-token describing a START tag or an END-tag;
+; For a start token, the caller has to finish reading it.
+; - an xml-token describing the beginning of a PI. It's up to an
+; application to read or skip through the rest of this PI;
+; - an xml-token describing a named entity reference.
+;
+; CDATA sections and character references are expanded inline and
+; never returned. Comments are silently disregarded.
+;
+; As the XML Recommendation requires, all whitespace in character data
+; must be preserved. However, a CR character (#xD) must be disregarded
+; if it appears before a LF character (#xA), or replaced by a #xA character
+; otherwise. See Secs. 2.10 and 2.11 of the XML Recommendation. See also
+; the canonical XML Recommendation.
+
+ ; ssax:read-char-data port expect-eof? str-handler seed
+(define ssax:read-char-data
+ (let
+ ((terminators-usual (list #\< #\& char-return))
+ (terminators-usual-eof (list #\< '*eof* #\& char-return))
+
+ (handle-fragment
+ (lambda (fragment str-handler seed)
+ (if (string-null? fragment) seed
+ (str-handler fragment "" seed))))
+ )
+
+ (lambda (port expect-eof? str-handler seed)
+
+ ; Very often, the first character we encounter is #\<
+ ; Therefore, we handle this case in a special, fast path
+ (if (eqv? #\< (peek-char port))
+
+ ; The fast path
+ (let ((token (ssax:read-markup-token port)))
+ (case (xml-token-kind token)
+ ((START END) ; The most common case
+ (values seed token))
+ ((CDSECT)
+ (let ((seed (ssax:read-cdata-body port str-handler seed)))
+ (ssax:read-char-data port expect-eof? str-handler seed)))
+ ((COMMENT) (ssax:read-char-data port expect-eof?
+ str-handler seed))
+ (else
+ (values seed token))))
+
+
+ ; The slow path
+ (let ((char-data-terminators
+ (if expect-eof? terminators-usual-eof terminators-usual)))
+
+ (let loop ((seed seed))
+ (let* ((fragment
+ (next-token '() char-data-terminators
+ "reading char data" port))
+ (term-char (peek-char port)) ; one of char-data-terminators
+ )
+ (if (eof-object? term-char)
+ (values
+ (handle-fragment fragment str-handler seed)
+ term-char)
+ (case term-char
+ ((#\<)
+ (let ((token (ssax:read-markup-token port)))
+ (case (xml-token-kind token)
+ ((CDSECT)
+ (loop
+ (ssax:read-cdata-body port str-handler
+ (handle-fragment fragment str-handler seed))))
+ ((COMMENT)
+ (loop (handle-fragment fragment str-handler seed)))
+ (else
+ (values
+ (handle-fragment fragment str-handler seed)
+ token)))))
+ ((#\&)
+ (case (peek-next-char port)
+ ((#\#) (read-char port)
+ (loop (str-handler fragment
+ (ssax:read-char-ref port)
+ seed)))
+ (else
+ (let ((name (ssax:read-NCName port)))
+ (assert-curr-char '(#\;) "XML [68]" port)
+ (values
+ (handle-fragment fragment str-handler seed)
+ (make-xml-token 'ENTITY-REF name))))))
+ (else ; This must be a CR character
+ (if (eqv? (peek-next-char port) #\newline)
+ (read-char port))
+ (loop (str-handler fragment (string #\newline) seed))))
+ ))))))))
+
+
+; a few lines of validation code
+(run-test (letrec
+ ((a-tag (make-xml-token 'START (string->symbol "BR")))
+ (a-ref (make-xml-token 'ENTITY-REF (string->symbol "lt")))
+ (eof-object (lambda () eof-object)) ; a unique value
+ (str-handler (lambda (fragment foll-fragment seed)
+ (if (string-null? foll-fragment) (cons fragment seed)
+ (cons* foll-fragment fragment seed))))
+ (test (lambda (str expect-eof? expected-data expected-token)
+ (newline) (display "body: ") (write str)
+ (newline) (display "Result: ")
+ (let*-values
+ (((seed token)
+ (call-with-input-string (unesc-string str)
+ (lambda (port)
+ (ssax:read-char-data port expect-eof? str-handler '()))))
+ ((result) (reverse seed)))
+ (write result)
+ (display " ")
+ (display token)
+ (assert (equal? result (map unesc-string expected-data))
+ (if (eq? expected-token eof-object)
+ (eof-object? token)
+ (equal? token expected-token))))))
+ )
+ (test "" #t '() eof-object)
+ (assert (failed? (test "" #f '() eof-object)))
+ (test " " #t '(" ") eof-object)
+ (test "<BR/>" #f '() a-tag)
+ (test " <BR />" #f '(" ") a-tag)
+
+ (test " &lt;" #f '(" ") a-ref)
+ (test " a&lt;" #f '(" a") a-ref)
+ (test " a &lt;" #f '(" a ") a-ref)
+
+ (test " <!-- comment--> a a<BR/>" #f '(" " " a a") a-tag)
+ (test " <!-- comment-->%ra a<BR/>" #f '(" " "" "%n" "a a") a-tag)
+ (test " <!-- comment-->%r%na a<BR/>" #f '(" " "" "%n" "a a") a-tag)
+ (test " <!-- comment-->%r%na%t%r%r%na<BR/>" #f
+ '(" " "" "%n" "a%t" "%n" "" "%n" "a") a-tag)
+ (test "a<!-- comment--> a a<BR/>" #f '("a" " a a") a-tag)
+ (test "&#x21;<BR/>" #f '("" "!") a-tag)
+ (test "&#x21;%n<BR/>" #f '("" "!" "%n") a-tag)
+ (test "%t&#x21;%n<BR/>" #f '("%t" "!" "%n") a-tag)
+ (test "%t&#x21;%na a<BR/>" #f '("%t" "!" "%na a") a-tag)
+ (test "%t&#x21;%ra a<BR/>" #f '("%t" "!" "" "%n" "a a") a-tag)
+ (test "%t&#x21;%r%na a<BR/>" #f '("%t" "!" "" "%n" "a a") a-tag)
+
+ (test " %ta &#x21; b <BR/>" #f '(" %ta " "!" " b ") a-tag)
+ (test " %ta &#x20; b <BR/>" #f '(" %ta " " " " b ") a-tag)
+
+ (test "<![CDATA[<]]><BR/>" #f '("<") a-tag)
+ (test "<![CDATA[]]]><BR/>" #f '("]") a-tag)
+ (test "%t<![CDATA[<]]><BR/>" #f '("%t" "<") a-tag)
+ (test "%t<![CDATA[<]]>a b<BR/>" #f '("%t" "<" "a b") a-tag)
+ (test "%t<![CDATA[<]]> a b<BR/>" #f '("%t" "<" " a b") a-tag)
+
+ (test "%td <![CDATA[ <%r%r%n]]> a b<BR/>" #f
+ '("%td " " <" "%n" "" "%n" " a b") a-tag)
+))
+
+
+
+; procedure+: ssax:assert-token TOKEN KIND GI
+; Make sure that TOKEN is of anticipated KIND and has anticipated GI
+; Note GI argument may actually be a pair of two symbols, Namespace
+; URI or the prefix, and of the localname.
+; If the assertion fails, error-cont is evaluated by passing it
+; three arguments: token kind gi. The result of error-cont is returned.
+(define (ssax:assert-token token kind gi error-cont)
+ (or
+ (and (xml-token? token)
+ (eq? kind (xml-token-kind token))
+ (equal? gi (xml-token-head token)))
+ (error-cont token kind gi)))
+
+;========================================================================
+; Highest-level parsers: XML to SXML
+
+; These parsers are a set of syntactic forms to instantiate a SSAX parser.
+; A user can instantiate the parser to do the full validation, or
+; no validation, or any particular validation. The user specifies
+; which PI he wants to be notified about. The user tells what to do
+; with the parsed character and element data. The latter handlers
+; determine if the parsing follows a SAX or a DOM model.
+
+; syntax: ssax:make-pi-parser my-pi-handlers
+; Create a parser to parse and process one Processing Element (PI).
+
+; my-pi-handlers
+; An assoc list of pairs (PI-TAG . PI-HANDLER)
+; where PI-TAG is an NCName symbol, the PI target, and
+; PI-HANDLER is a procedure PORT PI-TAG SEED
+; where PORT points to the first symbol after the PI target.
+; The handler should read the rest of the PI up to and including
+; the combination '?>' that terminates the PI. The handler should
+; return a new seed.
+; One of the PI-TAGs may be the symbol *DEFAULT*. The corresponding
+; handler will handle PIs that no other handler will. If the
+; *DEFAULT* PI-TAG is not specified, ssax:make-pi-parser will assume
+; the default handler that skips the body of the PI
+;
+; The output of the ssax:make-pi-parser is a procedure
+; PORT PI-TAG SEED
+; that will parse the current PI according to the user-specified handlers.
+;
+; The previous version of ssax:make-pi-parser was a low-level macro:
+; (define-macro ssax:make-pi-parser
+; (lambda (my-pi-handlers)
+; `(lambda (port target seed)
+; (case target
+; ; Generate the body of the case statement
+; ,@(let loop ((pi-handlers my-pi-handlers) (default #f))
+; (cond
+; ((null? pi-handlers)
+; (if default `((else (,default port target seed)))
+; '((else
+; (ssax:warn port "Skipping PI: " target nl)
+; (ssax:skip-pi port)
+; seed))))
+; ((eq? '*DEFAULT* (caar pi-handlers))
+; (loop (cdr pi-handlers) (cdar pi-handlers)))
+; (else
+; (cons
+; `((,(caar pi-handlers)) (,(cdar pi-handlers) port target seed))
+; (loop (cdr pi-handlers) default)))))))))
+
+(define-syntax ssax:make-pi-parser
+ (syntax-rules ()
+ ((ssax:make-pi-parser orig-handlers)
+ (letrec-syntax
+ ; Generate the clauses of the case statement
+ ((loop
+ (syntax-rules (*DEFAULT*)
+ ((loop () #f accum port target seed) ; no default
+ (make-case
+ ((else
+ (ssax:warn port "Skipping PI: " target nl)
+ (ssax:skip-pi port)
+ seed)
+ . accum)
+ () target))
+ ((loop () default accum port target seed)
+ (make-case
+ ((else (default port target seed)) . accum)
+ () target))
+ ((loop ((*DEFAULT* . default) . handlers) old-def accum
+ port target seed)
+ (loop handlers default accum port target seed))
+ ((loop ((tag . handler) . handlers) default accum port target seed)
+ (loop handlers default
+ (((tag) (handler port target seed)) . accum)
+ port target seed))
+ ))
+ (make-case ; Reverse the clauses, make the 'case'
+ (syntax-rules ()
+ ((make-case () clauses target)
+ (case target . clauses))
+ ((make-case (clause . clauses) accum target)
+ (make-case clauses (clause . accum) target)))
+ ))
+ (lambda (port target seed)
+ (loop orig-handlers #f () port target seed))
+ ))))
+
+(run-test
+ (pp (ssax:make-pi-parser ()))
+ (pp (ssax:make-pi-parser ((xml . (lambda (port target seed) seed)))))
+ (pp (ssax:make-pi-parser ((xml . (lambda (port target seed) seed))
+ (html . list)
+ (*DEFAULT* . ssax:warn))))
+)
+
+; syntax: ssax:make-elem-parser my-new-level-seed my-finish-element
+; my-char-data-handler my-pi-handlers
+
+; Create a parser to parse and process one element, including its
+; character content or children elements. The parser is typically
+; applied to the root element of a document.
+
+; my-new-level-seed
+; procedure ELEM-GI ATTRIBUTES NAMESPACES EXPECTED-CONTENT SEED
+; where ELEM-GI is a RES-NAME of the element
+; about to be processed.
+; This procedure is to generate the seed to be passed
+; to handlers that process the content of the element.
+; This is the function identified as 'fdown' in the denotational
+; semantics of the XML parser given in the title comments to this
+; file.
+;
+; my-finish-element
+; procedure ELEM-GI ATTRIBUTES NAMESPACES PARENT-SEED SEED
+; This procedure is called when parsing of ELEM-GI is finished.
+; The SEED is the result from the last content parser (or
+; from my-new-level-seed if the element has the empty content).
+; PARENT-SEED is the same seed as was passed to my-new-level-seed.
+; The procedure is to generate a seed that will be the result
+; of the element parser.
+; This is the function identified as 'fup' in the denotational
+; semantics of the XML parser given in the title comments to this
+; file.
+;
+; my-char-data-handler
+; A STR-HANDLER
+;
+; my-pi-handlers
+; See ssax:make-pi-handler above
+;
+
+; The generated parser is a
+; procedure START-TAG-HEAD PORT ELEMS ENTITIES
+; NAMESPACES PRESERVE-WS? SEED
+; The procedure must be called after the start tag token has been
+; read. START-TAG-HEAD is an UNRES-NAME from the start-element tag.
+; ELEMS is an instance of xml-decl::elems.
+; See ssax:complete-start-tag::preserve-ws?
+
+; Faults detected:
+; VC: XML-Spec.html#elementvalid
+; WFC: XML-Spec.html#GIMatch
+
+
+(define-syntax ssax:make-elem-parser
+ (syntax-rules ()
+ ((ssax:make-elem-parser my-new-level-seed my-finish-element
+ my-char-data-handler my-pi-handlers)
+
+ (lambda (start-tag-head port elems entities namespaces
+ preserve-ws? seed)
+
+ (define xml-space-gi (cons ssax:Prefix-XML
+ (string->symbol "space")))
+
+ (let handle-start-tag ((start-tag-head start-tag-head)
+ (port port) (entities entities)
+ (namespaces namespaces)
+ (preserve-ws? preserve-ws?) (parent-seed seed))
+ (let*-values
+ (((elem-gi attributes namespaces expected-content)
+ (ssax:complete-start-tag start-tag-head port elems
+ entities namespaces))
+ ((seed)
+ (my-new-level-seed elem-gi attributes
+ namespaces expected-content parent-seed)))
+ (case expected-content
+ ((EMPTY-TAG)
+ (my-finish-element
+ elem-gi attributes namespaces parent-seed seed))
+ ((EMPTY) ; The end tag must immediately follow
+ (ssax:assert-token
+ (and (eqv? #\< (ssax:skip-S port)) (ssax:read-markup-token port))
+ 'END start-tag-head
+ (lambda (token exp-kind exp-head)
+ (parser-error port "[elementvalid] broken for " token
+ " while expecting "
+ exp-kind exp-head)))
+ (my-finish-element
+ elem-gi attributes namespaces parent-seed seed))
+ (else ; reading the content...
+ (let ((preserve-ws? ; inherit or set the preserve-ws? flag
+ (cond
+ ((assoc xml-space-gi attributes) =>
+ (lambda (name-value)
+ (equal? "preserve" (cdr name-value))))
+ (else preserve-ws?))))
+ (let loop ((port port) (entities entities)
+ (expect-eof? #f) (seed seed))
+ (let*-values
+ (((seed term-token)
+ (ssax:read-char-data port expect-eof?
+ my-char-data-handler seed)))
+ (if (eof-object? term-token)
+ seed
+ (case (xml-token-kind term-token)
+ ((END)
+ (ssax:assert-token term-token 'END start-tag-head
+ (lambda (token exp-kind exp-head)
+ (parser-error port "[GIMatch] broken for "
+ term-token " while expecting "
+ exp-kind exp-head)))
+ (my-finish-element
+ elem-gi attributes namespaces parent-seed seed))
+ ((PI)
+ (let ((seed
+ ((ssax:make-pi-parser my-pi-handlers)
+ port (xml-token-head term-token) seed)))
+ (loop port entities expect-eof? seed)))
+ ((ENTITY-REF)
+ (let ((seed
+ (ssax:handle-parsed-entity
+ port (xml-token-head term-token)
+ entities
+ (lambda (port entities seed)
+ (loop port entities #t seed))
+ my-char-data-handler
+ seed))) ; keep on reading the content after ent
+ (loop port entities expect-eof? seed)))
+ ((START) ; Start of a child element
+ (if (eq? expected-content 'PCDATA)
+ (parser-error port "[elementvalid] broken for "
+ elem-gi
+ " with char content only; unexpected token "
+ term-token))
+ ; Do other validation of the element content
+ (let ((seed
+ (handle-start-tag
+ (xml-token-head term-token)
+ port entities namespaces
+ preserve-ws? seed)))
+ (loop port entities expect-eof? seed)))
+ (else
+ (parser-error port "XML [43] broken for "
+ term-token))))))))
+ )))
+))))
+
+
+; syntax: ssax:make-parser user-handler-tag user-handler-proc ...
+;
+; Create an XML parser, an instance of the XML parsing framework.
+; This will be a SAX, a DOM, or a specialized parser depending
+; on the supplied user-handlers.
+
+; user-handler-tag is a symbol that identifies a procedural expression
+; that follows the tag. Given below are tags and signatures of the
+; corresponding procedures. Not all tags have to be specified. If some
+; are omitted, reasonable defaults will apply.
+;
+
+; tag: DOCTYPE
+; handler-procedure: PORT DOCNAME SYSTEMID INTERNAL-SUBSET? SEED
+; If internal-subset? is #t, the current position in the port
+; is right after we have read #\[ that begins the internal DTD subset.
+; We must finish reading of this subset before we return
+; (or must call skip-internal-subset if we aren't interested in reading it).
+; The port at exit must be at the first symbol after the whole
+; DOCTYPE declaration.
+; The handler-procedure must generate four values:
+; ELEMS ENTITIES NAMESPACES SEED
+; See xml-decl::elems for ELEMS. It may be #f to switch off the validation.
+; NAMESPACES will typically contain USER-PREFIXes for selected URI-SYMBs.
+; The default handler-procedure skips the internal subset,
+; if any, and returns (values #f '() '() seed)
+
+; tag: UNDECL-ROOT
+; handler-procedure: ELEM-GI SEED
+; where ELEM-GI is an UNRES-NAME of the root element. This procedure
+; is called when an XML document under parsing contains _no_ DOCTYPE
+; declaration.
+; The handler-procedure, as a DOCTYPE handler procedure above,
+; must generate four values:
+; ELEMS ENTITIES NAMESPACES SEED
+; The default handler-procedure returns (values #f '() '() seed)
+
+; tag: DECL-ROOT
+; handler-procedure: ELEM-GI SEED
+; where ELEM-GI is an UNRES-NAME of the root element. This procedure
+; is called when an XML document under parsing does contains the DOCTYPE
+; declaration.
+; The handler-procedure must generate a new SEED (and verify
+; that the name of the root element matches the doctype, if the handler
+; so wishes).
+; The default handler-procedure is the identity function.
+
+; tag: NEW-LEVEL-SEED
+; handler-procedure: see ssax:make-elem-parser, my-new-level-seed
+
+; tag: FINISH-ELEMENT
+; handler-procedure: see ssax:make-elem-parser, my-finish-element
+
+; tag: CHAR-DATA-HANDLER
+; handler-procedure: see ssax:make-elem-parser, my-char-data-handler
+
+; tag: PI
+; handler-procedure: see ssax:make-pi-parser
+; The default value is '()
+
+; The generated parser is a
+; procedure PORT SEED
+
+; This procedure parses the document prolog and then exits to
+; an element parser (created by ssax:make-elem-parser) to handle
+; the rest.
+;
+; [1] document ::= prolog element Misc*
+; [22] prolog ::= XMLDecl? Misc* (doctypedec | Misc*)?
+; [27] Misc ::= Comment | PI | S
+;
+; [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
+; ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
+; [29] markupdecl ::= elementdecl | AttlistDecl
+; | EntityDecl
+; | NotationDecl | PI
+; | Comment
+;
+
+
+; This is ssax:make-parser with all the (specialization) handlers given
+; as positional arguments. It is called by ssax:make-parser, see below
+(define-syntax ssax:make-parser/positional-args
+ (syntax-rules ()
+ ((ssax:make-parser/positional-args
+ *handler-DOCTYPE
+ *handler-UNDECL-ROOT
+ *handler-DECL-ROOT
+ *handler-NEW-LEVEL-SEED
+ *handler-FINISH-ELEMENT
+ *handler-CHAR-DATA-HANDLER
+ *handler-PI)
+ (lambda (port seed)
+
+ ; We must've just scanned the DOCTYPE token
+ ; Handle the doctype declaration and exit to
+ ; scan-for-significant-prolog-token-2, and eventually, to the
+ ; element parser.
+ (define (handle-decl port token-head seed)
+ (or (eq? (string->symbol "DOCTYPE") token-head)
+ (parser-error port "XML [22], expected DOCTYPE declaration, found "
+ token-head))
+ (assert-curr-char ssax:S-chars "XML [28], space after DOCTYPE" port)
+ (ssax:skip-S port)
+ (let*-values
+ (((docname) (ssax:read-QName port))
+ ((systemid)
+ (and (ssax:ncname-starting-char? (ssax:skip-S port))
+ (ssax:read-external-id port)))
+ ((internal-subset?)
+ (begin (ssax:skip-S port)
+ (eqv? #\[ (assert-curr-char '(#\> #\[)
+ "XML [28], end-of-DOCTYPE" port))))
+ ((elems entities namespaces seed)
+ (*handler-DOCTYPE port docname systemid
+ internal-subset? seed))
+ )
+ (scan-for-significant-prolog-token-2 port elems entities namespaces
+ seed)))
+
+
+ ; Scan the leading PIs until we encounter either a doctype declaration
+ ; or a start token (of the root element)
+ ; In the latter two cases, we exit to the appropriate continuation
+ (define (scan-for-significant-prolog-token-1 port seed)
+ (let ((token (ssax:scan-Misc port)))
+ (if (eof-object? token)
+ (parser-error port "XML [22], unexpected EOF")
+ (case (xml-token-kind token)
+ ((PI)
+ (let ((seed
+ ((ssax:make-pi-parser *handler-PI)
+ port (xml-token-head token) seed)))
+ (scan-for-significant-prolog-token-1 port seed)))
+ ((DECL) (handle-decl port (xml-token-head token) seed))
+ ((START)
+ (let*-values
+ (((elems entities namespaces seed)
+ (*handler-UNDECL-ROOT (xml-token-head token) seed)))
+ (element-parser (xml-token-head token) port elems
+ entities namespaces #f seed)))
+ (else (parser-error port "XML [22], unexpected markup "
+ token))))))
+
+
+ ; Scan PIs after the doctype declaration, till we encounter
+ ; the start tag of the root element. After that we exit
+ ; to the element parser
+ (define (scan-for-significant-prolog-token-2 port elems entities
+ namespaces seed)
+ (let ((token (ssax:scan-Misc port)))
+ (if (eof-object? token)
+ (parser-error port "XML [22], unexpected EOF")
+ (case (xml-token-kind token)
+ ((PI)
+ (let ((seed
+ ((ssax:make-pi-parser *handler-PI)
+ port (xml-token-head token) seed)))
+ (scan-for-significant-prolog-token-2 port elems entities
+ namespaces seed)))
+ ((START)
+ (element-parser (xml-token-head token) port elems
+ entities namespaces #f
+ (*handler-DECL-ROOT (xml-token-head token) seed)))
+ (else (parser-error port "XML [22], unexpected markup "
+ token))))))
+
+
+ ; A procedure start-tag-head port elems entities namespaces
+ ; preserve-ws? seed
+ (define element-parser
+ (ssax:make-elem-parser *handler-NEW-LEVEL-SEED
+ *handler-FINISH-ELEMENT
+ *handler-CHAR-DATA-HANDLER
+ *handler-PI))
+
+ ; Get the ball rolling ...
+ (scan-for-significant-prolog-token-1 port seed)
+))))
+
+
+
+; The following meta-macro turns a regular macro (with positional
+; arguments) into a form with keyword (labeled) arguments. We later
+; use the meta-macro to convert ssax:make-parser/positional-args into
+; ssax:make-parser. The latter provides a prettier (with labeled
+; arguments and defaults) interface to
+; ssax:make-parser/positional-args
+;
+; ssax:define-labeled-arg-macro LABELED-ARG-MACRO-NAME
+; (POS-MACRO-NAME ARG-DESCRIPTOR ...)
+; expands into the definition of a macro
+; LABELED-ARG-MACRO-NAME KW-NAME KW-VALUE KW-NAME1 KW-VALUE1 ...
+; which, in turn, expands into
+; POS-MACRO-NAME ARG1 ARG2 ...
+; where each ARG1 etc. comes either from KW-VALUE or from
+; the deafult part of ARG-DESCRIPTOR. ARG1 corresponds to the first
+; ARG-DESCRIPTOR, ARG2 corresponds to the second descriptor, etc.
+; Here ARG-DESCRIPTOR describes one argument of the positional macro.
+; It has the form
+; (ARG-NAME DEFAULT-VALUE)
+; or
+; (ARG-NAME)
+; In the latter form, the default value is not given, so that the invocation of
+; LABELED-ARG-MACRO-NAME must mention the corresponding parameter.
+; ARG-NAME can be anything: an identifier, a string, or even a number.
+
+
+(define-syntax ssax:define-labeled-arg-macro
+ (syntax-rules ()
+ ((ssax:define-labeled-arg-macro
+ labeled-arg-macro-name
+ (positional-macro-name
+ (arg-name . arg-def) ...))
+ (define-syntax labeled-arg-macro-name
+ (syntax-rules ()
+ ((labeled-arg-macro-name . kw-val-pairs)
+ (letrec-syntax
+ ((find
+ (syntax-rules (arg-name ...)
+ ((find k-args (arg-name . default) arg-name
+ val . others) ; found arg-name among kw-val-pairs
+ (next val . k-args)) ...
+ ((find k-args key arg-no-match-name val . others)
+ (find k-args key . others))
+ ((find k-args (arg-name default)) ; default must be here
+ (next default . k-args)) ...
+ ))
+ (next ; pack the continuation to find
+ (syntax-rules ()
+ ((next val vals key . keys)
+ (find ((val . vals) . keys) key . kw-val-pairs))
+ ((next val vals) ; processed all arg-descriptors
+ (rev-apply (val) vals))))
+ (rev-apply
+ (syntax-rules ()
+ ((rev-apply form (x . xs))
+ (rev-apply (x . form) xs))
+ ((rev-apply form ()) form))))
+ (next positional-macro-name ()
+ (arg-name . arg-def) ...))))))))
+
+
+; The definition of ssax:make-parser
+(ssax:define-labeled-arg-macro ssax:make-parser
+ (ssax:make-parser/positional-args
+ (DOCTYPE
+ (lambda (port docname systemid internal-subset? seed)
+ (when internal-subset?
+ (ssax:warn port "Internal DTD subset is not currently handled ")
+ (ssax:skip-internal-dtd port))
+ (ssax:warn port "DOCTYPE DECL " docname " "
+ systemid " found and skipped")
+ (values #f '() '() seed)
+ ))
+ (UNDECL-ROOT
+ (lambda (elem-gi seed) (values #f '() '() seed)))
+ (DECL-ROOT
+ (lambda (elem-gi seed) seed))
+ (NEW-LEVEL-SEED) ; required
+ (FINISH-ELEMENT) ; required
+ (CHAR-DATA-HANDLER) ; required
+ (PI ())
+ ))
+
+(run-test
+ (letrec ((simple-parser
+ (lambda (str doctype-fn)
+ (call-with-input-string str
+ (lambda (port)
+ ((ssax:make-parser
+ NEW-LEVEL-SEED
+ (lambda (elem-gi attributes namespaces
+ expected-content seed)
+ '())
+
+ FINISH-ELEMENT
+ (lambda (elem-gi attributes namespaces parent-seed seed)
+ (let
+ ((seed (if (null? namespaces) (reverse seed)
+ (cons (list '*NAMESPACES* namespaces)
+ (reverse seed)))))
+ (let ((seed (if (attlist-null? attributes) seed
+ (cons
+ (cons '@
+ (map (lambda (attr)
+ (list (car attr) (cdr attr)))
+ (attlist->alist attributes)))
+ seed))))
+ (cons (cons elem-gi seed) parent-seed))))
+
+ CHAR-DATA-HANDLER
+ (lambda (string1 string2 seed)
+ (if (string-null? string2) (cons string1 seed)
+ (cons* string2 string1 seed)))
+
+ DOCTYPE
+ (lambda (port docname systemid internal-subset? seed)
+ (when internal-subset?
+ (ssax:warn port
+ "Internal DTD subset is not currently handled ")
+ (ssax:skip-internal-dtd port))
+ (ssax:warn port "DOCTYPE DECL " docname " "
+ systemid " found and skipped")
+ (doctype-fn docname seed))
+
+ UNDECL-ROOT
+ (lambda (elem-gi seed)
+ (doctype-fn elem-gi seed))
+ )
+ port '())))))
+
+ (dummy-doctype-fn (lambda (elem-gi seed) (values #f '() '() seed)))
+ (test
+ (lambda (str doctype-fn expected)
+ (cout nl "Parsing: " str nl)
+ (let ((result (simple-parser (unesc-string str) doctype-fn)))
+ (write result)
+ (assert (equal? result expected)))))
+ )
+
+ (test "<BR/>" dummy-doctype-fn '(('"BR")))
+ (assert (failed? (test "<BR>" dummy-doctype-fn '())))
+ (test "<BR></BR>" dummy-doctype-fn '(('"BR")))
+ (assert (failed? (test "<BR></BB>" dummy-doctype-fn '())))
+
+ (test " <A HREF='URL'> link <I>itlink </I> &amp;amp;</A>"
+ dummy-doctype-fn
+ '(('"A" (@ ('"HREF" "URL")) " link " ('"I" "itlink ")
+ " " "&" "amp;")))
+
+ (test
+ " <A HREF='URL' xml:space='preserve'> link <I>itlink </I> &amp;amp;</A>" dummy-doctype-fn
+ '(('"A" (@ ('"HREF" "URL") (('"xml" . '"space") "preserve"))
+ " link " ('"I" "itlink ") " " "&" "amp;")))
+
+ (test " <A HREF='URL' xml:space='preserve'> link <I xml:space='default'>itlink </I> &amp;amp;</A>" dummy-doctype-fn
+ '(('"A" (@ ('"HREF" "URL") (('"xml" . '"space") "preserve"))
+ " link "
+ ('"I" (@ (('"xml" . '"space") "default")) "itlink ")
+ " " "&" "amp;")))
+ (test "<itemize><item>This is item 1 </item>%n<!-- Just:a comment --><item>Item 2</item>%n </itemize>" dummy-doctype-fn
+ `(('"itemize" ('"item" "This is item 1 ")
+ ,(unesc-string "%n") ('"item" "Item 2") ,(unesc-string "%n "))))
+ (test " <P><![CDATA[<BR>%n<![CDATA[<BR>]]&gt;]]></P>"
+ dummy-doctype-fn `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "" ">")))
+
+ (test " <P><![CDATA[<BR>%r<![CDATA[<BR>]]&gt;]]></P>"
+ dummy-doctype-fn `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "" ">")))
+
+ (test "<?xml version='1.0'?>%n%n<Reports TStamp='1'></Reports>"
+ dummy-doctype-fn '(('"Reports" (@ ('"TStamp" "1")))))
+ (test "%n<?PI xxx?><!-- Comment %n -%r-->%n<?PI1 zzz?><T/>"
+ dummy-doctype-fn '(('"T")))
+ (test "<!DOCTYPE T SYSTEM 'system1' ><!-- comment -->%n<T/>"
+ (lambda (elem-gi seed) (assert (equal? elem-gi ''"T"))
+ (values #f '() '() seed))
+ '(('"T")))
+ (test "<!DOCTYPE T PUBLIC '//EN/T' \"system1\" [ <!ELEMENT a 'aa'> ]>%n<?pi?><T/>"
+ (lambda (elem-gi seed) (assert (equal? elem-gi ''"T"))
+ (values #f '() '() seed))
+ '(('"T")))
+ (test "<BR/>"
+ (lambda (elem-gi seed)
+ (values '(('"BR" EMPTY ())) '() '() seed)) '(('"BR")))
+ (test "<BR></BR>"
+ (lambda (elem-gi seed)
+ (values '(('"BR" EMPTY ())) '() '() seed)) '(('"BR")))
+ (assert (failed? (test "<BR>aa</BR>"
+ (lambda (elem-gi seed)
+ (values '(('"BR" EMPTY ())) '() '() seed)) '())))
+ (test "<BR>aa</BR>"
+ (lambda (elem-gi seed)
+ (values '(('"BR" PCDATA ())) '() '() seed)) '(('"BR" "aa")))
+ (assert (failed? (test "<BR>a<I>a</I></BR>"
+ (lambda (elem-gi seed)
+ (values '(('"BR" PCDATA ())) '() '() seed)) '())))
+ (test "<BR>a<I>a</I></BR>"
+ (lambda (elem-gi seed)
+ (values '(('"BR" ANY ()) ('"I" PCDATA ())) '() '() seed))
+ '(('"BR" "a" ('"I" "a"))))
+
+
+ (test "<DIV>Example: \"&example;\"</DIV>"
+ (lambda (elem-gi seed)
+ (values #f '((example . "<P>An ampersand (&#38;) may be escaped numerically (&#38;#38;) or with a general entity (&amp;amp;).</P>")) '() seed))
+ '(('"DIV" "Example: \""
+ ('"P" "An ampersand (" "&" ") may be escaped numerically (" "&" "#38;) or with a general entity (" "&" "amp;).") "\"")))
+ (test "<DIV>Example: \"&example;\" <P/></DIV>"
+ (lambda (elem-gi seed)
+ (values #f '(('"quote" . "<I>example:</I> ex")
+ ('"example" . "<Q>&quote;!</Q>?")) '() seed))
+ '(('"DIV" "Example: \"" ('"Q" ('"I" "example:") " ex" "!") "?"
+ "\" " ('"P"))))
+ (assert (failed?
+ (test "<DIV>Example: \"&example;\" <P/></DIV>"
+ (lambda (elem-gi seed)
+ (values #f '(('"quote" . "<I>example:")
+ ('"example" . "<Q>&quote;</I>!</Q>?")) '() seed))
+ '())))
+
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values #f '() '() seed))
+ '((('"URI1" . '"DIV") (@ ('"B" "B") (('"URI1" . '"B") "A"))
+ (*NAMESPACES* (('"A" '"URI1" . '"URI1")
+ (*DEFAULT* '"URI1" . '"URI1")))
+ (('"URI1" . '"P")
+ (*NAMESPACES* ((*DEFAULT* #f . #f) ('"A" '"URI1" . '"URI1")
+ (*DEFAULT* '"URI1" . '"URI1")))
+ ('"BR"
+ (*NAMESPACES* ((*DEFAULT* #f . #f)
+ ('"A" '"URI1" . '"URI1")
+ (*DEFAULT* '"URI1" . '"URI1"))))))))
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values #f '() '((#f '"UA" . '"URI1")) seed))
+ '((('"UA" . '"DIV") (@ ('"B" "B") (('"UA" . '"B") "A"))
+ (*NAMESPACES* (('"A" '"UA" . '"URI1")
+ (*DEFAULT* '"UA" . '"URI1") (#f '"UA" . '"URI1")))
+ (('"UA" . '"P")
+ (*NAMESPACES* ((*DEFAULT* #f . #f) ('"A" '"UA" . '"URI1")
+ (*DEFAULT* '"UA" . '"URI1") (#f '"UA" . '"URI1")))
+ ('"BR"
+ (*NAMESPACES* ((*DEFAULT* #f . #f) ('"A" '"UA" . '"URI1")
+ (*DEFAULT* '"UA" . '"URI1")
+ (#f '"UA" . '"URI1"))))))))
+ ; uniqattr should fail
+ (assert (failed?
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values
+ `(('"DIV" ANY (('"B" CDATA IMPLIED #f)
+ (('"A" . '"B") CDATA IMPLIED #f)
+ (('"C" . '"B") CDATA IMPLIED "xx")
+ (('"xmlns" . '"C") CDATA IMPLIED "URI1")
+ ))
+ (('"A" . '"P") ANY ()) ('"BR" '"EMPTY" ()))
+ '() '((#f '"UA" . '"URI1")) seed))
+ '())))
+ ; prefix C undeclared
+ (assert (failed?
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values
+ '(('"DIV" ANY (('"B" CDATA IMPLIED #f)
+ ('"xmlns" CDATA IMPLIED "URI1")
+ (('"A" . '"B") CDATA IMPLIED #f)
+ (('"C" . '"B") CDATA IMPLIED "xx")
+ ))
+ (('"A" . '"P") ANY ()) ('"BR" EMPTY ()))
+ '() '((#f '"UA" . '"URI1")) seed))
+ '())))
+
+ ; contradiction to xmlns declaration
+ (assert (failed?
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values
+ '(('"DIV" ANY (('"B" CDATA IMPLIED #f)
+ ('"xmlns" CDATA FIXED "URI2")
+ (('"A" . '"B") CDATA IMPLIED #f)
+ ))
+ (('"A" . '"P") ANY ()) ('"BR" EMPTY ()))
+ '() '((#f '"UA" . '"URI1")) seed))
+ '())))
+
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values
+ '(('"DIV" ANY (('"B" CDATA IMPLIED #f)
+ ('"xmlns" CDATA FIXED "URI1")
+ (('"A" . '"B") CDATA IMPLIED #f)
+ ))
+ (('"A" . '"P") ANY ()) ('"BR" EMPTY ()))
+ '() '((#f '"UA" . '"URI1")) seed))
+ '((('"UA" . '"DIV") (@ ('"B" "B") (('"UA" . '"B") "A"))
+ (*NAMESPACES* ((*DEFAULT* '"UA" . '"URI1")
+ ('"A" '"UA" . '"URI1") (#f '"UA" . '"URI1")))
+ (('"UA" . '"P")
+ (*NAMESPACES* ((*DEFAULT* #f . #f)
+ (*DEFAULT* '"UA" . '"URI1")
+ ('"A" '"UA" . '"URI1") (#f '"UA" . '"URI1")))
+ ('"BR"
+ (*NAMESPACES* ((*DEFAULT* #f . #f) (*DEFAULT* '"UA" . '"URI1")
+ ('"A" '"UA" . '"URI1") (#f '"UA" . '"URI1"))))))))
+
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>"
+ (lambda (elem-gi seed)
+ (values
+ '(('"DIV" ANY (('"B" CDATA IMPLIED #f)
+ (('"A" . '"B") CDATA IMPLIED #f)
+ (('"C" . '"B") CDATA IMPLIED "xx")
+ (('"xmlns" . '"C") CDATA IMPLIED "URI2")
+ ))
+ (('"A" . '"P") ANY ()) ('"BR" EMPTY ()))
+ '() '((#f '"UA" . '"URI1")) seed))
+ '((('"UA" . '"DIV") (@ ('"B" "B") (('"UA" . '"B") "A")
+ (('"URI2" . '"B") "xx"))
+ (*NAMESPACES* ((*DEFAULT* '"UA" . '"URI1")
+ ('"A" '"UA" . '"URI1")
+ ('"C" '"URI2" . '"URI2")
+ (#f '"UA" . '"URI1")))
+ (('"UA" . '"P")
+ (*NAMESPACES* ((*DEFAULT* #f . #f) (*DEFAULT* '"UA" . '"URI1")
+ ('"A" '"UA" . '"URI1")
+ ('"C" '"URI2" . '"URI2") (#f '"UA" . '"URI1")))
+ ('"BR"
+ (*NAMESPACES* ((*DEFAULT* #f . #f)
+ (*DEFAULT* '"UA" . '"URI1")
+ ('"A" '"UA" . '"URI1")
+ ('"C" '"URI2" . '"URI2")
+ (#f '"UA" . '"URI1"))))))))
+))
+
+
+
+;========================================================================
+; Highest-level parsers: XML to SXML
+;
+
+; First, a few utility procedures that turned out useful
+
+; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
+; given the list of fragments (some of which are text strings)
+; reverse the list and concatenate adjacent text strings.
+; We can prove from the general case below that if LIST-OF-FRAGS
+; has zero or one element, the result of the procedure is equal?
+; to its argument. This fact justifies the shortcut evaluation below.
+(define (ssax:reverse-collect-str fragments)
+ (cond
+ ((null? fragments) '()) ; a shortcut
+ ((null? (cdr fragments)) fragments) ; see the comment above
+ (else
+ (let loop ((fragments fragments) (result '()) (strs '()))
+ (cond
+ ((null? fragments)
+ (if (null? strs) result
+ (cons (string-concatenate/shared strs) result)))
+ ((string? (car fragments))
+ (loop (cdr fragments) result (cons (car fragments) strs)))
+ (else
+ (loop (cdr fragments)
+ (cons
+ (car fragments)
+ (if (null? strs) result
+ (cons (string-concatenate/shared strs) result)))
+ '())))))))
+
+
+; ssax:reverse-collect-str-drop-ws LIST-OF-FRAGS -> LIST-OF-FRAGS
+; given the list of fragments (some of which are text strings)
+; reverse the list and concatenate adjacent text strings.
+; We also drop "unsignificant" whitespace, that is, whitespace
+; in front, behind and between elements. The whitespace that
+; is included in character data is not affected.
+; We use this procedure to "intelligently" drop "insignificant"
+; whitespace in the parsed SXML. If the strict compliance with
+; the XML Recommendation regarding the whitespace is desired, please
+; use the ssax:reverse-collect-str procedure instead.
+
+(define (ssax:reverse-collect-str-drop-ws fragments)
+ (cond
+ ((null? fragments) '()) ; a shortcut
+ ((null? (cdr fragments)) ; another shortcut
+ (if (and (string? (car fragments)) (string-whitespace? (car fragments)))
+ '() fragments)) ; remove trailing ws
+ (else
+ (let loop ((fragments fragments) (result '()) (strs '())
+ (all-whitespace? #t))
+ (cond
+ ((null? fragments)
+ (if all-whitespace? result ; remove leading ws
+ (cons (string-concatenate/shared strs) result)))
+ ((string? (car fragments))
+ (loop (cdr fragments) result (cons (car fragments) strs)
+ (and all-whitespace?
+ (string-whitespace? (car fragments)))))
+ (else
+ (loop (cdr fragments)
+ (cons
+ (car fragments)
+ (if all-whitespace? result
+ (cons (string-concatenate/shared strs) result)))
+ '() #t)))))))
+
+
+; procedure: ssax:xml->sxml PORT NAMESPACE-PREFIX-ASSIG
+;
+; This is an instance of a SSAX parser above that returns an SXML
+; representation of the XML document to be read from PORT.
+; NAMESPACE-PREFIX-ASSIG is a list of (USER-PREFIX . URI-STRING)
+; that assigns USER-PREFIXes to certain namespaces identified by
+; particular URI-STRINGs. It may be an empty list.
+; The procedure returns an SXML tree. The port points out to the
+; first character after the root element.
+
+(define (ssax:xml->sxml port namespace-prefix-assig)
+ (letrec
+ ((namespaces
+ (map (lambda (el)
+ (cons* #f (car el) (ssax:uri-string->symbol (cdr el))))
+ namespace-prefix-assig))
+
+ (RES-NAME->SXML
+ (lambda (res-name)
+ (string->symbol
+ (string-append
+ (symbol->string (car res-name))
+ ":"
+ (symbol->string (cdr res-name))))))
+
+ )
+ (let ((result
+ (reverse
+ ((ssax:make-parser
+ NEW-LEVEL-SEED
+ (lambda (elem-gi attributes namespaces
+ expected-content seed)
+ '())
+
+ FINISH-ELEMENT
+ (lambda (elem-gi attributes namespaces parent-seed seed)
+ (let ((seed (ssax:reverse-collect-str seed))
+ (attrs
+ (attlist-fold
+ (lambda (attr accum)
+ (cons (list
+ (if (symbol? (car attr)) (car attr)
+ (RES-NAME->SXML (car attr)))
+ (cdr attr)) accum))
+ '() attributes)))
+ (cons
+ (cons
+ (if (symbol? elem-gi) elem-gi
+ (RES-NAME->SXML elem-gi))
+ (if (null? attrs) seed
+ (cons (cons '@ attrs) seed)))
+ parent-seed)))
+
+ CHAR-DATA-HANDLER
+ (lambda (string1 string2 seed)
+ (if (string-null? string2) (cons string1 seed)
+ (cons* string2 string1 seed)))
+
+ DOCTYPE
+ (lambda (port docname systemid internal-subset? seed)
+ (when internal-subset?
+ (ssax:warn port
+ "Internal DTD subset is not currently handled ")
+ (ssax:skip-internal-dtd port))
+ (ssax:warn port "DOCTYPE DECL " docname " "
+ systemid " found and skipped")
+ (values #f '() namespaces seed))
+
+ UNDECL-ROOT
+ (lambda (elem-gi seed)
+ (values #f '() namespaces seed))
+
+ PI
+ ((*DEFAULT* .
+ (lambda (port pi-tag seed)
+ (cons
+ (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
+ seed))))
+ )
+ port '()))))
+ (cons '*TOP*
+ (if (null? namespace-prefix-assig) result
+ (cons
+ (list '@ (cons '*NAMESPACES*
+ (map (lambda (ns) (list (car ns) (cdr ns)))
+ namespace-prefix-assig)))
+ result)))
+)))
+
+; For backwards compatibility
+(define SSAX:XML->SXML ssax:xml->sxml)
+
+
+; a few lines of validation code
+(run-test (letrec
+ ((test (lambda (str namespace-assig expected-res)
+ (newline) (display "input: ")
+ (write (unesc-string str)) (newline) (display "Result: ")
+ (let ((result
+ (call-with-input-string (unesc-string str)
+ (lambda (port)
+ (ssax:xml->sxml port namespace-assig)))))
+ (pp result)
+ (assert (equal_? result expected-res))))))
+
+ (test " <BR/>" '() '(*TOP* (BR)))
+ (test "<BR></BR>" '() '(*TOP* (BR)))
+ (test " <BR CLEAR='ALL'%nCLASS='Class1'/>" '()
+ '(*TOP* (BR (@ (CLEAR "ALL") (CLASS "Class1")))))
+ (test " <A HREF='URL'> link <I>itlink </I> &amp;amp;</A>" '()
+ '(*TOP* (A (@ (HREF "URL")) " link " (I "itlink ") " &amp;")))
+ (test " <A HREF='URL' xml:space='preserve'> link <I>itlink </I> &amp;amp;</A>" '()
+ '(*TOP* (A (@ (xml:space "preserve") (HREF "URL"))
+ " link " (I "itlink ") " &amp;")))
+ (test " <A HREF='URL' xml:space='preserve'> link <I xml:space='default'>itlink </I> &amp;amp;</A>" '()
+ '(*TOP* (A (@ (xml:space "preserve") (HREF "URL"))
+ " link " (I (@ (xml:space "default"))
+ "itlink ") " &amp;")))
+ (test " <P><?pi1 p1 content ?>?<?pi2 pi2? content? ??></P>" '()
+ '(*TOP* (P (*PI* pi1 "p1 content ") "?"
+ (*PI* pi2 "pi2? content? ?"))))
+ (test " <P>some text <![CDATA[<]]>1%n&quot;<B>strong</B>&quot;%r</P>"
+ '()
+ `(*TOP* (P ,(unesc-string "some text <1%n\"")
+ (B "strong") ,(unesc-string "\"%n"))))
+ (test " <P><![CDATA[<BR>%n<![CDATA[<BR>]]&gt;]]></P>" '()
+ `(*TOP* (P ,(unesc-string "<BR>%n<![CDATA[<BR>]]>"))))
+; (test "<T1><T2>it&apos;s%r%nand that%n</T2>%r%n%r%n%n</T1>" '()
+; '(*TOP* (T1 (T2 "it's%nand that%n") "%n%n%n")))
+ (test "<T1><T2>it&apos;s%r%nand that%n</T2>%r%n%r%n%n</T1>" '()
+ `(*TOP* (T1 (T2 ,(unesc-string "it's%nand that%n")) ,(unesc-string "%n%n%n"))))
+ (test "<T1><T2>it&apos;s%rand that%n</T2>%r%n%r%n%n</T1>" '()
+ `(*TOP* (T1 (T2 ,(unesc-string "it's%nand that%n")) ,(unesc-string "%n%n%n"))))
+ (test "<!DOCTYPE T SYSTEM 'system1' ><!-- comment -->%n<T/>" '()
+ '(*TOP* (T)))
+ (test "<?xml version='1.0'?>%n<WEIGHT unit=\"pound\">%n<NET certified='certified'> 67 </NET>%n<GROSS> 95 </GROSS>%n</WEIGHT>" '()
+ `(*TOP* (*PI* xml "version='1.0'") (WEIGHT (@ (unit "pound"))
+ ,nl (NET (@ (certified "certified")) " 67 ") ,nl
+ (GROSS " 95 ") ,nl)
+ ))
+; (test "<?xml version='1.0'?>%n<WEIGHT unit=\"pound\">%n<NET certified='certified'> 67 </NET>%n<GROSS> 95 </GROSS>%n</WEIGHT>" '()
+; '(*TOP* (*PI* xml "version='1.0'") (WEIGHT (@ (unit "pound"))
+; "%n" (NET (@ (certified "certified")) " 67 ")
+; "%n" (GROSS " 95 ") "%n")
+; ))
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" '()
+ '(*TOP* (URI1:DIV (@ (URI1:B "A") (B "B")) (URI1:P (BR)))))
+ (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" '((UA . "URI1"))
+ '(*TOP* (@ (*NAMESPACES* (UA "URI1")))
+ (UA:DIV (@ (UA:B "A") (B "B")) (UA:P (BR)))))
+
+ ; A few tests from XML Namespaces Recommendation
+ (test (string-append
+ "<x xmlns:edi='http://ecommerce.org/schema'>"
+ "<!-- the 'taxClass' attribute's ns http://ecommerce.org/schema -->"
+ "<lineItem edi:taxClass='exempt'>Baby food</lineItem>" nl
+ "</x>") '()
+ `(*TOP*
+ (x (lineItem
+ (@ (http://ecommerce.org/schema:taxClass "exempt"))
+ "Baby food") ,nl)))
+ (test (string-append
+ "<x xmlns:edi='http://ecommerce.org/schema'>"
+ "<!-- the 'taxClass' attribute's ns http://ecommerce.org/schema -->"
+ "<lineItem edi:taxClass='exempt'>Baby food</lineItem>"
+ "</x>") '((EDI . "http://ecommerce.org/schema"))
+ '(*TOP*
+ (@ (*NAMESPACES* (EDI "http://ecommerce.org/schema")))
+ (x (lineItem
+ (@ (EDI:taxClass "exempt"))
+ "Baby food"))))
+
+ (test (string-append
+ "<bk:book xmlns:bk='urn:loc.gov:books' "
+ "xmlns:isbn='urn:ISBN:0-395-36341-6'>"
+ "<bk:title>Cheaper by the Dozen</bk:title>"
+ "<isbn:number>1568491379</isbn:number></bk:book>")
+ '()
+ '(*TOP* (urn:loc.gov:books:book
+ (urn:loc.gov:books:title "Cheaper by the Dozen")
+ (urn:ISBN:0-395-36341-6:number "1568491379"))))
+
+ (test (string-append
+ "<!-- initially, the default namespace is 'books' -->"
+ "<book xmlns='urn:loc.gov:books' "
+ "xmlns:isbn='urn:ISBN:0-395-36341-6'>"
+ "<title>Cheaper by the Dozen</title>"
+ "<isbn:number>1568491379</isbn:number>"
+ "<notes>"
+ "<!-- make HTML the default namespace for some commentary -->"
+ "<p xmlns='urn:w3-org-ns:HTML'>"
+ "This is a <i>funny</i> book!"
+ "</p>"
+ "</notes>"
+ "</book>") '()
+ '(*TOP* (urn:loc.gov:books:book
+ (urn:loc.gov:books:title "Cheaper by the Dozen")
+ (urn:ISBN:0-395-36341-6:number "1568491379")
+ (urn:loc.gov:books:notes
+ (urn:w3-org-ns:HTML:p
+ "This is a " (urn:w3-org-ns:HTML:i "funny")
+ " book!")))))
+
+ (test (string-append
+ "<Beers>"
+ "<!-- the default namespace is now that of HTML -->"
+ "<table xmlns='http://www.w3.org/TR/REC-html40'>"
+ "<th><td>Name</td><td>Origin</td><td>Description</td></th>"
+ "<tr>"
+ "<!-- no default namespace inside table cells -->"
+ "<td><brandName xmlns=\"\">Huntsman</brandName></td>"
+ "<td><origin xmlns=''>Bath, UK</origin></td>"
+ "<td>"
+ "<details xmlns=''><class>Bitter</class><hop>Fuggles</hop>"
+ "<pro>Wonderful hop, light alcohol, good summer beer</pro>"
+ "<con>Fragile; excessive variance pub to pub</con>"
+ "</details>"
+ "</td>"
+ "</tr>"
+ "</table>"
+ "</Beers>")
+ '((html . "http://www.w3.org/TR/REC-html40"))
+ '(*TOP*
+ (@ (*NAMESPACES* (html "http://www.w3.org/TR/REC-html40")))
+ (Beers (html:table
+ (html:th (html:td "Name")
+ (html:td "Origin")
+ (html:td "Description"))
+ (html:tr (html:td (brandName "Huntsman"))
+ (html:td (origin "Bath, UK"))
+ (html:td
+ (details
+ (class "Bitter")
+ (hop "Fuggles")
+ (pro "Wonderful hop, light alcohol, good summer beer")
+ (con "Fragile; excessive variance pub to pub"))))))))
+
+ (test (string-append
+ "<!-- 1 --><RESERVATION xmlns:HTML='http://www.w3.org/TR/REC-html40'>"
+ "<!-- 2 --><NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>"
+ "<!-- 3 --><SEAT CLASS='Y' HTML:CLASS=\"largeMonotype\">33B</SEAT>"
+ "<!-- 4 --><HTML:A HREF='/cgi-bin/ResStatus'>Check Status</HTML:A>"
+ "<!-- 5 --><DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>")
+ '((HTML . "http://www.w3.org/TR/REC-html40"))
+ '(*TOP*
+ (@ (*NAMESPACES* (HTML "http://www.w3.org/TR/REC-html40")))
+ (RESERVATION
+ (NAME (@ (HTML:CLASS "largeSansSerif")) "Layman, A")
+ (SEAT (@ (HTML:CLASS "largeMonotype") (CLASS "Y")) "33B")
+ (HTML:A (@ (HREF "/cgi-bin/ResStatus")) "Check Status")
+ (DEPARTURE "1997-05-24T07:55:00+1"))))
+ ; Part of RDF from the XML Infoset
+ (test (string-concatenate/shared '(
+ "<?xml version='1.0' encoding='utf-8' standalone='yes'?>"
+ "<!-- this can be decoded as US-ASCII or iso-8859-1 as well,"
+ " since it contains no characters outside the US-ASCII repertoire -->"
+ "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'"
+ " xmlns:rdfs='http://www.w3.org/2000/01/rdf-schema#'"
+ " xmlns='http://www.w3.org/2001/02/infoset#'>"
+ "<rdfs:Class ID='Boolean'/>"
+ "<Boolean ID='Boolean.true'/>"
+ "<Boolean ID='Boolean.false'/>"
+ "<!--Info item classes-->"
+ "<rdfs:Class ID='InfoItem'/>"
+ "<rdfs:Class ID='Document' rdfs:subClassOf='#InfoItem'/>"
+ "<rdfs:Class ID='Element' rdfs:subClassOf='#InfoItem'/>"
+ "<rdfs:Class ID='Attribute' rdfs:subClassOf='#InfoItem'/>"
+ "<rdfs:Class ID='InfoItemSet'
+ rdfs:subClassOf='http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag'/>"
+ "<rdfs:Class ID='AttributeSet' rdfs:subClassOf='#InfoItemSet'/>"
+ "<!--Info item properties-->"
+ "<rdfs:Property ID='allDeclarationsProcessed'>"
+ "<rdfs:domain resource='#Document'/>"
+ "<rdfs:range resource='#Boolean'/></rdfs:Property>"
+ "<rdfs:Property ID='attributes'>"
+ "<rdfs:domain resource='#Element'/>"
+ "<rdfs:range resource='#AttributeSet'/>"
+ "</rdfs:Property>"
+ "</rdf:RDF>"))
+ '((RDF . "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+ (RDFS . "http://www.w3.org/2000/01/rdf-schema#")
+ (ISET . "http://www.w3.org/2001/02/infoset#"))
+ '(*TOP* (@ (*NAMESPACES*
+ (RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+ (RDFS "http://www.w3.org/2000/01/rdf-schema#")
+ (ISET "http://www.w3.org/2001/02/infoset#")))
+ (*PI* xml "version='1.0' encoding='utf-8' standalone='yes'")
+ (RDF:RDF
+ (RDFS:Class (@ (ID "Boolean")))
+ (ISET:Boolean (@ (ID "Boolean.true")))
+ (ISET:Boolean (@ (ID "Boolean.false")))
+ (RDFS:Class (@ (ID "InfoItem")))
+ (RDFS:Class (@ (RDFS:subClassOf "#InfoItem") (ID "Document")))
+ (RDFS:Class (@ (RDFS:subClassOf "#InfoItem") (ID "Element")))
+ (RDFS:Class (@ (RDFS:subClassOf "#InfoItem") (ID "Attribute")))
+ (RDFS:Class
+ (@ (RDFS:subClassOf
+ "http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag")
+ (ID "InfoItemSet")))
+ (RDFS:Class
+ (@ (RDFS:subClassOf "#InfoItemSet") (ID "AttributeSet")))
+ (RDFS:Property
+ (@ (ID "allDeclarationsProcessed"))
+ (RDFS:domain (@ (resource "#Document")))
+ (RDFS:range (@ (resource "#Boolean"))))
+ (RDFS:Property
+ (@ (ID "attributes"))
+ (RDFS:domain (@ (resource "#Element")))
+ (RDFS:range (@ (resource "#AttributeSet")))))))
+
+ ; Part of RDF from RSS of the Daemon News Mall
+ (test (string-concatenate/shared (list-intersperse '(
+ "<?xml version='1.0'?><rdf:RDF "
+ "xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' "
+ "xmlns='http://my.netscape.com/rdf/simple/0.9/'>"
+ "<channel>"
+ "<title>Daemon News Mall</title>"
+ "<link>http://mall.daemonnews.org/</link>"
+ "<description>Central source for all your BSD needs</description>"
+ "</channel>"
+ "<item>"
+ "<title>Daemon News Jan/Feb Issue NOW Available! Subscribe $24.95</title>"
+ "<link>http://mall.daemonnews.org/?page=shop/flypage&amp;product_id=880</link>"
+ "</item>"
+ "<item>"
+ "<title>The Design and Implementation of the 4.4BSD Operating System $54.95</title>"
+ "<link>http://mall.daemonnews.org/?page=shop/flypage&amp;product_id=912&amp;category_id=1761</link>"
+ "</item>"
+ "</rdf:RDF>")
+ (string #\newline)
+ ))
+ '((RDF . "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+ (RSS . "http://my.netscape.com/rdf/simple/0.9/")
+ (ISET . "http://www.w3.org/2001/02/infoset#"))
+ `(*TOP* (@ (*NAMESPACES*
+ (RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+ (RSS "http://my.netscape.com/rdf/simple/0.9/")
+ (ISET "http://www.w3.org/2001/02/infoset#")))
+ (*PI* xml "version='1.0'")
+ (RDF:RDF ,nl
+ (RSS:channel ,nl
+ (RSS:title "Daemon News Mall") ,nl
+ (RSS:link "http://mall.daemonnews.org/") ,nl
+ (RSS:description "Central source for all your BSD needs") ,nl) ,nl
+ (RSS:item ,nl
+ (RSS:title
+ "Daemon News Jan/Feb Issue NOW Available! Subscribe $24.95") ,nl
+ (RSS:link
+ "http://mall.daemonnews.org/?page=shop/flypage&product_id=880") ,nl) ,nl
+ (RSS:item ,nl
+ (RSS:title
+ "The Design and Implementation of the 4.4BSD Operating System $54.95") ,nl
+ (RSS:link
+ "http://mall.daemonnews.org/?page=shop/flypage&product_id=912&category_id=1761") ,nl) ,nl)))
+
+ (test (string-concatenate/shared
+ '("<Forecasts TStamp='958082142'>"
+ "<TAF TStamp='958066200' LatLon='36.583, -121.850' BId='724915'"
+ " SName='KMRY, MONTEREY PENINSULA'>"
+ "<VALID TRange='958068000, 958154400'>111730Z 111818</VALID>"
+ "<PERIOD TRange='958068000, 958078800'>"
+ "<PREVAILING>31010KT P6SM FEW030</PREVAILING>"
+ "</PERIOD>"
+ "<PERIOD TRange='958078800, 958104000' Title='FM2100'>"
+ "<PREVAILING>29016KT P6SM FEW040</PREVAILING>"
+ "</PERIOD>"
+ "<PERIOD TRange='958104000, 958154400' Title='FM0400'>"
+ "<PREVAILING>29010KT P6SM SCT200</PREVAILING>"
+ "<VAR Title='BECMG 0708' TRange='958114800, 958118400'>VRB05KT</VAR>"
+ "</PERIOD></TAF>"
+ "</Forecasts>"))
+ '()
+ '(*TOP* (Forecasts
+ (@ (TStamp "958082142"))
+ (TAF (@ (TStamp "958066200")
+ (SName "KMRY, MONTEREY PENINSULA")
+ (LatLon "36.583, -121.850")
+ (BId "724915"))
+ (VALID (@ (TRange "958068000, 958154400")) "111730Z 111818")
+ (PERIOD (@ (TRange "958068000, 958078800"))
+ (PREVAILING "31010KT P6SM FEW030"))
+ (PERIOD (@ (Title "FM2100") (TRange "958078800, 958104000"))
+ (PREVAILING "29016KT P6SM FEW040"))
+ (PERIOD (@ (Title "FM0400") (TRange "958104000, 958154400"))
+ (PREVAILING "29010KT P6SM SCT200")
+ (VAR (@ (Title "BECMG 0708")
+ (TRange "958114800, 958118400"))
+ "VRB05KT"))))))
+))
+
+(run-test
+ (newline)
+ (display "All tests passed")
+ (newline)
+)
diff --git a/module/sxml/upstream/SXML-tree-trans.scm b/module/sxml/upstream/SXML-tree-trans.scm
new file mode 100644
index 000000000..f2c3293ca
--- /dev/null
+++ b/module/sxml/upstream/SXML-tree-trans.scm
@@ -0,0 +1,249 @@
+; XML/HTML processing in Scheme
+; SXML expression tree transformers
+;
+; IMPORT
+; A prelude appropriate for your Scheme system
+; (myenv-bigloo.scm, myenv-mit.scm, etc.)
+;
+; EXPORT
+; (provide SRV:send-reply
+; post-order pre-post-order replace-range)
+;
+; See vSXML-tree-trans.scm for the validation code, which also
+; serves as usage examples.
+;
+; $Id: SXML-tree-trans.scm,v 1.6 2003/04/25 19:16:15 oleg Exp $
+
+
+; Output the 'fragments'
+; The fragments are a list of strings, characters,
+; numbers, thunks, #f, #t -- and other fragments.
+; The function traverses the tree depth-first, writes out
+; strings and characters, executes thunks, and ignores
+; #f and '().
+; The function returns #t if anything was written at all;
+; otherwise the result is #f
+; If #t occurs among the fragments, it is not written out
+; but causes the result of SRV:send-reply to be #t
+
+(define (SRV:send-reply . fragments)
+ (let loop ((fragments fragments) (result #f))
+ (cond
+ ((null? fragments) result)
+ ((not (car fragments)) (loop (cdr fragments) result))
+ ((null? (car fragments)) (loop (cdr fragments) result))
+ ((eq? #t (car fragments)) (loop (cdr fragments) #t))
+ ((pair? (car fragments))
+ (loop (cdr fragments) (loop (car fragments) result)))
+ ((procedure? (car fragments))
+ ((car fragments))
+ (loop (cdr fragments) #t))
+ (else
+ (display (car fragments))
+ (loop (cdr fragments) #t)))))
+
+
+
+;------------------------------------------------------------------------
+; Traversal of an SXML tree or a grove:
+; a <Node> or a <Nodelist>
+;
+; A <Node> and a <Nodelist> are mutually-recursive datatypes that
+; underlie the SXML tree:
+; <Node> ::= (name . <Nodelist>) | "text string"
+; An (ordered) set of nodes is just a list of the constituent nodes:
+; <Nodelist> ::= (<Node> ...)
+; Nodelists, and Nodes other than text strings are both lists. A
+; <Nodelist> however is either an empty list, or a list whose head is
+; not a symbol (an atom in general). A symbol at the head of a node is
+; either an XML name (in which case it's a tag of an XML element), or
+; an administrative name such as '@'.
+; See SXPath.scm and SSAX.scm for more information on SXML.
+
+
+; Pre-Post-order traversal of a tree and creation of a new tree:
+; pre-post-order:: <tree> x <bindings> -> <new-tree>
+; where
+; <bindings> ::= (<binding> ...)
+; <binding> ::= (<trigger-symbol> *preorder* . <handler>) |
+; (<trigger-symbol> *macro* . <handler>) |
+; (<trigger-symbol> <new-bindings> . <handler>) |
+; (<trigger-symbol> . <handler>)
+; <trigger-symbol> ::= XMLname | *text* | *default*
+; <handler> :: <trigger-symbol> x [<tree>] -> <new-tree>
+;
+; The pre-post-order function visits the nodes and nodelists
+; pre-post-order (depth-first). For each <Node> of the form (name
+; <Node> ...) it looks up an association with the given 'name' among
+; its <bindings>. If failed, pre-post-order tries to locate a
+; *default* binding. It's an error if the latter attempt fails as
+; well. Having found a binding, the pre-post-order function first
+; checks to see if the binding is of the form
+; (<trigger-symbol> *preorder* . <handler>)
+; If it is, the handler is 'applied' to the current node. Otherwise,
+; the pre-post-order function first calls itself recursively for each
+; child of the current node, with <new-bindings> prepended to the
+; <bindings> in effect. The result of these calls is passed to the
+; <handler> (along with the head of the current <Node>). To be more
+; precise, the handler is _applied_ to the head of the current node
+; and its processed children. The result of the handler, which should
+; also be a <tree>, replaces the current <Node>. If the current <Node>
+; is a text string or other atom, a special binding with a symbol
+; *text* is looked up.
+;
+; A binding can also be of a form
+; (<trigger-symbol> *macro* . <handler>)
+; This is equivalent to *preorder* described above. However, the result
+; is re-processed again, with the current stylesheet.
+
+(define (pre-post-order tree bindings)
+ (let* ((default-binding (assq '*default* bindings))
+ (text-binding (or (assq '*text* bindings) default-binding))
+ (text-handler ; Cache default and text bindings
+ (and text-binding
+ (if (procedure? (cdr text-binding))
+ (cdr text-binding) (cddr text-binding)))))
+ (let loop ((tree tree))
+ (cond
+ ((null? tree) '())
+ ((not (pair? tree))
+ (let ((trigger '*text*))
+ (if text-handler (text-handler trigger tree)
+ (error "Unknown binding for " trigger " and no default"))))
+ ((not (symbol? (car tree))) (map loop tree)) ; tree is a nodelist
+ (else ; tree is an SXML node
+ (let* ((trigger (car tree))
+ (binding (or (assq trigger bindings) default-binding)))
+ (cond
+ ((not binding)
+ (error "Unknown binding for " trigger " and no default"))
+ ((not (pair? (cdr binding))) ; must be a procedure: handler
+ (apply (cdr binding) trigger (map loop (cdr tree))))
+ ((eq? '*preorder* (cadr binding))
+ (apply (cddr binding) tree))
+ ((eq? '*macro* (cadr binding))
+ (loop (apply (cddr binding) tree)))
+ (else ; (cadr binding) is a local binding
+ (apply (cddr binding) trigger
+ (pre-post-order (cdr tree) (append (cadr binding) bindings)))
+ ))))))))
+
+; post-order is a strict subset of pre-post-order without *preorder*
+; (let alone *macro*) traversals.
+; Now pre-post-order is actually faster than the old post-order.
+; The function post-order is deprecated and is aliased below for
+; backward compatibility.
+(define post-order pre-post-order)
+
+;------------------------------------------------------------------------
+; Extended tree fold
+; tree = atom | (node-name tree ...)
+;
+; foldts fdown fup fhere seed (Leaf str) = fhere seed str
+; foldts fdown fup fhere seed (Nd kids) =
+; fup seed $ foldl (foldts fdown fup fhere) (fdown seed) kids
+
+; procedure fhere: seed -> atom -> seed
+; procedure fdown: seed -> node -> seed
+; procedure fup: parent-seed -> last-kid-seed -> node -> seed
+; foldts returns the final seed
+
+(define (foldts fdown fup fhere seed tree)
+ (cond
+ ((null? tree) seed)
+ ((not (pair? tree)) ; An atom
+ (fhere seed tree))
+ (else
+ (let loop ((kid-seed (fdown seed tree)) (kids (cdr tree)))
+ (if (null? kids)
+ (fup seed kid-seed tree)
+ (loop (foldts fdown fup fhere kid-seed (car kids))
+ (cdr kids)))))))
+
+;------------------------------------------------------------------------
+; Traverse a forest depth-first and cut/replace ranges of nodes.
+;
+; The nodes that define a range don't have to have the same immediate
+; parent, don't have to be on the same level, and the end node of a
+; range doesn't even have to exist. A replace-range procedure removes
+; nodes from the beginning node of the range up to (but not including)
+; the end node of the range. In addition, the beginning node of the
+; range can be replaced by a node or a list of nodes. The range of
+; nodes is cut while depth-first traversing the forest. If all
+; branches of the node are cut a node is cut as well. The procedure
+; can cut several non-overlapping ranges from a forest.
+
+; replace-range:: BEG-PRED x END-PRED x FOREST -> FOREST
+; where
+; type FOREST = (NODE ...)
+; type NODE = Atom | (Name . FOREST) | FOREST
+;
+; The range of nodes is specified by two predicates, beg-pred and end-pred.
+; beg-pred:: NODE -> #f | FOREST
+; end-pred:: NODE -> #f | FOREST
+; The beg-pred predicate decides on the beginning of the range. The node
+; for which the predicate yields non-#f marks the beginning of the range
+; The non-#f value of the predicate replaces the node. The value can be a
+; list of nodes. The replace-range procedure then traverses the tree and skips
+; all the nodes, until the end-pred yields non-#f. The value of the end-pred
+; replaces the end-range node. The new end node and its brothers will be
+; re-scanned.
+; The predicates are evaluated pre-order. We do not descend into a node that
+; is marked as the beginning of the range.
+
+(define (replace-range beg-pred end-pred forest)
+
+ ; loop forest keep? new-forest
+ ; forest is the forest to traverse
+ ; new-forest accumulates the nodes we will keep, in the reverse
+ ; order
+ ; If keep? is #t, keep the curr node if atomic. If the node is not atomic,
+ ; traverse its children and keep those that are not in the skip range.
+ ; If keep? is #f, skip the current node if atomic. Otherwise,
+ ; traverse its children. If all children are skipped, skip the node
+ ; as well.
+
+ (define (loop forest keep? new-forest)
+ (if (null? forest) (values (reverse new-forest) keep?)
+ (let ((node (car forest)))
+ (if keep?
+ (cond ; accumulate mode
+ ((beg-pred node) => ; see if the node starts the skip range
+ (lambda (repl-branches) ; if so, skip/replace the node
+ (loop (cdr forest) #f
+ (append (reverse repl-branches) new-forest))))
+ ((not (pair? node)) ; it's an atom, keep it
+ (loop (cdr forest) keep? (cons node new-forest)))
+ (else
+ (let*-values
+ (((node?) (symbol? (car node))) ; or is it a nodelist?
+ ((new-kids keep?) ; traverse its children
+ (loop (if node? (cdr node) node) #t '())))
+ (loop (cdr forest) keep?
+ (cons
+ (if node? (cons (car node) new-kids) new-kids)
+ new-forest)))))
+ ; skip mode
+ (cond
+ ((end-pred node) => ; end the skip range
+ (lambda (repl-branches) ; repl-branches will be re-scanned
+ (loop (append repl-branches (cdr forest)) #t
+ new-forest)))
+ ((not (pair? node)) ; it's an atom, skip it
+ (loop (cdr forest) keep? new-forest))
+ (else
+ (let*-values
+ (((node?) (symbol? (car node))) ; or is it a nodelist?
+ ((new-kids keep?) ; traverse its children
+ (loop (if node? (cdr node) node) #f '())))
+ (loop (cdr forest) keep?
+ (if (or keep? (pair? new-kids))
+ (cons
+ (if node? (cons (car node) new-kids) new-kids)
+ new-forest)
+ new-forest) ; if all kids are skipped
+ )))))))) ; skip the node too
+
+ (let*-values (((new-forest keep?) (loop forest #t '())))
+ new-forest))
+
diff --git a/module/sxml/upstream/SXPath-old.scm b/module/sxml/upstream/SXPath-old.scm
new file mode 100644
index 000000000..cf4526ed6
--- /dev/null
+++ b/module/sxml/upstream/SXPath-old.scm
@@ -0,0 +1,1216 @@
+; XML processing in Scheme
+; SXPath -- SXML Query Language
+;
+; SXPath is a query language for SXML, an instance of XML Information
+; set (Infoset) in the form of s-expressions. See SSAX.scm for the
+; definition of SXML and more details. SXPath is also a translation into
+; Scheme of an XML Path Language, XPath:
+; http://www.w3.org/TR/xpath
+; XPath and SXPath describe means of selecting a set of Infoset's items
+; or their properties.
+;
+; To facilitate queries, XPath maps the XML Infoset into an explicit
+; tree, and introduces important notions of a location path and a
+; current, context node. A location path denotes a selection of a set of
+; nodes relative to a context node. Any XPath tree has a distinguished,
+; root node -- which serves as the context node for absolute location
+; paths. Location path is recursively defined as a location step joined
+; with a location path. A location step is a simple query of the
+; database relative to a context node. A step may include expressions
+; that further filter the selected set. Each node in the resulting set
+; is used as a context node for the adjoining location path. The result
+; of the step is a union of the sets returned by the latter location
+; paths.
+;
+; The SXML representation of the XML Infoset (see SSAX.scm) is rather
+; suitable for querying as it is. Bowing to the XPath specification,
+; we will refer to SXML information items as 'Nodes':
+; <Node> ::= <Element> | <attributes-coll> | <attrib>
+; | "text string" | <PI>
+; This production can also be described as
+; <Node> ::= (name . <Nodeset>) | "text string"
+; An (ordered) set of nodes is just a list of the constituent nodes:
+; <Nodeset> ::= (<Node> ...)
+; Nodesets, and Nodes other than text strings are both lists. A
+; <Nodeset> however is either an empty list, or a list whose head is not
+; a symbol. A symbol at the head of a node is either an XML name (in
+; which case it's a tag of an XML element), or an administrative name
+; such as '@'. This uniform list representation makes processing rather
+; simple and elegant, while avoiding confusion. The multi-branch tree
+; structure formed by the mutually-recursive datatypes <Node> and
+; <Nodeset> lends itself well to processing by functional languages.
+;
+; A location path is in fact a composite query over an XPath tree or
+; its branch. A singe step is a combination of a projection, selection
+; or a transitive closure. Multiple steps are combined via join and
+; union operations. This insight allows us to _elegantly_ implement
+; XPath as a sequence of projection and filtering primitives --
+; converters -- joined by _combinators_. Each converter takes a node
+; and returns a nodeset which is the result of the corresponding query
+; relative to that node. A converter can also be called on a set of
+; nodes. In that case it returns a union of the corresponding queries over
+; each node in the set. The union is easily implemented as a list
+; append operation as all nodes in a SXML tree are considered
+; distinct, by XPath conventions. We also preserve the order of the
+; members in the union. Query combinators are high-order functions:
+; they take converter(s) (which is a Node|Nodeset -> Nodeset function)
+; and compose or otherwise combine them. We will be concerned with
+; only relative location paths [XPath]: an absolute location path is a
+; relative path applied to the root node.
+;
+; Similarly to XPath, SXPath defines full and abbreviated notations
+; for location paths. In both cases, the abbreviated notation can be
+; mechanically expanded into the full form by simple rewriting
+; rules. In case of SXPath the corresponding rules are given as
+; comments to a sxpath function, below. The regression test suite at
+; the end of this file shows a representative sample of SXPaths in
+; both notations, juxtaposed with the corresponding XPath
+; expressions. Most of the samples are borrowed literally from the
+; XPath specification, while the others are adjusted for our running
+; example, tree1.
+;
+; To do:
+; Rename filter to node-filter or ns-filter
+; Use ;=== for chapters, ;--- for sections, and ;^^^ for end sections
+;
+; $Id: SXPath-old.scm,v 1.4 2004/07/07 16:02:31 sperber Exp $
+
+
+ ; See http://pobox.com/~oleg/ftp/Scheme/myenv.scm
+ ; See http://pobox.com/~oleg/ftp/Scheme/myenv-scm.scm
+ ; See http://pobox.com/~oleg/ftp/Scheme/myenv-bigloo.scm
+;(module SXPath
+; (include "myenv-bigloo.scm")) ; For use with Bigloo 2.2b
+;(load "myenv-scm.scm") ; For use with SCM v5d2
+;(include "myenv.scm") ; For use with Gambit-C 3.0
+
+
+
+(define (nodeset? x)
+ (or (and (pair? x) (not (symbol? (car x)))) (null? x)))
+
+;-------------------------
+; Basic converters and applicators
+; A converter is a function
+; type Converter = Node|Nodeset -> Nodeset
+; A converter can also play a role of a predicate: in that case, if a
+; converter, applied to a node or a nodeset, yields a non-empty
+; nodeset, the converter-predicate is deemed satisfied. Throughout
+; this file a nil nodeset is equivalent to #f in denoting a failure.
+
+; The following function implements a 'Node test' as defined in
+; Sec. 2.3 of XPath document. A node test is one of the components of a
+; location step. It is also a converter-predicate in SXPath.
+;
+; The function node-typeof? takes a type criterion and returns a function,
+; which, when applied to a node, will tell if the node satisfies
+; the test.
+; node-typeof? :: Crit -> Node -> Boolean
+;
+; The criterion 'crit' is a symbol, one of the following:
+; id - tests if the Node has the right name (id)
+; @ - tests if the Node is an <attributes-coll>
+; * - tests if the Node is an <Element>
+; *text* - tests if the Node is a text node
+; *PI* - tests if the Node is a PI node
+; *any* - #t for any type of Node
+
+(define (node-typeof? crit)
+ (lambda (node)
+ (case crit
+ ((*) (and (pair? node) (not (memq (car node) '(@ *PI*)))))
+ ((*any*) #t)
+ ((*text*) (string? node))
+ (else
+ (and (pair? node) (eq? crit (car node))))
+)))
+
+
+; Curried equivalence converter-predicates
+(define (node-eq? other)
+ (lambda (node)
+ (eq? other node)))
+
+(define (node-equal? other)
+ (lambda (node)
+ (equal? other node)))
+
+; node-pos:: N -> Nodeset -> Nodeset, or
+; node-pos:: N -> Converter
+; Select the N'th element of a Nodeset and return as a singular Nodeset;
+; Return an empty nodeset if the Nth element does not exist.
+; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset,
+; if exists; ((node-pos 2) Nodeset) selects the Node after that, if
+; exists.
+; N can also be a negative number: in that case the node is picked from
+; the tail of the list.
+; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset;
+; ((node-pos -2) Nodeset) selects the last but one node, if exists.
+
+(define (node-pos n)
+ (lambda (nodeset)
+ (cond
+ ((not (nodeset? nodeset)) '())
+ ((null? nodeset) nodeset)
+ ((eqv? n 1) (list (car nodeset)))
+ ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset))
+ (else
+ (assert (positive? n))
+ ((node-pos (dec n)) (cdr nodeset))))))
+
+; filter:: Converter -> Converter
+; A filter applicator, which introduces a filtering context. The argument
+; converter is considered a predicate, with either #f or nil result meaning
+; failure.
+(define (filter pred?)
+ (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
+ (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '()))
+ (if (null? lst)
+ (reverse res)
+ (let ((pred-result (pred? (car lst))))
+ (loop (cdr lst)
+ (if (and pred-result (not (null? pred-result)))
+ (cons (car lst) res)
+ res)))))))
+
+; take-until:: Converter -> Converter, or
+; take-until:: Pred -> Node|Nodeset -> Nodeset
+; Given a converter-predicate and a nodeset, apply the predicate to
+; each element of the nodeset, until the predicate yields anything but #f or
+; nil. Return the elements of the input nodeset that have been processed
+; till that moment (that is, which fail the predicate).
+; take-until is a variation of the filter above: take-until passes
+; elements of an ordered input set till (but not including) the first
+; element that satisfies the predicate.
+; The nodeset returned by ((take-until (not pred)) nset) is a subset --
+; to be more precise, a prefix -- of the nodeset returned by
+; ((filter pred) nset)
+
+(define (take-until pred?)
+ (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
+ (let loop ((lst (if (nodeset? lst) lst (list lst))))
+ (if (null? lst) lst
+ (let ((pred-result (pred? (car lst))))
+ (if (and pred-result (not (null? pred-result)))
+ '()
+ (cons (car lst) (loop (cdr lst)))))
+ ))))
+
+
+; take-after:: Converter -> Converter, or
+; take-after:: Pred -> Node|Nodeset -> Nodeset
+; Given a converter-predicate and a nodeset, apply the predicate to
+; each element of the nodeset, until the predicate yields anything but #f or
+; nil. Return the elements of the input nodeset that have not been processed:
+; that is, return the elements of the input nodeset that follow the first
+; element that satisfied the predicate.
+; take-after along with take-until partition an input nodeset into three
+; parts: the first element that satisfies a predicate, all preceding
+; elements and all following elements.
+
+(define (take-after pred?)
+ (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
+ (let loop ((lst (if (nodeset? lst) lst (list lst))))
+ (if (null? lst) lst
+ (let ((pred-result (pred? (car lst))))
+ (if (and pred-result (not (null? pred-result)))
+ (cdr lst)
+ (loop (cdr lst))))
+ ))))
+
+; Apply proc to each element of lst and return the list of results.
+; if proc returns a nodeset, splice it into the result
+;
+; From another point of view, map-union is a function Converter->Converter,
+; which places an argument-converter in a joining context.
+
+(define (map-union proc lst)
+ (if (null? lst) lst
+ (let ((proc-res (proc (car lst))))
+ ((if (nodeset? proc-res) append cons)
+ proc-res (map-union proc (cdr lst))))))
+
+; node-reverse :: Converter, or
+; node-reverse:: Node|Nodeset -> Nodeset
+; Reverses the order of nodes in the nodeset
+; This basic converter is needed to implement a reverse document order
+; (see the XPath Recommendation).
+(define node-reverse
+ (lambda (node-or-nodeset)
+ (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset)
+ (reverse node-or-nodeset))))
+
+; node-trace:: String -> Converter
+; (node-trace title) is an identity converter. In addition it prints out
+; a node or nodeset it is applied to, prefixed with the 'title'.
+; This converter is very useful for debugging.
+
+(define (node-trace title)
+ (lambda (node-or-nodeset)
+ (cout nl "-->")
+ (display title)
+ (display " :")
+ (pretty-print node-or-nodeset)
+ node-or-nodeset))
+
+
+;-------------------------
+; Converter combinators
+;
+; Combinators are higher-order functions that transmogrify a converter
+; or glue a sequence of converters into a single, non-trivial
+; converter. The goal is to arrive at converters that correspond to
+; XPath location paths.
+;
+; From a different point of view, a combinator is a fixed, named
+; _pattern_ of applying converters. Given below is a complete set of
+; such patterns that together implement XPath location path
+; specification. As it turns out, all these combinators can be built
+; from a small number of basic blocks: regular functional composition,
+; map-union and filter applicators, and the nodeset union.
+
+
+
+; select-kids:: Pred -> Node -> Nodeset
+; Given a Node, return an (ordered) subset its children that satisfy
+; the Pred (a converter, actually)
+; select-kids:: Pred -> Nodeset -> Nodeset
+; The same as above, but select among children of all the nodes in
+; the Nodeset
+;
+; More succinctly, the signature of this function is
+; select-kids:: Converter -> Converter
+
+(define (select-kids test-pred?)
+ (lambda (node) ; node or node-set
+ (cond
+ ((null? node) node)
+ ((not (pair? node)) '()) ; No children
+ ((symbol? (car node))
+ ((filter test-pred?) (cdr node))) ; it's a single node
+ (else (map-union (select-kids test-pred?) node)))))
+
+
+; node-self:: Pred -> Node -> Nodeset, or
+; node-self:: Converter -> Converter
+; Similar to select-kids but apply to the Node itself rather
+; than to its children. The resulting Nodeset will contain either one
+; component, or will be empty (if the Node failed the Pred).
+(define node-self filter)
+
+
+; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or
+; node-join:: [Converter] -> Converter
+; join the sequence of location steps or paths as described
+; in the title comments above.
+(define (node-join . selectors)
+ (lambda (nodeset) ; Nodeset or node
+ (let loop ((nodeset nodeset) (selectors selectors))
+ (if (null? selectors) nodeset
+ (loop
+ (if (nodeset? nodeset)
+ (map-union (car selectors) nodeset)
+ ((car selectors) nodeset))
+ (cdr selectors))))))
+
+
+; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or
+; node-reduce:: [Converter] -> Converter
+; A regular functional composition of converters.
+; From a different point of view,
+; ((apply node-reduce converters) nodeset)
+; is equivalent to
+; (foldl apply nodeset converters)
+; i.e., folding, or reducing, a list of converters with the nodeset
+; as a seed.
+(define (node-reduce . converters)
+ (lambda (nodeset) ; Nodeset or node
+ (let loop ((nodeset nodeset) (converters converters))
+ (if (null? converters) nodeset
+ (loop ((car converters) nodeset) (cdr converters))))))
+
+
+; node-or:: [Converter] -> Converter
+; This combinator applies all converters to a given node and
+; produces the union of their results.
+; This combinator corresponds to a union, '|' operation for XPath
+; location paths.
+; (define (node-or . converters)
+; (lambda (node-or-nodeset)
+; (if (null? converters) node-or-nodeset
+; (append
+; ((car converters) node-or-nodeset)
+; ((apply node-or (cdr converters)) node-or-nodeset)))))
+; More optimal implementation follows
+(define (node-or . converters)
+ (lambda (node-or-nodeset)
+ (let loop ((result '()) (converters converters))
+ (if (null? converters) result
+ (loop (append result (or ((car converters) node-or-nodeset) '()))
+ (cdr converters))))))
+
+
+; node-closure:: Converter -> Converter
+; Select all _descendants_ of a node that satisfy a converter-predicate.
+; This combinator is similar to select-kids but applies to
+; grand... children as well.
+; This combinator implements the "descendant::" XPath axis
+; Conceptually, this combinator can be expressed as
+; (define (node-closure f)
+; (node-or
+; (select-kids f)
+; (node-reduce (select-kids (node-typeof? '*)) (node-closure f))))
+; This definition, as written, looks somewhat like a fixpoint, and it
+; will run forever. It is obvious however that sooner or later
+; (select-kids (node-typeof? '*)) will return an empty nodeset. At
+; this point further iterations will no longer affect the result and
+; can be stopped.
+
+(define (node-closure test-pred?)
+ (lambda (node) ; Nodeset or node
+ (let loop ((parent node) (result '()))
+ (if (null? parent) result
+ (loop ((select-kids (node-typeof? '*)) parent)
+ (append result
+ ((select-kids test-pred?) parent)))
+ ))))
+
+; node-parent:: RootNode -> Converter
+; (node-parent rootnode) yields a converter that returns a parent of a
+; node it is applied to. If applied to a nodeset, it returns the list
+; of parents of nodes in the nodeset. The rootnode does not have
+; to be the root node of the whole SXML tree -- it may be a root node
+; of a branch of interest.
+; Given the notation of Philip Wadler's paper on semantics of XSLT,
+; parent(x) = { y | y=subnode*(root), x=subnode(y) }
+; Therefore, node-parent is not the fundamental converter: it can be
+; expressed through the existing ones. Yet node-parent is a rather
+; convenient converter. It corresponds to a parent:: axis of SXPath.
+; Note that the parent:: axis can be used with an attribute node as well!
+
+(define (node-parent rootnode)
+ (lambda (node) ; Nodeset or node
+ (if (nodeset? node) (map-union (node-parent rootnode) node)
+ (let ((pred
+ (node-or
+ (node-reduce
+ (node-self (node-typeof? '*))
+ (select-kids (node-eq? node)))
+ (node-join
+ (select-kids (node-typeof? '@))
+ (select-kids (node-eq? node))))))
+ ((node-or
+ (node-self pred)
+ (node-closure pred))
+ rootnode)))))
+
+;-------------------------
+; Evaluate an abbreviated SXPath
+; sxpath:: AbbrPath -> Converter, or
+; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset
+; AbbrPath is a list. It is translated to the full SXPath according
+; to the following rewriting rules
+; (sxpath '()) -> (node-join)
+; (sxpath '(path-component ...)) ->
+; (node-join (sxpath1 path-component) (sxpath '(...)))
+; (sxpath1 '//) -> (node-or
+; (node-self (node-typeof? '*any*))
+; (node-closure (node-typeof? '*any*)))
+; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x))
+; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x))
+; (sxpath1 ?symbol) -> (select-kids (node-typeof? ?symbol)
+; (sxpath1 procedure) -> procedure
+; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...))
+; (sxpath1 '(path reducer ...)) ->
+; (node-reduce (sxpath path) (sxpathr reducer) ...)
+; (sxpathr number) -> (node-pos number)
+; (sxpathr path-filter) -> (filter (sxpath path-filter))
+
+(define (sxpath path)
+ (lambda (nodeset)
+ (let loop ((nodeset nodeset) (path path))
+ (cond
+ ((null? path) nodeset)
+ ((nodeset? nodeset)
+ (map-union (sxpath path) nodeset))
+ ((procedure? (car path))
+ (loop ((car path) nodeset) (cdr path)))
+ ((eq? '// (car path))
+ (loop
+ ((if (nodeset? nodeset) append cons) nodeset
+ ((node-closure (node-typeof? '*any*)) nodeset))
+ (cdr path)))
+ ((symbol? (car path))
+ (loop ((select-kids (node-typeof? (car path))) nodeset)
+ (cdr path)))
+ ((and (pair? (car path)) (eq? 'equal? (caar path)))
+ (loop ((select-kids (apply node-equal? (cdar path))) nodeset)
+ (cdr path)))
+ ((and (pair? (car path)) (eq? 'eq? (caar path)))
+ (loop ((select-kids (apply node-eq? (cdar path))) nodeset)
+ (cdr path)))
+ ((pair? (car path))
+ (let reducer ((nodeset
+ (if (symbol? (caar path))
+ ((select-kids (node-typeof? (caar path))) nodeset)
+ (loop nodeset (caar path))))
+ (reducing-path (cdar path)))
+ (cond
+ ((null? reducing-path) (loop nodeset (cdr path)))
+ ((number? (car reducing-path))
+ (reducer ((node-pos (car reducing-path)) nodeset)
+ (cdr reducing-path)))
+ (else
+ (reducer ((filter (sxpath (car reducing-path))) nodeset)
+ (cdr reducing-path))))))
+ (else
+ (error "Invalid path step: " (car path)))
+))))
+
+;------------------------------------------------------------------------
+; Sample XPath/SXPath expressions: regression test suite for the
+; implementation above.
+
+; A running example
+
+(define tree1
+ '(html
+ (head (title "Slides"))
+ (body
+ (p (@ (align "center"))
+ (table (@ (style "font-size: x-large"))
+ (tr
+ (td (@ (align "right")) "Talks ")
+ (td (@ (align "center")) " = ")
+ (td " slides + transition"))
+ (tr (td)
+ (td (@ (align "center")) " = ")
+ (td " data + control"))
+ (tr (td)
+ (td (@ (align "center")) " = ")
+ (td " programs"))))
+ (ul
+ (li (a (@ (href "slides/slide0001.gif")) "Introduction"))
+ (li (a (@ (href "slides/slide0010.gif")) "Summary")))
+ )))
+
+
+; Example from a posting "Re: DrScheme and XML",
+; Shriram Krishnamurthi, comp.lang.scheme, Nov. 26. 1999.
+; http://www.deja.com/getdoc.xp?AN=553507805
+(define tree3
+ '(poem (@ (title "The Lovesong of J. Alfred Prufrock")
+ (poet "T. S. Eliot"))
+ (stanza
+ (line "Let us go then, you and I,")
+ (line "When the evening is spread out against the sky")
+ (line "Like a patient etherized upon a table:"))
+ (stanza
+ (line "In the room the women come and go")
+ (line "Talking of Michaelangelo."))))
+
+; Validation Test harness
+
+(define-syntax run-test
+ (syntax-rules (define)
+ ((run-test "scan-exp" (define vars body))
+ (define vars (run-test "scan-exp" body)))
+ ((run-test "scan-exp" ?body)
+ (letrec-syntax
+ ((scan-exp ; (scan-exp body k)
+ (syntax-rules (quote quasiquote !)
+ ((scan-exp '() (k-head ! . args))
+ (k-head '() . args))
+ ((scan-exp (quote (hd . tl)) k)
+ (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k)))
+ ((scan-exp (quasiquote (hd . tl)) k)
+ (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k)))
+ ((scan-exp (quote x) (k-head ! . args))
+ (k-head
+ (if (string? (quote x)) (string->symbol (quote x)) (quote x))
+ . args))
+ ((scan-exp (hd . tl) k)
+ (scan-exp hd (do-tl ! scan-exp tl k)))
+ ((scan-exp x (k-head ! . args))
+ (k-head x . args))))
+ (do-tl
+ (syntax-rules (!)
+ ((do-tl processed-hd fn () (k-head ! . args))
+ (k-head (processed-hd) . args))
+ ((do-tl processed-hd fn old-tl k)
+ (fn old-tl (do-cons ! processed-hd k)))))
+ (do-cons
+ (syntax-rules (!)
+ ((do-cons processed-tl processed-hd (k-head ! . args))
+ (k-head (processed-hd . processed-tl) . args))))
+ (do-wrap
+ (syntax-rules (!)
+ ((do-wrap val fn (k-head ! . args))
+ (k-head (fn val) . args))))
+ (do-finish
+ (syntax-rules ()
+ ((do-finish new-body) new-body)))
+
+ (scan-lit-lst ; scan literal list
+ (syntax-rules (quote unquote unquote-splicing !)
+ ((scan-lit-lst '() (k-head ! . args))
+ (k-head '() . args))
+ ((scan-lit-lst (quote (hd . tl)) k)
+ (do-tl quote scan-lit-lst ((hd . tl)) k))
+ ((scan-lit-lst (unquote x) k)
+ (scan-exp x (do-wrap ! unquote k)))
+ ((scan-lit-lst (unquote-splicing x) k)
+ (scan-exp x (do-wrap ! unquote-splicing k)))
+ ((scan-lit-lst (quote x) (k-head ! . args))
+ (k-head
+ ,(if (string? (quote x)) (string->symbol (quote x)) (quote x))
+ . args))
+ ((scan-lit-lst (hd . tl) k)
+ (scan-lit-lst hd (do-tl ! scan-lit-lst tl k)))
+ ((scan-lit-lst x (k-head ! . args))
+ (k-head x . args))))
+ )
+ (scan-exp ?body (do-finish !))))
+ ((run-test body ...)
+ (begin
+ (run-test "scan-exp" body) ...))
+))
+
+; Overwrite the above macro to switch the tests off
+; (define-macro (run-test selector node expected-result) #f)
+
+; Location path, full form: child::para
+; Location path, abbreviated form: para
+; selects the para element children of the context node
+
+(let ((tree
+ '(elem (@) (para (@) "para") (br (@)) "cdata" (para (@) "second par"))
+ )
+ (expected '((para (@) "para") (para (@) "second par")))
+ )
+ (run-test (select-kids (node-typeof? 'para)) tree expected)
+ (run-test (sxpath '(para)) tree expected)
+)
+
+; Location path, full form: child::*
+; Location path, abbreviated form: *
+; selects all element children of the context node
+
+(let ((tree
+ '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par"))
+ )
+ (expected
+ '((para (@) "para") (br (@)) (para "second par")))
+ )
+ (run-test (select-kids (node-typeof? '*)) tree expected)
+ (run-test (sxpath '(*)) tree expected)
+)
+
+
+
+; Location path, full form: child::text()
+; Location path, abbreviated form: text()
+; selects all text node children of the context node
+(let ((tree
+ '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par"))
+ )
+ (expected
+ '("cdata"))
+ )
+ (run-test (select-kids (node-typeof? '*text*)) tree expected)
+ (run-test (sxpath '(*text*)) tree expected)
+)
+
+
+; Location path, full form: child::node()
+; Location path, abbreviated form: node()
+; selects all the children of the context node, whatever their node type
+(let* ((tree
+ '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par"))
+ )
+ (expected (cdr tree))
+ )
+ (run-test (select-kids (node-typeof? '*any*)) tree expected)
+ (run-test (sxpath '(*any*)) tree expected)
+)
+
+; Location path, full form: child::*/child::para
+; Location path, abbreviated form: */para
+; selects all para grandchildren of the context node
+
+(let ((tree
+ '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para "third para")))
+ )
+ (expected
+ '((para "third para")))
+ )
+ (run-test
+ (node-join (select-kids (node-typeof? '*))
+ (select-kids (node-typeof? 'para)))
+ tree expected)
+ (run-test (sxpath '(* para)) tree expected)
+)
+
+
+; Location path, full form: attribute::name
+; Location path, abbreviated form: @name
+; selects the 'name' attribute of the context node
+
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para (@) "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (expected
+ '((name "elem")))
+ )
+ (run-test
+ (node-join (select-kids (node-typeof? '@))
+ (select-kids (node-typeof? 'name)))
+ tree expected)
+ (run-test (sxpath '(@ name)) tree expected)
+)
+
+; Location path, full form: attribute::*
+; Location path, abbreviated form: @*
+; selects all the attributes of the context node
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (expected
+ '((name "elem") (id "idz")))
+ )
+ (run-test
+ (node-join (select-kids (node-typeof? '@))
+ (select-kids (node-typeof? '*)))
+ tree expected)
+ (run-test (sxpath '(@ *)) tree expected)
+)
+
+
+; Location path, full form: descendant::para
+; Location path, abbreviated form: .//para
+; selects the para element descendants of the context node
+
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (expected
+ '((para (@) "para") (para "second par") (para (@) "third para")))
+ )
+ (run-test
+ (node-closure (node-typeof? 'para))
+ tree expected)
+ (run-test (sxpath '(// para)) tree expected)
+)
+
+; Location path, full form: self::para
+; Location path, abbreviated form: _none_
+; selects the context node if it is a para element; otherwise selects nothing
+
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ )
+ (run-test (node-self (node-typeof? 'para)) tree '())
+ (run-test (node-self (node-typeof? 'elem)) tree (list tree))
+)
+
+; Location path, full form: descendant-or-self::node()
+; Location path, abbreviated form: //
+; selects the context node, all the children (including attribute nodes)
+; of the context node, and all the children of all the (element)
+; descendants of the context node.
+; This is _almost_ a powerset of the context node.
+(let* ((tree
+ '(para (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (expected
+ (cons tree
+ (append (cdr tree)
+ '((@) "para" (@) "second par"
+ (@ (name "aa")) (para (@) "third para")
+ (@) "third para"))))
+ )
+ (run-test
+ (node-or
+ (node-self (node-typeof? '*any*))
+ (node-closure (node-typeof? '*any*)))
+ tree expected)
+ (run-test (sxpath '(//)) tree expected)
+)
+
+; Location path, full form: ancestor::div
+; Location path, abbreviated form: _none_
+; selects all div ancestors of the context node
+; This Location expression is equivalent to the following:
+; /descendant-or-self::div[descendant::node() = curr_node]
+; This shows that the ancestor:: axis is actually redundant. Still,
+; it can be emulated as the following SXPath expression demonstrates.
+
+; The insight behind "ancestor::div" -- selecting all "div" ancestors
+; of the current node -- is
+; S[ancestor::div] context_node =
+; { y | y=subnode*(root), context_node=subnode(subnode*(y)),
+; isElement(y), name(y) = "div" }
+; We observe that
+; { y | y=subnode*(root), pred(y) }
+; can be expressed in SXPath as
+; ((node-or (node-self pred) (node-closure pred)) root-node)
+; The composite predicate 'isElement(y) & name(y) = "div"' corresponds to
+; (node-self (node-typeof? 'div)) in SXPath. Finally, filter
+; context_node=subnode(subnode*(y)) is tantamount to
+; (node-closure (node-eq? context-node)), whereas node-reduce denotes the
+; the composition of converters-predicates in the filtering context.
+
+(let*
+ ((root
+ '(div (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para (@) "second par")
+ (div (@ (name "aa")) (para (@) "third para"))))
+ (context-node ; /descendant::any()[child::text() == "third para"]
+ (car
+ ((node-closure
+ (select-kids
+ (node-equal? "third para")))
+ root)))
+ (pred
+ (node-reduce (node-self (node-typeof? 'div))
+ (node-closure (node-eq? context-node))
+ ))
+ )
+ (run-test
+ (node-or
+ (node-self pred)
+ (node-closure pred))
+ root
+ (cons root
+ '((div (@ (name "aa")) (para (@) "third para")))))
+)
+
+
+
+; Location path, full form: child::div/descendant::para
+; Location path, abbreviated form: div//para
+; selects the para element descendants of the div element
+; children of the context node
+
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")
+ (div (para "fourth para"))))
+ )
+ (expected
+ '((para (@) "third para") (para "fourth para")))
+ )
+ (run-test
+ (node-join
+ (select-kids (node-typeof? 'div))
+ (node-closure (node-typeof? 'para)))
+ tree expected)
+ (run-test (sxpath '(div // para)) tree expected)
+)
+
+
+; Location path, full form: /descendant::olist/child::item
+; Location path, abbreviated form: //olist/item
+; selects all the item elements that have an olist parent (which is not root)
+; and that are in the same document as the context node
+; See the following test.
+
+; Location path, full form: /descendant::td/attribute::align
+; Location path, abbreviated form: //td/@align
+; Selects 'align' attributes of all 'td' elements in tree1
+(let ((tree tree1)
+ (expected
+ '((align "right") (align "center") (align "center") (align "center"))
+ ))
+ (run-test
+ (node-join
+ (node-closure (node-typeof? 'td))
+ (select-kids (node-typeof? '@))
+ (select-kids (node-typeof? 'align)))
+ tree expected)
+ (run-test (sxpath '(// td @ align)) tree expected)
+)
+
+
+; Location path, full form: /descendant::td[attribute::align]
+; Location path, abbreviated form: //td[@align]
+; Selects all td elements that have an attribute 'align' in tree1
+(let ((tree tree1)
+ (expected
+ '((td (@ (align "right")) "Talks ") (td (@ (align "center")) " = ")
+ (td (@ (align "center")) " = ") (td (@ (align "center")) " = "))
+ ))
+ (run-test
+ (node-reduce
+ (node-closure (node-typeof? 'td))
+ (filter
+ (node-join
+ (select-kids (node-typeof? '@))
+ (select-kids (node-typeof? 'align)))))
+ tree expected)
+ (run-test (sxpath `(// td ,(node-self (sxpath '(@ align))))) tree expected)
+ (run-test (sxpath '(// (td (@ align)))) tree expected)
+ (run-test (sxpath '(// ((td) (@ align)))) tree expected)
+ ; note! (sxpath ...) is a converter. Therefore, it can be used
+ ; as any other converter, for example, in the full-form SXPath.
+ ; Thus we can mix the full and abbreviated form SXPath's freely.
+ (run-test
+ (node-reduce
+ (node-closure (node-typeof? 'td))
+ (filter
+ (sxpath '(@ align))))
+ tree expected)
+)
+
+
+; Location path, full form: /descendant::td[attribute::align = "right"]
+; Location path, abbreviated form: //td[@align = "right"]
+; Selects all td elements that have an attribute align = "right" in tree1
+(let ((tree tree1)
+ (expected
+ '((td (@ (align "right")) "Talks "))
+ ))
+ (run-test
+ (node-reduce
+ (node-closure (node-typeof? 'td))
+ (filter
+ (node-join
+ (select-kids (node-typeof? '@))
+ (select-kids (node-equal? '(align "right"))))))
+ tree expected)
+ (run-test (sxpath '(// (td (@ (equal? (align "right")))))) tree expected)
+)
+
+; Location path, full form: child::para[position()=1]
+; Location path, abbreviated form: para[1]
+; selects the first para child of the context node
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (expected
+ '((para (@) "para"))
+ ))
+ (run-test
+ (node-reduce
+ (select-kids (node-typeof? 'para))
+ (node-pos 1))
+ tree expected)
+ (run-test (sxpath '((para 1))) tree expected)
+)
+
+; Location path, full form: child::para[position()=last()]
+; Location path, abbreviated form: para[last()]
+; selects the last para child of the context node
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (expected
+ '((para "second par"))
+ ))
+ (run-test
+ (node-reduce
+ (select-kids (node-typeof? 'para))
+ (node-pos -1))
+ tree expected)
+ (run-test (sxpath '((para -1))) tree expected)
+)
+
+; Illustrating the following Note of Sec 2.5 of XPath:
+; "NOTE: The location path //para[1] does not mean the same as the
+; location path /descendant::para[1]. The latter selects the first
+; descendant para element; the former selects all descendant para
+; elements that are the first para children of their parents."
+
+(let ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ )
+ (run-test
+ (node-reduce ; /descendant::para[1] in SXPath
+ (node-closure (node-typeof? 'para))
+ (node-pos 1))
+ tree '((para (@) "para")))
+ (run-test (sxpath '(// (para 1))) tree
+ '((para (@) "para") (para (@) "third para")))
+)
+
+; Location path, full form: parent::node()
+; Location path, abbreviated form: ..
+; selects the parent of the context node. The context node may be
+; an attribute node!
+; For the last test:
+; Location path, full form: parent::*/attribute::name
+; Location path, abbreviated form: ../@name
+; Selects the name attribute of the parent of the context node
+
+(let* ((tree
+ '(elem (@ (name "elem") (id "idz"))
+ (para (@) "para") (br (@)) "cdata" (para "second par")
+ (div (@ (name "aa")) (para (@) "third para")))
+ )
+ (para1 ; the first para node
+ (car ((sxpath '(para)) tree)))
+ (para3 ; the third para node
+ (car ((sxpath '(div para)) tree)))
+ (div ; div node
+ (car ((sxpath '(// div)) tree)))
+ )
+ (run-test
+ (node-parent tree)
+ para1 (list tree))
+ (run-test
+ (node-parent tree)
+ para3 (list div))
+ (run-test ; checking the parent of an attribute node
+ (node-parent tree)
+ ((sxpath '(@ name)) div) (list div))
+ (run-test
+ (node-join
+ (node-parent tree)
+ (select-kids (node-typeof? '@))
+ (select-kids (node-typeof? 'name)))
+ para3 '((name "aa")))
+ (run-test
+ (sxpath `(,(node-parent tree) @ name))
+ para3 '((name "aa")))
+)
+
+; Location path, full form: following-sibling::chapter[position()=1]
+; Location path, abbreviated form: none
+; selects the next chapter sibling of the context node
+; The path is equivalent to
+; let cnode = context-node
+; in
+; parent::* / child::chapter [take-after node_eq(self::*,cnode)]
+; [position()=1]
+(let* ((tree
+ '(document
+ (preface "preface")
+ (chapter (@ (id "one")) "Chap 1 text")
+ (chapter (@ (id "two")) "Chap 2 text")
+ (chapter (@ (id "three")) "Chap 3 text")
+ (chapter (@ (id "four")) "Chap 4 text")
+ (epilogue "Epilogue text")
+ (appendix (@ (id "A")) "App A text")
+ (References "References"))
+ )
+ (a-node ; to be used as a context node
+ (car ((sxpath '(// (chapter (@ (equal? (id "two")))))) tree)))
+ (expected
+ '((chapter (@ (id "three")) "Chap 3 text")))
+ )
+ (run-test
+ (node-reduce
+ (node-join
+ (node-parent tree)
+ (select-kids (node-typeof? 'chapter)))
+ (take-after (node-eq? a-node))
+ (node-pos 1)
+ )
+ a-node expected)
+)
+
+; preceding-sibling::chapter[position()=1]
+; selects the previous chapter sibling of the context node
+; The path is equivalent to
+; let cnode = context-node
+; in
+; parent::* / child::chapter [take-until node_eq(self::*,cnode)]
+; [position()=-1]
+(let* ((tree
+ '(document
+ (preface "preface")
+ (chapter (@ (id "one")) "Chap 1 text")
+ (chapter (@ (id "two")) "Chap 2 text")
+ (chapter (@ (id "three")) "Chap 3 text")
+ (chapter (@ (id "four")) "Chap 4 text")
+ (epilogue "Epilogue text")
+ (appendix (@ (id "A")) "App A text")
+ (References "References"))
+ )
+ (a-node ; to be used as a context node
+ (car ((sxpath '(// (chapter (@ (equal? (id "three")))))) tree)))
+ (expected
+ '((chapter (@ (id "two")) "Chap 2 text")))
+ )
+ (run-test
+ (node-reduce
+ (node-join
+ (node-parent tree)
+ (select-kids (node-typeof? 'chapter)))
+ (take-until (node-eq? a-node))
+ (node-pos -1)
+ )
+ a-node expected)
+)
+
+
+; /descendant::figure[position()=42]
+; selects the forty-second figure element in the document
+; See the next example, which is more general.
+
+; Location path, full form:
+; child::table/child::tr[position()=2]/child::td[position()=3]
+; Location path, abbreviated form: table/tr[2]/td[3]
+; selects the third td of the second tr of the table
+(let ((tree ((node-closure (node-typeof? 'p)) tree1))
+ (expected
+ '((td " data + control"))
+ ))
+ (run-test
+ (node-join
+ (select-kids (node-typeof? 'table))
+ (node-reduce (select-kids (node-typeof? 'tr))
+ (node-pos 2))
+ (node-reduce (select-kids (node-typeof? 'td))
+ (node-pos 3)))
+ tree expected)
+ (run-test (sxpath '(table (tr 2) (td 3))) tree expected)
+)
+
+
+; Location path, full form:
+; child::para[attribute::type='warning'][position()=5]
+; Location path, abbreviated form: para[@type='warning'][5]
+; selects the fifth para child of the context node that has a type
+; attribute with value warning
+(let ((tree
+ '(chapter
+ (para "para1")
+ (para (@ (type "warning")) "para 2")
+ (para (@ (type "warning")) "para 3")
+ (para (@ (type "warning")) "para 4")
+ (para (@ (type "warning")) "para 5")
+ (para (@ (type "warning")) "para 6"))
+ )
+ (expected
+ '((para (@ (type "warning")) "para 6"))
+ ))
+ (run-test
+ (node-reduce
+ (select-kids (node-typeof? 'para))
+ (filter
+ (node-join
+ (select-kids (node-typeof? '@))
+ (select-kids (node-equal? '(type "warning")))))
+ (node-pos 5))
+ tree expected)
+ (run-test (sxpath '( (((para (@ (equal? (type "warning"))))) 5 ) ))
+ tree expected)
+ (run-test (sxpath '( (para (@ (equal? (type "warning"))) 5 ) ))
+ tree expected)
+)
+
+
+; Location path, full form:
+; child::para[position()=5][attribute::type='warning']
+; Location path, abbreviated form: para[5][@type='warning']
+; selects the fifth para child of the context node if that child has a 'type'
+; attribute with value warning
+(let ((tree
+ '(chapter
+ (para "para1")
+ (para (@ (type "warning")) "para 2")
+ (para (@ (type "warning")) "para 3")
+ (para (@ (type "warning")) "para 4")
+ (para (@ (type "warning")) "para 5")
+ (para (@ (type "warning")) "para 6"))
+ )
+ (expected
+ '((para (@ (type "warning")) "para 5"))
+ ))
+ (run-test
+ (node-reduce
+ (select-kids (node-typeof? 'para))
+ (node-pos 5)
+ (filter
+ (node-join
+ (select-kids (node-typeof? '@))
+ (select-kids (node-equal? '(type "warning"))))))
+ tree expected)
+ (run-test (sxpath '( (( (para 5)) (@ (equal? (type "warning"))))))
+ tree expected)
+ (run-test (sxpath '( (para 5 (@ (equal? (type "warning")))) ))
+ tree expected)
+)
+
+; Location path, full form:
+; child::*[self::chapter or self::appendix]
+; Location path, semi-abbreviated form: *[self::chapter or self::appendix]
+; selects the chapter and appendix children of the context node
+(let ((tree
+ '(document
+ (preface "preface")
+ (chapter (@ (id "one")) "Chap 1 text")
+ (chapter (@ (id "two")) "Chap 2 text")
+ (chapter (@ (id "three")) "Chap 3 text")
+ (epilogue "Epilogue text")
+ (appendix (@ (id "A")) "App A text")
+ (References "References"))
+ )
+ (expected
+ '((chapter (@ (id "one")) "Chap 1 text")
+ (chapter (@ (id "two")) "Chap 2 text")
+ (chapter (@ (id "three")) "Chap 3 text")
+ (appendix (@ (id "A")) "App A text"))
+ ))
+ (run-test
+ (node-join
+ (select-kids (node-typeof? '*))
+ (filter
+ (node-or
+ (node-self (node-typeof? 'chapter))
+ (node-self (node-typeof? 'appendix)))))
+ tree expected)
+ (run-test (sxpath `(* ,(node-or (node-self (node-typeof? 'chapter))
+ (node-self (node-typeof? 'appendix)))))
+ tree expected)
+)
+
+
+; Location path, full form: child::chapter[child::title='Introduction']
+; Location path, abbreviated form: chapter[title = 'Introduction']
+; selects the chapter children of the context node that have one or more
+; title children with string-value equal to Introduction
+; See a similar example: //td[@align = "right"] above.
+
+; Location path, full form: child::chapter[child::title]
+; Location path, abbreviated form: chapter[title]
+; selects the chapter children of the context node that have one or
+; more title children
+; See a similar example //td[@align] above.
+
+(cerr nl "Example with tree3: extracting the first lines of every stanza" nl)
+(let ((tree tree3)
+ (expected
+ '("Let us go then, you and I," "In the room the women come and go")
+ ))
+ (run-test
+ (node-join
+ (node-closure (node-typeof? 'stanza))
+ (node-reduce
+ (select-kids (node-typeof? 'line)) (node-pos 1))
+ (select-kids (node-typeof? '*text*)))
+ tree expected)
+ (run-test (sxpath '(// stanza (line 1) *text*)) tree expected)
+)
+
diff --git a/module/sxml/upstream/assert.scm b/module/sxml/upstream/assert.scm
new file mode 100644
index 000000000..e9e983d5b
--- /dev/null
+++ b/module/sxml/upstream/assert.scm
@@ -0,0 +1,35 @@
+;
+; syntax: assert ?expr ?expr ... [report: ?r-exp ?r-exp ...]
+;
+; If (and ?expr ?expr ...) evaluates to anything but #f, the result
+; is the value of that expression.
+; If (and ?expr ?expr ...) evaluates to #f, an error is reported.
+; The error message will show the failed expressions, as well
+; as the values of selected variables (or expressions, in general).
+; The user may explicitly specify the expressions whose
+; values are to be printed upon assertion failure -- as ?r-exp that
+; follow the identifier 'report:'
+; Typically, ?r-exp is either a variable or a string constant.
+; If the user specified no ?r-exp, the values of variables that are
+; referenced in ?expr will be printed upon the assertion failure.
+
+(define-syntax assert
+ (syntax-rules (report:)
+ ((assert "doit" (expr ...) (r-exp ...))
+ (cond
+ ((and expr ...) => (lambda (x) x))
+ (else
+ (error "assertion failure: ~a" (list '(and expr ...) r-exp ...)))))
+ ((assert "collect" (expr ...))
+ (assert "doit" (expr ...) ()))
+ ((assert "collect" (expr ...) report: r-exp ...)
+ (assert "doit" (expr ...) (r-exp ...)))
+ ((assert "collect" (expr ...) expr1 stuff ...)
+ (assert "collect" (expr ... expr1) stuff ...))
+ ((assert stuff ...)
+ (assert "collect" () stuff ...))))
+
+(define-syntax assure
+ (syntax-rules ()
+ ((assure exp error-msg)
+ (assert exp report: error-msg)))) \ No newline at end of file
diff --git a/module/sxml/upstream/input-parse.scm b/module/sxml/upstream/input-parse.scm
new file mode 100644
index 000000000..e0bccfda8
--- /dev/null
+++ b/module/sxml/upstream/input-parse.scm
@@ -0,0 +1,326 @@
+;****************************************************************************
+; Simple Parsing of input
+;
+; The following simple functions surprisingly often suffice to parse
+; an input stream. They either skip, or build and return tokens,
+; according to inclusion or delimiting semantics. The list of
+; characters to expect, include, or to break at may vary from one
+; invocation of a function to another. This allows the functions to
+; easily parse even context-sensitive languages.
+;
+; EOF is generally frowned on, and thrown up upon if encountered.
+; Exceptions are mentioned specifically. The list of expected characters
+; (characters to skip until, or break-characters) may include an EOF
+; "character", which is to be coded as symbol *eof*
+;
+; The input stream to parse is specified as a PORT, which is usually
+; the last (and optional) argument. It defaults to the current input
+; port if omitted.
+;
+; IMPORT
+; This package relies on a function parser-error, which must be defined
+; by a user of the package. The function has the following signature:
+; parser-error PORT MESSAGE SPECIALISING-MSG*
+; Many procedures of this package call parser-error to report a parsing
+; error. The first argument is a port, which typically points to the
+; offending character or its neighborhood. Most of the Scheme systems
+; let the user query a PORT for the current position. MESSAGE is the
+; description of the error. Other arguments supply more details about
+; the problem.
+; myenv.scm, myenv-bigloo.scm or a similar prelude is assumed.
+; From SRFI-13, string-concatenate-reverse
+; If a particular implementation lacks SRFI-13 support, please
+; include the file srfi-13-local.scm
+;
+; $Id: input-parse.scm,v 1.7 2004/07/07 16:02:31 sperber Exp $
+
+;------------------------------------------------------------------------
+
+; -- procedure+: peek-next-char [PORT]
+; advances to the next character in the PORT and peeks at it.
+; This function is useful when parsing LR(1)-type languages
+; (one-char-read-ahead).
+; The optional argument PORT defaults to the current input port.
+
+(define-opt (peek-next-char (optional (port (current-input-port))))
+ (read-char port)
+ (peek-char port))
+
+
+;------------------------------------------------------------------------
+
+; -- procedure+: assert-curr-char CHAR-LIST STRING [PORT]
+; Reads a character from the PORT and looks it up
+; in the CHAR-LIST of expected characters
+; If the read character was found among expected, it is returned
+; Otherwise, the procedure writes a nasty message using STRING
+; as a comment, and quits.
+; The optional argument PORT defaults to the current input port.
+;
+(define-opt (assert-curr-char expected-chars comment
+ (optional (port (current-input-port))))
+ (let ((c (read-char port)))
+ (if (memv c expected-chars) c
+ (parser-error port "Wrong character " c
+ " (0x" (if (eof-object? c) "*eof*"
+ (number->string (char->integer c) 16)) ") "
+ comment ". " expected-chars " expected"))))
+
+
+; -- procedure+: skip-until CHAR-LIST [PORT]
+; Reads and skips characters from the PORT until one of the break
+; characters is encountered. This break character is returned.
+; The break characters are specified as the CHAR-LIST. This list
+; may include EOF, which is to be coded as a symbol *eof*
+;
+; -- procedure+: skip-until NUMBER [PORT]
+; Skips the specified NUMBER of characters from the PORT and returns #f
+;
+; The optional argument PORT defaults to the current input port.
+
+
+(define-opt (skip-until arg (optional (port (current-input-port))) )
+ (cond
+ ((number? arg) ; skip 'arg' characters
+ (do ((i arg (dec i)))
+ ((not (positive? i)) #f)
+ (if (eof-object? (read-char port))
+ (parser-error port "Unexpected EOF while skipping "
+ arg " characters"))))
+ (else ; skip until break-chars (=arg)
+ (let loop ((c (read-char port)))
+ (cond
+ ((memv c arg) c)
+ ((eof-object? c)
+ (if (memq '*eof* arg) c
+ (parser-error port "Unexpected EOF while skipping until " arg)))
+ (else (loop (read-char port))))))))
+
+
+; -- procedure+: skip-while CHAR-LIST [PORT]
+; Reads characters from the PORT and disregards them,
+; as long as they are mentioned in the CHAR-LIST.
+; The first character (which may be EOF) peeked from the stream
+; that is NOT a member of the CHAR-LIST is returned. This character
+; is left on the stream.
+; The optional argument PORT defaults to the current input port.
+
+(define-opt (skip-while skip-chars (optional (port (current-input-port))) )
+ (do ((c (peek-char port) (peek-char port)))
+ ((not (memv c skip-chars)) c)
+ (read-char port)))
+
+; whitespace const
+
+;------------------------------------------------------------------------
+; Stream tokenizers
+
+
+; -- procedure+:
+; next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT]
+; skips any number of the prefix characters (members of the
+; PREFIX-CHAR-LIST), if any, and reads the sequence of characters
+; up to (but not including) a break character, one of the
+; BREAK-CHAR-LIST.
+; The string of characters thus read is returned.
+; The break character is left on the input stream
+; The list of break characters may include EOF, which is to be coded as
+; a symbol *eof*. Otherwise, EOF is fatal, generating an error message
+; including a specified COMMENT-STRING (if any)
+;
+; The optional argument PORT defaults to the current input port.
+;
+; Note: since we can't tell offhand how large the token being read is
+; going to be, we make a guess, pre-allocate a string, and grow it by
+; quanta if necessary. The quantum is always the length of the string
+; before it was extended the last time. Thus the algorithm does
+; a Fibonacci-type extension, which has been proven optimal.
+; Note, explicit port specification in read-char, peek-char helps.
+
+; Procedure: input-parse:init-buffer
+; returns an initial buffer for next-token* procedures.
+; The input-parse:init-buffer may allocate a new buffer per each invocation:
+; (define (input-parse:init-buffer) (make-string 32))
+; Size 32 turns out to be fairly good, on average.
+; That policy is good only when a Scheme system is multi-threaded with
+; preemptive scheduling, or when a Scheme system supports shared substrings.
+; In all the other cases, it's better for input-parse:init-buffer to
+; return the same static buffer. next-token* functions return a copy
+; (a substring) of accumulated data, so the same buffer can be reused.
+; We shouldn't worry about an incoming token being too large:
+; next-token will use another chunk automatically. Still,
+; the best size for the static buffer is to allow most of the tokens to fit in.
+; Using a static buffer _dramatically_ reduces the amount of produced garbage
+; (e.g., during XML parsing).
+
+(define input-parse:init-buffer
+ (let ((buffer (make-string 512)))
+ (lambda () buffer)))
+
+
+ ; See a better version below
+(define-opt (next-token-old prefix-skipped-chars break-chars
+ (optional (comment "") (port (current-input-port))) )
+ (let* ((buffer (input-parse:init-buffer))
+ (curr-buf-len (string-length buffer))
+ (quantum curr-buf-len))
+ (let loop ((i 0) (c (skip-while prefix-skipped-chars port)))
+ (cond
+ ((memv c break-chars) (substring buffer 0 i))
+ ((eof-object? c)
+ (if (memq '*eof* break-chars)
+ (substring buffer 0 i) ; was EOF expected?
+ (parser-error port "EOF while reading a token " comment)))
+ (else
+ (if (>= i curr-buf-len) ; make space for i-th char in buffer
+ (begin ; -> grow the buffer by the quantum
+ (set! buffer (string-append buffer (make-string quantum)))
+ (set! quantum curr-buf-len)
+ (set! curr-buf-len (string-length buffer))))
+ (string-set! buffer i c)
+ (read-char port) ; move to the next char
+ (loop (inc i) (peek-char port))
+ )))))
+
+
+; A better version of next-token, which accumulates the characters
+; in chunks, and later on reverse-concatenates them, using
+; SRFI-13 if available.
+; The overhead of copying characters is only 100% (or even smaller: bulk
+; string copying might be well-optimised), compared to the (hypothetical)
+; circumstance if we had known the size of the token beforehand.
+; For small tokens, the code performs just as above. For large
+; tokens, we expect an improvement. Note, the code also has no
+; assignments.
+; See next-token-comp.scm
+
+(define-opt (next-token prefix-skipped-chars break-chars
+ (optional (comment "") (port (current-input-port))) )
+ (let outer ((buffer (input-parse:init-buffer)) (filled-buffer-l '())
+ (c (skip-while prefix-skipped-chars port)))
+ (let ((curr-buf-len (string-length buffer)))
+ (let loop ((i 0) (c c))
+ (cond
+ ((memv c break-chars)
+ (if (null? filled-buffer-l) (substring buffer 0 i)
+ (string-concatenate-reverse filled-buffer-l buffer i)))
+ ((eof-object? c)
+ (if (memq '*eof* break-chars) ; was EOF expected?
+ (if (null? filled-buffer-l) (substring buffer 0 i)
+ (string-concatenate-reverse filled-buffer-l buffer i))
+ (parser-error port "EOF while reading a token " comment)))
+ ((>= i curr-buf-len)
+ (outer (make-string curr-buf-len)
+ (cons buffer filled-buffer-l) c))
+ (else
+ (string-set! buffer i c)
+ (read-char port) ; move to the next char
+ (loop (inc i) (peek-char port))))))))
+
+; -- procedure+: next-token-of INC-CHARSET [PORT]
+; Reads characters from the PORT that belong to the list of characters
+; INC-CHARSET. The reading stops at the first character which is not
+; a member of the set. This character is left on the stream.
+; All the read characters are returned in a string.
+;
+; -- procedure+: next-token-of PRED [PORT]
+; Reads characters from the PORT for which PRED (a procedure of one
+; argument) returns non-#f. The reading stops at the first character
+; for which PRED returns #f. That character is left on the stream.
+; All the results of evaluating of PRED up to #f are returned in a
+; string.
+;
+; PRED is a procedure that takes one argument (a character
+; or the EOF object) and returns a character or #f. The returned
+; character does not have to be the same as the input argument
+; to the PRED. For example,
+; (next-token-of (lambda (c)
+; (cond ((eof-object? c) #f)
+; ((char-alphabetic? c) (char-downcase c))
+; (else #f))))
+; will try to read an alphabetic token from the current
+; input port, and return it in lower case.
+;
+; The optional argument PORT defaults to the current input port.
+;
+; This procedure is similar to next-token but only it implements
+; an inclusion rather than delimiting semantics.
+
+(define-opt (next-token-of incl-list/pred
+ (optional (port (current-input-port))) )
+ (let* ((buffer (input-parse:init-buffer))
+ (curr-buf-len (string-length buffer)))
+ (if (procedure? incl-list/pred)
+ (let outer ((buffer buffer) (filled-buffer-l '()))
+ (let loop ((i 0))
+ (if (>= i curr-buf-len) ; make sure we have space
+ (outer (make-string curr-buf-len) (cons buffer filled-buffer-l))
+ (let ((c (incl-list/pred (peek-char port))))
+ (if c
+ (begin
+ (string-set! buffer i c)
+ (read-char port) ; move to the next char
+ (loop (inc i)))
+ ; incl-list/pred decided it had had enough
+ (if (null? filled-buffer-l) (substring buffer 0 i)
+ (string-concatenate-reverse filled-buffer-l buffer i)))))))
+
+ ; incl-list/pred is a list of allowed characters
+ (let outer ((buffer buffer) (filled-buffer-l '()))
+ (let loop ((i 0))
+ (if (>= i curr-buf-len) ; make sure we have space
+ (outer (make-string curr-buf-len) (cons buffer filled-buffer-l))
+ (let ((c (peek-char port)))
+ (cond
+ ((not (memv c incl-list/pred))
+ (if (null? filled-buffer-l) (substring buffer 0 i)
+ (string-concatenate-reverse filled-buffer-l buffer i)))
+ (else
+ (string-set! buffer i c)
+ (read-char port) ; move to the next char
+ (loop (inc i))))))))
+ )))
+
+
+; -- procedure+: read-text-line [PORT]
+; Reads one line of text from the PORT, and returns it as a string.
+; A line is a (possibly empty) sequence of characters terminated
+; by CR, CRLF or LF (or even the end of file).
+; The terminating character (or CRLF combination) is removed from
+; the input stream. The terminating character(s) is not a part
+; of the return string either.
+; If EOF is encountered before any character is read, the return
+; value is EOF.
+;
+; The optional argument PORT defaults to the current input port.
+
+(define *read-line-breaks* (list char-newline char-return '*eof*))
+
+(define-opt (read-text-line (optional (port (current-input-port))) )
+ (if (eof-object? (peek-char port)) (peek-char port)
+ (let* ((line
+ (next-token '() *read-line-breaks*
+ "reading a line" port))
+ (c (read-char port))) ; must be either \n or \r or EOF
+ (and (eqv? c char-return) (eqv? (peek-char port) #\newline)
+ (read-char port)) ; skip \n that follows \r
+ line)))
+
+
+; -- procedure+: read-string N [PORT]
+; Reads N characters from the PORT, and returns them in a string.
+; If EOF is encountered before N characters are read, a shorter string
+; will be returned.
+; If N is not positive, an empty string will be returned.
+; The optional argument PORT defaults to the current input port.
+
+(define-opt (read-string n (optional (port (current-input-port))) )
+ (if (not (positive? n)) ""
+ (let ((buffer (make-string n)))
+ (let loop ((i 0) (c (read-char port)))
+ (if (eof-object? c) (substring buffer 0 i)
+ (let ((i1 (inc i)))
+ (string-set! buffer i c)
+ (if (= i1 n) buffer
+ (loop i1 (read-char port)))))))))
+
diff --git a/module/sxml/xpath.scm b/module/sxml/xpath.scm
new file mode 100644
index 000000000..bdf4ae98a
--- /dev/null
+++ b/module/sxml/xpath.scm
@@ -0,0 +1,493 @@
+;;;; (sxml xpath) -- SXPath
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
+;;;; Written 2001 by Oleg Kiselyov <oleg at pobox dot com> SXPath.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;@heading SXPath: SXML Query Language
+;;
+;; SXPath is a query language for SXML, an instance of XML Information
+;; set (Infoset) in the form of s-expressions. See @code{(sxml ssax)}
+;; for the definition of SXML and more details. SXPath is also a
+;; translation into Scheme of an XML Path Language,
+;; @uref{http://www.w3.org/TR/xpath,XPath}. XPath and SXPath describe
+;; means of selecting a set of Infoset's items or their properties.
+;;
+;; To facilitate queries, XPath maps the XML Infoset into an explicit
+;; tree, and introduces important notions of a location path and a
+;; current, context node. A location path denotes a selection of a set of
+;; nodes relative to a context node. Any XPath tree has a distinguished,
+;; root node -- which serves as the context node for absolute location
+;; paths. Location path is recursively defined as a location step joined
+;; with a location path. A location step is a simple query of the
+;; database relative to a context node. A step may include expressions
+;; that further filter the selected set. Each node in the resulting set
+;; is used as a context node for the adjoining location path. The result
+;; of the step is a union of the sets returned by the latter location
+;; paths.
+;;
+;; The SXML representation of the XML Infoset (see SSAX.scm) is rather
+;; suitable for querying as it is. Bowing to the XPath specification,
+;; we will refer to SXML information items as 'Nodes':
+;;@example
+;; <Node> ::= <Element> | <attributes-coll> | <attrib>
+;; | "text string" | <PI>
+;;@end example
+;; This production can also be described as
+;;@example
+;; <Node> ::= (name . <Nodeset>) | "text string"
+;;@end example
+;; An (ordered) set of nodes is just a list of the constituent nodes:
+;;@example
+;; <Nodeset> ::= (<Node> ...)
+;;@end example
+;; Nodesets, and Nodes other than text strings are both lists. A
+;; <Nodeset> however is either an empty list, or a list whose head is not
+;; a symbol. A symbol at the head of a node is either an XML name (in
+;; which case it's a tag of an XML element), or an administrative name
+;; such as '@@'. This uniform list representation makes processing rather
+;; simple and elegant, while avoiding confusion. The multi-branch tree
+;; structure formed by the mutually-recursive datatypes <Node> and
+;; <Nodeset> lends itself well to processing by functional languages.
+;;
+;; A location path is in fact a composite query over an XPath tree or
+;; its branch. A singe step is a combination of a projection, selection
+;; or a transitive closure. Multiple steps are combined via join and
+;; union operations. This insight allows us to @emph{elegantly}
+;; implement XPath as a sequence of projection and filtering primitives
+;; -- converters -- joined by @dfn{combinators}. Each converter takes a
+;; node and returns a nodeset which is the result of the corresponding
+;; query relative to that node. A converter can also be called on a set
+;; of nodes. In that case it returns a union of the corresponding
+;; queries over each node in the set. The union is easily implemented as
+;; a list append operation as all nodes in a SXML tree are considered
+;; distinct, by XPath conventions. We also preserve the order of the
+;; members in the union. Query combinators are high-order functions:
+;; they take converter(s) (which is a Node|Nodeset -> Nodeset function)
+;; and compose or otherwise combine them. We will be concerned with only
+;; relative location paths [XPath]: an absolute location path is a
+;; relative path applied to the root node.
+;;
+;; Similarly to XPath, SXPath defines full and abbreviated notations
+;; for location paths. In both cases, the abbreviated notation can be
+;; mechanically expanded into the full form by simple rewriting
+;; rules. In case of SXPath the corresponding rules are given as
+;; comments to a sxpath function, below. The regression test suite at
+;; the end of this file shows a representative sample of SXPaths in
+;; both notations, juxtaposed with the corresponding XPath
+;; expressions. Most of the samples are borrowed literally from the
+;; XPath specification, while the others are adjusted for our running
+;; example, tree1.
+;;
+;;; Code:
+
+(define-module (sxml xpath)
+ #:use-module (ice-9 pretty-print)
+ #:export (nodeset? node-typeof? node-eq? node-equal? node-pos
+ filter take-until take-after map-union node-reverse
+ node-trace select-kids node-self node-join node-reduce
+ node-or node-closure node-parent
+ sxpath))
+
+;; Upstream version:
+; $Id: SXPath.scm,v 3.5 2001/01/12 23:20:35 oleg Exp oleg $
+
+(define (nodeset? x)
+ (or (and (pair? x) (not (symbol? (car x)))) (null? x)))
+
+;-------------------------
+; Basic converters and applicators
+; A converter is a function
+; type Converter = Node|Nodeset -> Nodeset
+; A converter can also play a role of a predicate: in that case, if a
+; converter, applied to a node or a nodeset, yields a non-empty
+; nodeset, the converter-predicate is deemed satisfied. Throughout
+; this file a nil nodeset is equivalent to #f in denoting a failure.
+
+; The following function implements a 'Node test' as defined in
+; Sec. 2.3 of XPath document. A node test is one of the components of a
+; location step. It is also a converter-predicate in SXPath.
+;
+; The function node-typeof? takes a type criterion and returns a function,
+; which, when applied to a node, will tell if the node satisfies
+; the test.
+; node-typeof? :: Crit -> Node -> Boolean
+;
+; The criterion 'crit' is a symbol, one of the following:
+; id - tests if the Node has the right name (id)
+; @ - tests if the Node is an <attributes-coll>
+; * - tests if the Node is an <Element>
+; *text* - tests if the Node is a text node
+; *PI* - tests if the Node is a PI node
+; *any* - #t for any type of Node
+
+(define (node-typeof? crit)
+ (lambda (node)
+ (case crit
+ ((*) (and (pair? node) (not (memq (car node) '(@ *PI*)))))
+ ((*any*) #t)
+ ((*text*) (string? node))
+ (else
+ (and (pair? node) (eq? crit (car node))))
+)))
+
+
+; Curried equivalence converter-predicates
+(define (node-eq? other)
+ (lambda (node)
+ (eq? other node)))
+
+(define (node-equal? other)
+ (lambda (node)
+ (equal? other node)))
+
+; node-pos:: N -> Nodeset -> Nodeset, or
+; node-pos:: N -> Converter
+; Select the N'th element of a Nodeset and return as a singular Nodeset;
+; Return an empty nodeset if the Nth element does not exist.
+; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset,
+; if exists; ((node-pos 2) Nodeset) selects the Node after that, if
+; exists.
+; N can also be a negative number: in that case the node is picked from
+; the tail of the list.
+; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset;
+; ((node-pos -2) Nodeset) selects the last but one node, if exists.
+
+(define (node-pos n)
+ (lambda (nodeset)
+ (cond
+ ((not (nodeset? nodeset)) '())
+ ((null? nodeset) nodeset)
+ ((eqv? n 1) (list (car nodeset)))
+ ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset))
+ (else
+ (or (positive? n) (error "yikes!"))
+ ((node-pos (1- n)) (cdr nodeset))))))
+
+; filter:: Converter -> Converter
+; A filter applicator, which introduces a filtering context. The argument
+; converter is considered a predicate, with either #f or nil result meaning
+; failure.
+(define (filter pred?)
+ (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
+ (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '()))
+ (if (null? lst)
+ (reverse res)
+ (let ((pred-result (pred? (car lst))))
+ (loop (cdr lst)
+ (if (and pred-result (not (null? pred-result)))
+ (cons (car lst) res)
+ res)))))))
+
+; take-until:: Converter -> Converter, or
+; take-until:: Pred -> Node|Nodeset -> Nodeset
+; Given a converter-predicate and a nodeset, apply the predicate to
+; each element of the nodeset, until the predicate yields anything but #f or
+; nil. Return the elements of the input nodeset that have been processed
+; till that moment (that is, which fail the predicate).
+; take-until is a variation of the filter above: take-until passes
+; elements of an ordered input set till (but not including) the first
+; element that satisfies the predicate.
+; The nodeset returned by ((take-until (not pred)) nset) is a subset --
+; to be more precise, a prefix -- of the nodeset returned by
+; ((filter pred) nset)
+
+(define (take-until pred?)
+ (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
+ (let loop ((lst (if (nodeset? lst) lst (list lst))))
+ (if (null? lst) lst
+ (let ((pred-result (pred? (car lst))))
+ (if (and pred-result (not (null? pred-result)))
+ '()
+ (cons (car lst) (loop (cdr lst)))))
+ ))))
+
+
+; take-after:: Converter -> Converter, or
+; take-after:: Pred -> Node|Nodeset -> Nodeset
+; Given a converter-predicate and a nodeset, apply the predicate to
+; each element of the nodeset, until the predicate yields anything but #f or
+; nil. Return the elements of the input nodeset that have not been processed:
+; that is, return the elements of the input nodeset that follow the first
+; element that satisfied the predicate.
+; take-after along with take-until partition an input nodeset into three
+; parts: the first element that satisfies a predicate, all preceding
+; elements and all following elements.
+
+(define (take-after pred?)
+ (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
+ (let loop ((lst (if (nodeset? lst) lst (list lst))))
+ (if (null? lst) lst
+ (let ((pred-result (pred? (car lst))))
+ (if (and pred-result (not (null? pred-result)))
+ (cdr lst)
+ (loop (cdr lst))))
+ ))))
+
+; Apply proc to each element of lst and return the list of results.
+; if proc returns a nodeset, splice it into the result
+;
+; From another point of view, map-union is a function Converter->Converter,
+; which places an argument-converter in a joining context.
+
+(define (map-union proc lst)
+ (if (null? lst) lst
+ (let ((proc-res (proc (car lst))))
+ ((if (nodeset? proc-res) append cons)
+ proc-res (map-union proc (cdr lst))))))
+
+; node-reverse :: Converter, or
+; node-reverse:: Node|Nodeset -> Nodeset
+; Reverses the order of nodes in the nodeset
+; This basic converter is needed to implement a reverse document order
+; (see the XPath Recommendation).
+(define node-reverse
+ (lambda (node-or-nodeset)
+ (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset)
+ (reverse node-or-nodeset))))
+
+; node-trace:: String -> Converter
+; (node-trace title) is an identity converter. In addition it prints out
+; a node or nodeset it is applied to, prefixed with the 'title'.
+; This converter is very useful for debugging.
+
+(define (node-trace title)
+ (lambda (node-or-nodeset)
+ (display "\n-->")
+ (display title)
+ (display " :")
+ (pretty-print node-or-nodeset)
+ node-or-nodeset))
+
+
+;-------------------------
+; Converter combinators
+;
+; Combinators are higher-order functions that transmogrify a converter
+; or glue a sequence of converters into a single, non-trivial
+; converter. The goal is to arrive at converters that correspond to
+; XPath location paths.
+;
+; From a different point of view, a combinator is a fixed, named
+; _pattern_ of applying converters. Given below is a complete set of
+; such patterns that together implement XPath location path
+; specification. As it turns out, all these combinators can be built
+; from a small number of basic blocks: regular functional composition,
+; map-union and filter applicators, and the nodeset union.
+
+
+
+; select-kids:: Pred -> Node -> Nodeset
+; Given a Node, return an (ordered) subset its children that satisfy
+; the Pred (a converter, actually)
+; select-kids:: Pred -> Nodeset -> Nodeset
+; The same as above, but select among children of all the nodes in
+; the Nodeset
+;
+; More succinctly, the signature of this function is
+; select-kids:: Converter -> Converter
+
+(define (select-kids test-pred?)
+ (lambda (node) ; node or node-set
+ (cond
+ ((null? node) node)
+ ((not (pair? node)) '()) ; No children
+ ((symbol? (car node))
+ ((filter test-pred?) (cdr node))) ; it's a single node
+ (else (map-union (select-kids test-pred?) node)))))
+
+
+; node-self:: Pred -> Node -> Nodeset, or
+; node-self:: Converter -> Converter
+; Similar to select-kids but apply to the Node itself rather
+; than to its children. The resulting Nodeset will contain either one
+; component, or will be empty (if the Node failed the Pred).
+(define node-self filter)
+
+
+; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or
+; node-join:: [Converter] -> Converter
+; join the sequence of location steps or paths as described
+; in the title comments above.
+(define (node-join . selectors)
+ (lambda (nodeset) ; Nodeset or node
+ (let loop ((nodeset nodeset) (selectors selectors))
+ (if (null? selectors) nodeset
+ (loop
+ (if (nodeset? nodeset)
+ (map-union (car selectors) nodeset)
+ ((car selectors) nodeset))
+ (cdr selectors))))))
+
+
+; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or
+; node-reduce:: [Converter] -> Converter
+; A regular functional composition of converters.
+; From a different point of view,
+; ((apply node-reduce converters) nodeset)
+; is equivalent to
+; (foldl apply nodeset converters)
+; i.e., folding, or reducing, a list of converters with the nodeset
+; as a seed.
+(define (node-reduce . converters)
+ (lambda (nodeset) ; Nodeset or node
+ (let loop ((nodeset nodeset) (converters converters))
+ (if (null? converters) nodeset
+ (loop ((car converters) nodeset) (cdr converters))))))
+
+
+; node-or:: [Converter] -> Converter
+; This combinator applies all converters to a given node and
+; produces the union of their results.
+; This combinator corresponds to a union, '|' operation for XPath
+; location paths.
+; (define (node-or . converters)
+; (lambda (node-or-nodeset)
+; (if (null? converters) node-or-nodeset
+; (append
+; ((car converters) node-or-nodeset)
+; ((apply node-or (cdr converters)) node-or-nodeset)))))
+; More optimal implementation follows
+(define (node-or . converters)
+ (lambda (node-or-nodeset)
+ (let loop ((result '()) (converters converters))
+ (if (null? converters) result
+ (loop (append result (or ((car converters) node-or-nodeset) '()))
+ (cdr converters))))))
+
+
+; node-closure:: Converter -> Converter
+; Select all _descendants_ of a node that satisfy a converter-predicate.
+; This combinator is similar to select-kids but applies to
+; grand... children as well.
+; This combinator implements the "descendant::" XPath axis
+; Conceptually, this combinator can be expressed as
+; (define (node-closure f)
+; (node-or
+; (select-kids f)
+; (node-reduce (select-kids (node-typeof? '*)) (node-closure f))))
+; This definition, as written, looks somewhat like a fixpoint, and it
+; will run forever. It is obvious however that sooner or later
+; (select-kids (node-typeof? '*)) will return an empty nodeset. At
+; this point further iterations will no longer affect the result and
+; can be stopped.
+
+(define (node-closure test-pred?)
+ (lambda (node) ; Nodeset or node
+ (let loop ((parent node) (result '()))
+ (if (null? parent) result
+ (loop ((select-kids (node-typeof? '*)) parent)
+ (append result
+ ((select-kids test-pred?) parent)))
+ ))))
+
+; node-parent:: RootNode -> Converter
+; (node-parent rootnode) yields a converter that returns a parent of a
+; node it is applied to. If applied to a nodeset, it returns the list
+; of parents of nodes in the nodeset. The rootnode does not have
+; to be the root node of the whole SXML tree -- it may be a root node
+; of a branch of interest.
+; Given the notation of Philip Wadler's paper on semantics of XSLT,
+; parent(x) = { y | y=subnode*(root), x=subnode(y) }
+; Therefore, node-parent is not the fundamental converter: it can be
+; expressed through the existing ones. Yet node-parent is a rather
+; convenient converter. It corresponds to a parent:: axis of SXPath.
+; Note that the parent:: axis can be used with an attribute node as well!
+
+(define (node-parent rootnode)
+ (lambda (node) ; Nodeset or node
+ (if (nodeset? node) (map-union (node-parent rootnode) node)
+ (let ((pred
+ (node-or
+ (node-reduce
+ (node-self (node-typeof? '*))
+ (select-kids (node-eq? node)))
+ (node-join
+ (select-kids (node-typeof? '@))
+ (select-kids (node-eq? node))))))
+ ((node-or
+ (node-self pred)
+ (node-closure pred))
+ rootnode)))))
+
+;-------------------------
+; Evaluate an abbreviated SXPath
+; sxpath:: AbbrPath -> Converter, or
+; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset
+; AbbrPath is a list. It is translated to the full SXPath according
+; to the following rewriting rules
+; (sxpath '()) -> (node-join)
+; (sxpath '(path-component ...)) ->
+; (node-join (sxpath1 path-component) (sxpath '(...)))
+; (sxpath1 '//) -> (node-or
+; (node-self (node-typeof? '*any*))
+; (node-closure (node-typeof? '*any*)))
+; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x))
+; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x))
+; (sxpath1 ?symbol) -> (select-kids (node-typeof? ?symbol)
+; (sxpath1 procedure) -> procedure
+; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...))
+; (sxpath1 '(path reducer ...)) ->
+; (node-reduce (sxpath path) (sxpathr reducer) ...)
+; (sxpathr number) -> (node-pos number)
+; (sxpathr path-filter) -> (filter (sxpath path-filter))
+
+(define (sxpath path)
+ (lambda (nodeset)
+ (let loop ((nodeset nodeset) (path path))
+ (cond
+ ((null? path) nodeset)
+ ((nodeset? nodeset)
+ (map-union (sxpath path) nodeset))
+ ((procedure? (car path))
+ (loop ((car path) nodeset) (cdr path)))
+ ((eq? '// (car path))
+ (loop
+ ((if (nodeset? nodeset) append cons) nodeset
+ ((node-closure (node-typeof? '*any*)) nodeset))
+ (cdr path)))
+ ((symbol? (car path))
+ (loop ((select-kids (node-typeof? (car path))) nodeset)
+ (cdr path)))
+ ((and (pair? (car path)) (eq? 'equal? (caar path)))
+ (loop ((select-kids (apply node-equal? (cdar path))) nodeset)
+ (cdr path)))
+ ((and (pair? (car path)) (eq? 'eq? (caar path)))
+ (loop ((select-kids (apply node-eq? (cdar path))) nodeset)
+ (cdr path)))
+ ((pair? (car path))
+ (let reducer ((nodeset
+ (if (symbol? (caar path))
+ ((select-kids (node-typeof? (caar path))) nodeset)
+ (loop nodeset (caar path))))
+ (reducing-path (cdar path)))
+ (cond
+ ((null? reducing-path) (loop nodeset (cdr path)))
+ ((number? (car reducing-path))
+ (reducer ((node-pos (car reducing-path)) nodeset)
+ (cdr reducing-path)))
+ (else
+ (reducer ((filter (sxpath (car reducing-path))) nodeset)
+ (cdr reducing-path))))))
+ (else
+ (error "Invalid path step: " (car path)))))))
+
+;;; arch-tag: c4e57abf-6b61-4612-a6aa-d1536d440774
+;;; xpath.scm ends here
diff --git a/module/texinfo.scm b/module/texinfo.scm
new file mode 100644
index 000000000..76d79dfab
--- /dev/null
+++ b/module/texinfo.scm
@@ -0,0 +1,1217 @@
+;;;; (texinfo) -- parsing of texinfo into SXML
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com>
+;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com>
+;;;;
+;;;; This file is based on SSAX's SSAX.scm.
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+;;; Commentary:
+;;
+;; @subheading Texinfo processing in scheme
+;;
+;; This module parses texinfo into SXML. TeX will always be the
+;; processor of choice for print output, of course. However, although
+;; @code{makeinfo} works well for info, its output in other formats is
+;; not very customizable, and the program is not extensible as a whole.
+;; This module aims to provide an extensible framework for texinfo
+;; processing that integrates texinfo into the constellation of SXML
+;; processing tools.
+;;
+;; @subheading Notes on the SXML vocabulary
+;;
+;; Consider the following texinfo fragment:
+;;
+;;@example
+;; @@deffn Primitive set-car! pair value
+;; This function...
+;; @@end deffn
+;;@end example
+;;
+;; Logically, the category (Primitive), name (set-car!), and arguments
+;; (pair value) are ``attributes'' of the deffn, with the description as
+;; the content. However, texinfo allows for @@-commands within the
+;; arguments to an environment, like @code{@@deffn}, which means that
+;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand,
+;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are
+;; called ``arguments'', and are grouped under the special element, `%'.
+;;
+;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In
+;; the interests of interoperability, this module provides a conversion
+;; function to replace the `%' with `texinfo-arguments'.
+;;
+;;; Code:
+
+;; Comparison to xml output of texinfo (which is rather undocumented):
+;; Doesn't conform to texinfo dtd
+;; No DTD at all, in fact :-/
+;; Actually outputs valid xml, after transforming %
+;; Slower (although with caching the SXML that problem can go away)
+;; Doesn't parse menus (although menus are shite)
+;; Args go in a dedicated element, FBOFW
+;; Definitions are handled a lot better
+;; Does parse comments
+;; Outputs only significant line breaks (a biggie!)
+;; Nodes are treated as anchors, rather than content organizers (a biggie)
+;; (more book-like, less info-like)
+
+;; TODO
+;; Integration: help, indexing, plain text
+
+(define-module (texinfo)
+ #:use-module (sxml simple)
+ #:use-module (sxml transform)
+ #:use-module (sxml ssax input-parse)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-13)
+ #:export (call-with-file-and-dir
+ texi-command-specs
+ texi-command-depth
+ texi-fragment->stexi
+ texi->stexi
+ stexi->sxml))
+
+;; Some utilities
+
+(define (parser-error port message . rest)
+ (apply error port message rest))
+
+(define (call-with-file-and-dir filename proc)
+ "Call the one-argument procedure @var{proc} with an input port that
+reads from @var{filename}. During the dynamic extent of @var{proc}'s
+execution, the current directory will be @code{(dirname
+@var{filename})}. This is useful for parsing documents that can include
+files by relative path name."
+ (let ((current-dir (getcwd)))
+ (dynamic-wind
+ (lambda () (chdir (dirname filename)))
+ (lambda ()
+ (call-with-input-file (basename filename) proc))
+ (lambda () (chdir current-dir)))))
+
+;; Define this version here, because (srfi srfi-11)'s definition uses
+;; syntax-rules, which is really damn slow
+(define-macro (let*-values bindings . body)
+ (if (null? bindings) (cons 'begin body)
+ (apply
+ (lambda (vars initializer)
+ (let ((cont
+ (cons 'let*-values
+ (cons (cdr bindings) body))))
+ (cond
+ ((not (pair? vars)) ; regular let case, a single var
+ `(let ((,vars ,initializer)) ,cont))
+ ((null? (cdr vars)) ; single var, see the prev case
+ `(let ((,(car vars) ,initializer)) ,cont))
+ (else ; the most generic case
+ `(call-with-values (lambda () ,initializer)
+ (lambda ,vars ,cont))))))
+ (car bindings))))
+
+;;========================================================================
+;; Reflection on the XML vocabulary
+
+(define texi-command-specs
+ #;
+"A list of (@var{name} @var{content-model} . @var{args})
+
+@table @var
+@item name
+The name of an @@-command, as a symbol.
+
+@item content-model
+A symbol indicating the syntactic type of the @@-command:
+@table @code
+@item EMPTY-COMMAND
+No content, and no @code{@@end} is coming
+@item EOL-ARGS
+Unparsed arguments until end of line
+@item EOL-TEXT
+Parsed arguments until end of line
+@item INLINE-ARGS
+Unparsed arguments ending with @code{#\\@}}
+@item INLINE-TEXT
+Parsed arguments ending with @code{#\\@}}
+@item ENVIRON
+The tag is an environment tag, expect @code{@@end foo}.
+@item TABLE-ENVIRON
+Like ENVIRON, but with special parsing rules for its arguments.
+@item FRAGMENT
+For @code{*fragment*}, the command used for parsing fragments of
+texinfo documents.
+@end table
+
+@code{INLINE-TEXT} commands will receive their arguments within their
+bodies, whereas the @code{-ARGS} commands will receive them in their
+attribute list.
+
+@code{EOF-TEXT} receives its arguments in its body.
+
+@code{ENVIRON} commands have both: parsed arguments until the end of
+line, received through their attribute list, and parsed text until the
+@code{@@end}, received in their bodies.
+
+@code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in
+@code{ENVIRON}.
+
+There are four @@-commands that are treated specially. @code{@@include}
+is a low-level token that will not be seen by higher-level parsers, so
+it has no content-model. @code{@@para} is the paragraph command, which
+is only implicit in the texinfo source. @code{@@item} has special
+syntax, as noted above, and @code{@@entry} is how this parser treats
+@code{@@item} commands within @code{@@table}, @code{@@ftable}, and
+@code{@@vtable}.
+
+Also, indexing commands (@code{@@cindex}, etc.) are treated specially.
+Their arguments are parsed, but they are needed before entering the
+element so that an anchor can be inserted into the text before the index
+entry.
+
+@item args
+Named arguments to the command, in the same format as the formals for a
+lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS},
+@code{ENVIRON}, @code{TABLE-ENVIRON} commands.
+@end table"
+ '(;; Special commands
+ (include #f) ;; this is a low-level token
+ (para PARAGRAPH)
+ (item ITEM)
+ (entry ENTRY . heading)
+ (noindent EMPTY-COMMAND)
+ (*fragment* FRAGMENT)
+
+ ;; Inline text commands
+ (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant
+ (bold INLINE-TEXT)
+ (sample INLINE-TEXT)
+ (samp INLINE-TEXT)
+ (code INLINE-TEXT)
+ (kbd INLINE-TEXT)
+ (key INLINE-TEXT)
+ (var INLINE-TEXT)
+ (env INLINE-TEXT)
+ (file INLINE-TEXT)
+ (command INLINE-TEXT)
+ (option INLINE-TEXT)
+ (dfn INLINE-TEXT)
+ (cite INLINE-TEXT)
+ (acro INLINE-TEXT)
+ (url INLINE-TEXT)
+ (email INLINE-TEXT)
+ (emph INLINE-TEXT)
+ (strong INLINE-TEXT)
+ (sample INLINE-TEXT)
+ (sc INLINE-TEXT)
+ (titlefont INLINE-TEXT)
+ (asis INLINE-TEXT)
+ (b INLINE-TEXT)
+ (i INLINE-TEXT)
+ (r INLINE-TEXT)
+ (sansserif INLINE-TEXT)
+ (slanted INLINE-TEXT)
+ (t INLINE-TEXT)
+
+ ;; Inline args commands
+ (value INLINE-ARGS . (key))
+ (ref INLINE-ARGS . (node #:opt name section info-file manual))
+ (xref INLINE-ARGS . (node #:opt name section info-file manual))
+ (pxref INLINE-ARGS . (node #:opt name section info-file manual))
+ (uref INLINE-ARGS . (url #:opt title replacement))
+ (anchor INLINE-ARGS . (name))
+ (dots INLINE-ARGS . ())
+ (result INLINE-ARGS . ())
+ (bullet INLINE-ARGS . ())
+ (copyright INLINE-ARGS . ())
+ (tie INLINE-ARGS . ())
+ (image INLINE-ARGS . (file #:opt width height alt-text extension))
+
+ ;; EOL args elements
+ (node EOL-ARGS . (name #:opt next previous up))
+ (c EOL-ARGS . all)
+ (comment EOL-ARGS . all)
+ (setchapternewpage EOL-ARGS . all)
+ (sp EOL-ARGS . all)
+ (page EOL-ARGS . ())
+ (vskip EOL-ARGS . all)
+ (syncodeindex EOL-ARGS . all)
+ (contents EOL-ARGS . ())
+ (shortcontents EOL-ARGS . ())
+ (summarycontents EOL-ARGS . ())
+ (insertcopying EOL-ARGS . ())
+ (dircategory EOL-ARGS . (category))
+ (top EOL-ARGS . (title))
+ (printindex EOL-ARGS . (type))
+
+ ;; EOL text commands
+ (*ENVIRON-ARGS* EOL-TEXT)
+ (itemx EOL-TEXT)
+ (set EOL-TEXT)
+ (center EOL-TEXT)
+ (title EOL-TEXT)
+ (subtitle EOL-TEXT)
+ (author EOL-TEXT)
+ (chapter EOL-TEXT)
+ (section EOL-TEXT)
+ (appendix EOL-TEXT)
+ (appendixsec EOL-TEXT)
+ (unnumbered EOL-TEXT)
+ (unnumberedsec EOL-TEXT)
+ (subsection EOL-TEXT)
+ (subsubsection EOL-TEXT)
+ (appendixsubsec EOL-TEXT)
+ (appendixsubsubsec EOL-TEXT)
+ (unnumberedsubsec EOL-TEXT)
+ (unnumberedsubsubsec EOL-TEXT)
+ (chapheading EOL-TEXT)
+ (majorheading EOL-TEXT)
+ (heading EOL-TEXT)
+ (subheading EOL-TEXT)
+ (subsubheading EOL-TEXT)
+
+ (deftpx EOL-TEXT-ARGS . (category name . attributes))
+ (defcvx EOL-TEXT-ARGS . (category class name))
+ (defivarx EOL-TEXT-ARGS . (class name))
+ (deftypeivarx EOL-TEXT-ARGS . (class data-type name))
+ (defopx EOL-TEXT-ARGS . (category class name . arguments))
+ (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments))
+ (defmethodx EOL-TEXT-ARGS . (class name . arguments))
+ (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments))
+ (defoptx EOL-TEXT-ARGS . (name))
+ (defvrx EOL-TEXT-ARGS . (category name))
+ (defvarx EOL-TEXT-ARGS . (name))
+ (deftypevrx EOL-TEXT-ARGS . (category data-type name))
+ (deftypevarx EOL-TEXT-ARGS . (data-type name))
+ (deffnx EOL-TEXT-ARGS . (category name . arguments))
+ (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments))
+ (defspecx EOL-TEXT-ARGS . (name . arguments))
+ (defmacx EOL-TEXT-ARGS . (name . arguments))
+ (defunx EOL-TEXT-ARGS . (name . arguments))
+ (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments))
+
+ ;; Indexing commands
+ (cindex INDEX . entry)
+ (findex INDEX . entry)
+ (vindex INDEX . entry)
+ (kindex INDEX . entry)
+ (pindex INDEX . entry)
+ (tindex INDEX . entry)
+
+ ;; Environment commands (those that need @end)
+ (texinfo ENVIRON . title)
+ (ignore ENVIRON . ())
+ (ifinfo ENVIRON . ())
+ (iftex ENVIRON . ())
+ (ifhtml ENVIRON . ())
+ (ifxml ENVIRON . ())
+ (ifplaintext ENVIRON . ())
+ (ifnotinfo ENVIRON . ())
+ (ifnottex ENVIRON . ())
+ (ifnothtml ENVIRON . ())
+ (ifnotxml ENVIRON . ())
+ (ifnotplaintext ENVIRON . ())
+ (titlepage ENVIRON . ())
+ (menu ENVIRON . ())
+ (direntry ENVIRON . ())
+ (copying ENVIRON . ())
+ (example ENVIRON . ())
+ (smallexample ENVIRON . ())
+ (display ENVIRON . ())
+ (smalldisplay ENVIRON . ())
+ (verbatim ENVIRON . ())
+ (format ENVIRON . ())
+ (smallformat ENVIRON . ())
+ (lisp ENVIRON . ())
+ (smalllisp ENVIRON . ())
+ (cartouche ENVIRON . ())
+ (quotation ENVIRON . ())
+
+ (deftp ENVIRON . (category name . attributes))
+ (defcv ENVIRON . (category class name))
+ (defivar ENVIRON . (class name))
+ (deftypeivar ENVIRON . (class data-type name))
+ (defop ENVIRON . (category class name . arguments))
+ (deftypeop ENVIRON . (category class data-type name . arguments))
+ (defmethod ENVIRON . (class name . arguments))
+ (deftypemethod ENVIRON . (class data-type name . arguments))
+ (defopt ENVIRON . (name))
+ (defvr ENVIRON . (category name))
+ (defvar ENVIRON . (name))
+ (deftypevr ENVIRON . (category data-type name))
+ (deftypevar ENVIRON . (data-type name))
+ (deffn ENVIRON . (category name . arguments))
+ (deftypefn ENVIRON . (category data-type name . arguments))
+ (defspec ENVIRON . (name . arguments))
+ (defmac ENVIRON . (name . arguments))
+ (defun ENVIRON . (name . arguments))
+ (deftypefun ENVIRON . (data-type name . arguments))
+
+ (table TABLE-ENVIRON . (formatter))
+ (itemize TABLE-ENVIRON . (formatter))
+ (enumerate TABLE-ENVIRON . (start))
+ (ftable TABLE-ENVIRON . (formatter))
+ (vtable TABLE-ENVIRON . (formatter))))
+
+(define command-depths
+ '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4)
+ (top . 0) (unnumbered . 1) (unnumberedsec . 2)
+ (unnumberedsubsec . 3) (unnumberedsubsubsec . 4)
+ (appendix . 1) (appendixsec . 2) (appendixsection . 2)
+ (appendixsubsec . 3) (appendixsubsubsec . 4)))
+(define (texi-command-depth command max-depth)
+ "Given the texinfo command @var{command}, return its nesting level, or
+@code{#f} if it nests too deep for @var{max-depth}.
+
+Examples:
+@example
+(texi-command-depth 'chapter 4) @result{} 1
+(texi-command-depth 'top 4) @result{} 0
+(texi-command-depth 'subsection 4) @result{} 3
+(texi-command-depth 'appendixsubsec 4) @result{} 3
+(texi-command-depth 'subsection 2) @result{} #f
+@end example"
+ (let ((depth (and=> (assq command command-depths) cdr)))
+ (and depth (<= depth max-depth) depth)))
+
+;; The % is for arguments
+(define (space-significant? command)
+ (memq command
+ '(example smallexample verbatim lisp smalllisp menu %)))
+
+;; Like a DTD for texinfo
+(define (command-spec command)
+ (or (assq command texi-command-specs)
+ (parser-error #f "Unknown command" command)))
+
+(define (inline-content? content)
+ (or (eq? content 'INLINE-TEXT) (eq? content 'INLINE-ARGS)))
+
+
+;;========================================================================
+;; Lower-level parsers and scanners
+;;
+;; They deal with primitive lexical units (Names, whitespaces, tags) and
+;; with pieces of more generic productions. Most of these parsers must
+;; be called in appropriate context. For example, complete-start-command
+;; must be called only when the @-command start has been detected and
+;; its name token has been read.
+
+;; Test if a string is made of only whitespace
+;; An empty string is considered made of whitespace as well
+(define (string-whitespace? str)
+ (or (string-null? str)
+ (string-every char-whitespace? str)))
+
+;; Like read-text-line, but allows EOF.
+(define read-eof-breaks '(*eof* #\return #\newline))
+(define (read-eof-line port)
+ (if (eof-object? (peek-char port))
+ (peek-char port)
+ (let* ((line (next-token '() read-eof-breaks
+ "reading a line" port))
+ (c (read-char port))) ; must be either \n or \r or EOF
+ (if (and (eq? c #\return) (eq? (peek-char port) #\newline))
+ (read-char port)) ; skip \n that follows \r
+ line)))
+
+(define ascii->char integer->char)
+
+(define (skip-whitespace port)
+ (skip-while '(#\space #\tab #\return #\newline) port))
+
+(define (skip-horizontal-whitespace port)
+ (skip-while '(#\space #\tab) port))
+
+;; command ::= Letter+
+
+;; procedure: read-command PORT
+;;
+;; Read a command starting from the current position in the PORT and
+;; return it as a symbol.
+(define (read-command port)
+ (let ((first-char (peek-char port)))
+ (or (char-alphabetic? first-char)
+ (parser-error port "Nonalphabetic @-command char: '" first-char "'")))
+ (string->symbol
+ (next-token-of
+ (lambda (c)
+ (cond
+ ((eof-object? c) #f)
+ ((char-alphabetic? c) c)
+ (else #f)))
+ port)))
+
+;; A token is a primitive lexical unit. It is a record with two fields,
+;; token-head and token-kind.
+;;
+;; Token types:
+;; END The end of a texinfo command. If the command is ended by },
+;; token-head will be #f. Otherwise if the command is ended by
+;; @end COMMAND, token-head will be COMMAND. As a special case,
+;; @bye is the end of a special @texinfo command.
+;; START The start of a texinfo command. The token-head will be a
+;; symbol of the @-command name.
+;; INCLUDE An @include directive. The token-head will be empty -- the
+;; caller is responsible for reading the include file name.
+;; ITEM @item commands have an irregular syntax. They end at the
+;; next @item, or at the end of the environment. For that
+;; read-command-token treats them specially.
+
+(define (make-token kind head) (cons kind head))
+(define token? pair?)
+(define token-kind car)
+(define token-head cdr)
+
+;; procedure: read-command-token PORT
+;;
+;; This procedure starts parsing of a command token. The current
+;; position in the stream must be #\@. This procedure scans enough of
+;; the input stream to figure out what kind of a command token it is
+;; seeing. The procedure returns a token structure describing the token.
+
+(define (read-command-token port)
+ (assert-curr-char '(#\@) "start of the command" port)
+ (let ((peeked (peek-char port)))
+ (cond
+ ((memq peeked '(#\! #\. #\? #\@ #\\ #\{ #\}))
+ ;; @-commands that escape characters
+ (make-token 'STRING (string (read-char port))))
+ (else
+ (let ((name (read-command port)))
+ (case name
+ ((end)
+ ;; got an ending tag
+ (let ((command (string-trim-both
+ (read-eof-line port))))
+ (or (and (not (string-null? command))
+ (string-every char-alphabetic? command))
+ (parser-error port "malformed @end" command))
+ (make-token 'END (string->symbol command))))
+ ((bye)
+ ;; the end of the top
+ (make-token 'END 'texinfo))
+ ((item)
+ (make-token 'ITEM 'item))
+ ((include)
+ (make-token 'INCLUDE #f))
+ (else
+ (make-token 'START name))))))))
+
+;; procedure+: read-verbatim-body PORT STR-HANDLER SEED
+;;
+;; This procedure must be called after we have read a string
+;; "@verbatim\n" that begins a verbatim section. The current position
+;; must be the first position of the verbatim body. This function reads
+;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a
+;; character data consumer.
+;;
+;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED.
+;; The first STRING1 argument to STR-HANDLER never contains a newline.
+;; The second STRING2 argument often will. On the first invocation of the
+;; STR-HANDLER, the seed is the one passed to read-verbatim-body
+;; as the third argument. The result of this first invocation will be
+;; passed as the seed argument to the second invocation of the line
+;; consumer, and so on. The result of the last invocation of the
+;; STR-HANDLER is returned by the read-verbatim-body. Note a
+;; similarity to the fundamental 'fold' iterator.
+;;
+;; Within a verbatim section all characters are taken at their face
+;; value. It ends with "\n@end verbatim(\r)?\n".
+
+;; Must be called right after the newline after @verbatim.
+(define (read-verbatim-body port str-handler seed)
+ (let loop ((seed seed))
+ (let ((fragment (next-token '() '(#\newline)
+ "reading verbatim" port)))
+ ;; We're reading the char after the 'fragment', which is
+ ;; #\newline.
+ (read-char port)
+ (if (string=? fragment "@end verbatim")
+ seed
+ (loop (str-handler fragment "\n" seed))))))
+
+;; procedure+: read-arguments PORT
+;;
+;; This procedure reads and parses a production ArgumentList.
+;; ArgumentList ::= S* Argument (S* , S* Argument)* S*
+;; Argument ::= ([^@{},])*
+;;
+;; Arguments are the things in braces, i.e @ref{my node} has one
+;; argument, "my node". Most commands taking braces actually don't have
+;; arguments, they process text. For example, in
+;; @emph{@strong{emphasized}}, the emph takes text, because the parse
+;; continues into the braces.
+;;
+;; Any whitespace within Argument is replaced with a single space.
+;; Whitespace around an Argument is trimmed.
+;;
+;; The procedure returns a list of arguments. Afterwards the current
+;; character will be after the final #\}.
+
+(define (read-arguments port stop-char)
+ (define (split str)
+ (read-char port) ;; eat the delimiter
+ (let ((ret (map (lambda (x) (if (string-null? x) #f x))
+ (map string-trim-both (string-split str #\,)))))
+ (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret)))
+ '()
+ ret)))
+ (split (next-token '() (list stop-char)
+ "arguments of @-command" port)))
+
+;; procedure+: complete-start-command COMMAND PORT
+;;
+;; This procedure is to complete parsing of an @-command. The procedure
+;; must be called after the command token has been read. COMMAND is a
+;; TAG-NAME.
+;;
+;; This procedure returns several values:
+;; COMMAND: a symbol.
+;; ARGUMENTS: command's arguments, as an alist.
+;; CONTENT-MODEL: the content model of the command.
+;;
+;; On exit, the current position in PORT will depend on the CONTENT-MODEL.
+;;
+;; Content model Port position
+;; ============= =============
+;; INLINE-TEXT One character after the #\{.
+;; INLINE-ARGS The first character after the #\}.
+;; EOL-TEXT The first non-whitespace character after the command.
+;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT
+;; The first character on the next line.
+;; PARAGRAPH, ITEM, EMPTY-COMMAND
+;; The first character after the command.
+
+(define (arguments->attlist port args arg-names)
+ (let loop ((in args) (names arg-names) (opt? #f) (out '()))
+ (cond
+ ((symbol? names) ;; a rest arg
+ (reverse (if (null? in) out (acons names in out))))
+ ((and (not (null? names)) (eq? (car names) #:opt))
+ (loop in (cdr names) #t out))
+ ((null? in)
+ (if (or (null? names) opt?)
+ (reverse out)
+ (parser-error port "@-command expected more arguments:"
+ args arg-names names)))
+ ((null? names)
+ (parser-error port "@-command didn't expect more arguments:" in))
+ ((not (car in))
+ (or (and opt? (loop (cdr in) (cdr names) opt? out))
+ (parser-error "@-command missing required argument"
+ (car names))))
+ (else
+ (loop (cdr in) (cdr names) opt?
+ (cons (list (car names) (car in)) out))))))
+
+(define (parse-table-args command port)
+ (let* ((line (string-trim-both (read-text-line port)))
+ (length (string-length line)))
+ (define (get-formatter)
+ (or (and (not (zero? length))
+ (eq? (string-ref line 0) #\@)
+ (let ((f (string->symbol (substring line 1))))
+ (or (inline-content? (cadr (command-spec f)))
+ (parser-error
+ port "@item formatter must be INLINE" f))
+ f))
+ (parser-error "Invalid @item formatter" line)))
+ (case command
+ ((enumerate)
+ (if (zero? length)
+ '()
+ `((start
+ ,(if (or (and (eq? length 1)
+ (char-alphabetic? (string-ref line 0)))
+ (string-every char-numeric? line))
+ line
+ (parser-error
+ port "Invalid enumerate start" line))))))
+ ((itemize)
+ `((bullet
+ ,(or (and (eq? length 1) line)
+ (and (string-null? line) '(bullet))
+ (list (get-formatter))))))
+ (else ;; tables of various varieties
+ `((formatter (,(get-formatter))))))))
+
+(define (complete-start-command command port)
+ (define (get-arguments type arg-names stop-char)
+ (arguments->attlist port (read-arguments port stop-char) arg-names))
+
+ (let* ((spec (command-spec command))
+ (type (cadr spec))
+ (arg-names (cddr spec)))
+ (case type
+ ((INLINE-TEXT)
+ (assert-curr-char '(#\{) "Inline element lacks {" port)
+ (values command '() type))
+ ((INLINE-ARGS)
+ (assert-curr-char '(#\{) "Inline element lacks {" port)
+ (values command (get-arguments type arg-names #\}) type))
+ ((EOL-ARGS)
+ (values command (get-arguments type arg-names #\newline) type))
+ ((ENVIRON ENTRY INDEX)
+ (skip-horizontal-whitespace port)
+ (values command (parse-environment-args command port) type))
+ ((TABLE-ENVIRON)
+ (skip-horizontal-whitespace port)
+ (values command (parse-table-args command port) type))
+ ((EOL-TEXT)
+ (skip-horizontal-whitespace port)
+ (values command '() type))
+ ((EOL-TEXT-ARGS)
+ (skip-horizontal-whitespace port)
+ (values command (parse-eol-text-args command port) type))
+ ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT)
+ (values command '() type))
+ (else ;; INCLUDE shouldn't get here
+ (parser-error port "can't happen")))))
+
+;;-----------------------------------------------------------------------------
+;; Higher-level parsers and scanners
+;;
+;; They parse productions corresponding entire @-commands.
+
+;; Only reads @settitle, leaves it to the command parser to finish
+;; reading the title.
+(define (take-until-settitle port)
+ (or (find-string-from-port? "\n@settitle " port)
+ (parser-error port "No \\n@settitle found"))
+ (skip-horizontal-whitespace port)
+ (and (eq? (peek-char port) #\newline)
+ (parser-error port "You have a @settitle, but no title")))
+
+;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED
+;;
+;; This procedure is to read the CharData of a texinfo document.
+;;
+;; text ::= (CharData | Command)*
+;;
+;; The procedure reads CharData and stops at @-commands (or
+;; environments). It also stops at an open or close brace.
+;;
+;; port
+;; a PORT to read
+;; expect-eof?
+;; a boolean indicating if EOF is normal, i.e., the character
+;; data may be terminated by the EOF. EOF is normal
+;; while processing the main document.
+;; preserve-ws?
+;; a boolean indicating if we are within a whitespace-preserving
+;; environment. If #t, suppress paragraph detection.
+;; str-handler
+;; a STR-HANDLER, see read-verbatim-body
+;; seed
+;; an argument passed to the first invocation of STR-HANDLER.
+;;
+;; The procedure returns two results: SEED and TOKEN. The SEED is the
+;; result of the last invocation of STR-HANDLER, or the original seed if
+;; STR-HANDLER was never called.
+;;
+;; TOKEN can be either an eof-object (this can happen only if expect-eof?
+;; was #t), or a texinfo token denoting the start or end of a tag.
+
+;; read-char-data port expect-eof? preserve-ws? str-handler seed
+(define read-char-data
+ (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline)))
+ (define (handle str-handler str1 str2 seed)
+ (if (and (string-null? str1) (string-null? str2))
+ seed
+ (str-handler str1 str2 seed)))
+
+ (lambda (port expect-eof? preserve-ws? str-handler seed)
+ (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof)))
+ (let loop ((seed seed))
+ (let* ((fragment (next-token '() end-chars "reading char data" port))
+ (term-char (peek-char port))) ; one of end-chars
+ (cond
+ ((eof-object? term-char) ; only if expect-eof?
+ (values (handle str-handler fragment "" seed) term-char))
+ ((memq term-char '(#\@ #\{ #\}))
+ (values (handle str-handler fragment "" seed)
+ (case term-char
+ ((#\@) (read-command-token port))
+ ((#\{) (make-token 'START '*braces*))
+ ((#\}) (read-char port) (make-token 'END #f)))))
+ ((eq? term-char #\newline)
+ ;; Always significant, unless directly before an end token.
+ (let ((c (peek-next-char port)))
+ (cond
+ ((eof-object? c)
+ (or expect-eof?
+ (parser-error port "EOF while reading char data"))
+ (values (handle str-handler fragment "" seed) c))
+ ((eq? c #\@)
+ (let* ((token (read-command-token port))
+ (end? (eq? (token-kind token) 'END)))
+ (values
+ (handle str-handler fragment (if end? "" " ") seed)
+ token)))
+ ((and (not preserve-ws?) (eq? c #\newline))
+ ;; paragraph-separator ::= #\newline #\newline+
+ (skip-while '(#\newline) port)
+ (skip-horizontal-whitespace port)
+ (values (handle str-handler fragment "" seed)
+ (make-token 'PARA 'para)))
+ (else
+ (loop (handle str-handler fragment
+ (if preserve-ws? "\n" " ") seed)))))))))))))
+
+; procedure+: assert-token TOKEN KIND NAME
+; Make sure that TOKEN is of anticipated KIND and has anticipated NAME
+(define (assert-token token kind name)
+ (or (and (token? token)
+ (eq? kind (token-kind token))
+ (equal? name (token-head token)))
+ (parser-error #f "Expecting @end for " name ", got " token)))
+
+;;========================================================================
+;; Highest-level parsers: Texinfo to SXML
+
+;; These parsers are a set of syntactic forms to instantiate a SSAX
+;; parser. The user tells what to do with the parsed character and
+;; element data. These latter handlers determine if the parsing follows a
+;; SAX or a DOM model.
+
+;; syntax: make-command-parser fdown fup str-handler
+
+;; Create a parser to parse and process one element, including its
+;; character content or children elements. The parser is typically
+;; applied to the root element of a document.
+
+;; fdown
+;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED
+;;
+;; This procedure is to generate the seed to be passed to handlers
+;; that process the content of the element. This is the function
+;; identified as 'fdown' in the denotational semantics of the XML
+;; parser given in the title comments to (sxml ssax).
+;;
+;; fup
+;; procedure COMMAND ARGUMENTS PARENT-SEED SEED
+;;
+;; This procedure is called when parsing of COMMAND is finished.
+;; The SEED is the result from the last content parser (or from
+;; fdown if the element has the empty content). PARENT-SEED is the
+;; same seed as was passed to fdown. The procedure is to generate a
+;; seed that will be the result of the element parser. This is the
+;; function identified as 'fup' in the denotational semantics of
+;; the XML parser given in the title comments to (sxml ssax).
+;;
+;; str-handler
+;; A STR-HANDLER, see read-verbatim-body
+;;
+
+;; The generated parser is a
+;; procedure COMMAND PORT SEED
+;;
+;; The procedure must be called *after* the command token has been read.
+
+(define (read-include-file-name port)
+ (let ((x (string-trim-both (read-eof-line port))))
+ (if (string-null? x)
+ (error "no file listed")
+ x))) ;; fixme: should expand @value{} references
+
+(define (sxml->node-name sxml)
+ "Turn some sxml string into a valid node name."
+ (let loop ((in (string->list (sxml->string sxml))) (out '()))
+ (if (null? in)
+ (apply string (reverse out))
+ (if (memq (car in) '(#\{ #\} #\@ #\,))
+ (loop (cdr in) out)
+ (loop (cdr in) (cons (car in) out))))))
+
+(define (index command arguments fdown fup parent-seed)
+ (case command
+ ((deftp defcv defivar deftypeivar defop deftypeop defmethod
+ deftypemethod defopt defvr defvar deftypevr deftypevar deffn
+ deftypefn defspec defmac defun deftypefun)
+ (let ((args `((name ,(string-append (symbol->string command) "-"
+ (cadr (assq 'name arguments)))))))
+ (fup 'anchor args parent-seed
+ (fdown 'anchor args 'INLINE-ARGS '()))))
+ ((cindex findex vindex kindex pindex tindex)
+ (let ((args `((name ,(string-append (symbol->string command) "-"
+ (sxml->node-name
+ (assq 'entry arguments)))))))
+ (fup 'anchor args parent-seed
+ (fdown 'anchor args 'INLINE-ARGS '()))))
+ (else parent-seed)))
+
+(define (make-command-parser fdown fup str-handler)
+ (lambda (command port seed)
+ (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed))
+ (let*-values (((command arguments expected-content)
+ (complete-start-command command port)))
+ (let* ((parent-seed (index command arguments fdown fup parent-seed))
+ (seed (fdown command arguments expected-content parent-seed))
+ (eof-closes? (or (memq command '(texinfo para *fragment*))
+ (eq? expected-content 'EOL-TEXT)))
+ (sig-ws? (or sig-ws? (space-significant? command)))
+ (up (lambda (s) (fup command arguments parent-seed s)))
+ (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s)))
+ (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s)))))
+
+ (define (port-for-content)
+ (if (eq? expected-content 'EOL-TEXT)
+ (call-with-input-string (read-text-line port) identity)
+ port))
+
+ (cond
+ ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX
+ EOL-TEXT-ARGS))
+ ;; empty or finished by complete-start-command
+ (up seed))
+ ((eq? command 'verbatim)
+ (up (read-verbatim-body port str-handler seed)))
+ (else
+ (let loop ((port (port-for-content))
+ (expect-eof? eof-closes?)
+ (end-para identity)
+ (need-break? (and (not sig-ws?)
+ (memq expected-content
+ '(ENVIRON TABLE-ENVIRON
+ ENTRY ITEM FRAGMENT))))
+ (seed seed))
+ (cond
+ ((and need-break? (or sig-ws? (skip-whitespace port))
+ (not (memq (peek-char port) '(#\@ #\})))
+ (not (eof-object? (peek-char port))))
+ ;; Even if we have an @, it might be inline -- check
+ ;; that later
+ (let ((seed (end-para seed)))
+ (loop port expect-eof? (make-end-para seed) #f
+ (new-para seed))))
+ (else
+ (let*-values (((seed token)
+ (read-char-data
+ port expect-eof? sig-ws? str-handler seed)))
+ (cond
+ ((eof-object? token)
+ (case expect-eof?
+ ((include #f) (end-para seed))
+ (else (up (end-para seed)))))
+ (else
+ (case (token-kind token)
+ ((STRING)
+ ;; this is only @-commands that escape
+ ;; characters: @}, @@, @{ -- new para if need-break
+ (let ((seed ((if need-break? end-para identity) seed)))
+ (loop port expect-eof?
+ (if need-break? (make-end-para seed) end-para) #f
+ (str-handler (token-head token) ""
+ ((if need-break? new-para identity)
+ seed)))))
+ ((END)
+ ;; The end will only have a name if it's for an
+ ;; environment
+ (cond
+ ((memq command '(item entry))
+ (let ((spec (command-spec (token-head token))))
+ (or (eq? (cadr spec) 'TABLE-ENVIRON)
+ (parser-error
+ port "@item not ended by @end table/enumerate/itemize"
+ token))))
+ ((eq? expected-content 'ENVIRON)
+ (assert-token token 'END command)))
+ (up (end-para seed)))
+ ((ITEM)
+ (cond
+ ((memq command '(enumerate itemize))
+ (up (visit 'item port sig-ws? (end-para seed))))
+ ((eq? expected-content 'TABLE-ENVIRON)
+ (up (visit 'entry port sig-ws? (end-para seed))))
+ ((memq command '(item entry))
+ (visit command port sig-ws? (up (end-para seed))))
+ (else
+ (parser-error
+ port "@item must be within a table environment"
+ command))))
+ ((PARA)
+ ;; examine valid paragraphs?
+ (loop port expect-eof? end-para (not sig-ws?) seed))
+ ((INCLUDE)
+ ;; Recurse for include files
+ (let ((seed (call-with-file-and-dir
+ (read-include-file-name port)
+ (lambda (port)
+ (loop port 'include end-para
+ need-break? seed)))))
+ (loop port expect-eof? end-para need-break? seed)))
+ ((START) ; Start of an @-command
+ (let* ((head (token-head token))
+ (type (cadr (command-spec head)))
+ (inline? (inline-content? type))
+ (seed ((if (and inline? (not need-break?))
+ identity end-para) seed))
+ (end-para (if inline?
+ (if need-break? (make-end-para seed)
+ end-para)
+ identity))
+ (new-para (if (and inline? need-break?)
+ new-para identity)))
+ (loop port expect-eof? end-para (not inline?)
+ (visit head port sig-ws? (new-para seed)))))
+ (else
+ (parser-error port "Unknown token type" token))))))))))))))))
+
+;; procedure: reverse-collect-str-drop-ws fragments
+;;
+;; Given the list of fragments (some of which are text strings), reverse
+;; the list and concatenate adjacent text strings. We also drop
+;; "unsignificant" whitespace, that is, whitespace in front, behind and
+;; between elements. The whitespace that is included in character data
+;; is not affected.
+(define (reverse-collect-str-drop-ws fragments)
+ (cond
+ ((null? fragments) ; a shortcut
+ '())
+ ((and (string? (car fragments)) ; another shortcut
+ (null? (cdr fragments)) ; remove single ws-only string
+ (string-whitespace? (car fragments)))
+ '())
+ (else
+ (let loop ((fragments fragments) (result '()) (strs '())
+ (all-whitespace? #t))
+ (cond
+ ((null? fragments)
+ (if all-whitespace?
+ result ; remove leading ws
+ (cons (apply string-append strs) result)))
+ ((string? (car fragments))
+ (loop (cdr fragments) result (cons (car fragments) strs)
+ (and all-whitespace?
+ (string-whitespace? (car fragments)))))
+ (else
+ (loop (cdr fragments)
+ (cons
+ (car fragments)
+ (cond
+ ((null? strs) result)
+ (all-whitespace?
+ (if (null? result)
+ result ; remove trailing whitespace
+ (cons " " result))); replace interstitial ws with
+ ; one space
+ (else
+ (cons (apply string-append strs) result))))
+ '() #t)))))))
+
+(define (make-dom-parser)
+ (make-command-parser
+ (lambda (command args content seed) ; fdown
+ '())
+ (lambda (command args parent-seed seed) ; fup
+ (let ((seed (reverse-collect-str-drop-ws seed)))
+ (acons command
+ (if (null? args) seed (acons '% args seed))
+ parent-seed)))
+ (lambda (string1 string2 seed) ; str-handler
+ (if (string-null? string2)
+ (cons string1 seed)
+ (cons* string2 string1 seed)))))
+
+(define parse-environment-args
+ (let ((parser (make-dom-parser)))
+ ;; duplicate arguments->attlist to avoid unnecessary splitting
+ (lambda (command port)
+ (let ((args (cdar (parser '*ENVIRON-ARGS* port '())))
+ (arg-names (cddr (command-spec command))))
+ (cond
+ ((not arg-names)
+ (if (null? args) '()
+ (parser-error port "@-command doesn't take args" command)))
+ ((eq? arg-names #t)
+ (list (cons 'arguments args)))
+ (else
+ (let loop ((args args) (arg-names arg-names) (out '()))
+ (cond
+ ((null? arg-names)
+ (if (null? args) (reverse! out)
+ (parser-error port "@-command didn't expect more args"
+ command args)))
+ ((symbol? arg-names)
+ (reverse! (acons arg-names args out)))
+ ((null? args)
+ (parser-error port "@-command expects more args"
+ command arg-names))
+ ((and (string? (car args)) (string-index (car args) #\space))
+ => (lambda (i)
+ (let ((rest (substring/shared (car args) (1+ i))))
+ (if (zero? i)
+ (loop (cons rest (cdr args)) arg-names out)
+ (loop (cons rest (cdr args)) (cdr arg-names)
+ (cons (list (car arg-names)
+ (substring (car args) 0 i))
+ out))))))
+ (else
+ (loop (cdr args) (cdr arg-names)
+ (if (and (pair? (car args)) (eq? (caar args) '*braces*))
+ (acons (car arg-names) (cdar args) out)
+ (cons (list (car arg-names) (car args)) out))))))))))))
+
+(define (parse-eol-text-args command port)
+ ;; perhaps parse-environment-args should be named more
+ ;; generically.
+ (parse-environment-args command port))
+
+;; procedure: texi-fragment->stexi STRING
+;;
+;; A DOM parser for a texinfo fragment STRING.
+;;
+;; The procedure returns an SXML tree headed by the special tag,
+;; *fragment*.
+
+(define (texi-fragment->stexi string-or-port)
+ "Parse the texinfo commands in @var{string-or-port}, and return the
+resultant stexi tree. The head of the tree will be the special command,
+@code{*fragment*}."
+ (define (parse port)
+ (postprocess (car ((make-dom-parser) '*fragment* port '()))))
+ (if (input-port? string-or-port)
+ (parse string-or-port)
+ (call-with-input-string string-or-port parse)))
+
+;; procedure: texi->stexi PORT
+;;
+;; This is an instance of a SSAX parser above that returns an SXML
+;; representation of the texinfo document ready to be read at PORT.
+;;
+;; The procedure returns an SXML tree. The port points to the
+;; first character after the @bye, or to the end of the file.
+
+(define (texi->stexi port)
+ "Read a full texinfo document from @var{port} and return the parsed
+stexi tree. The parsing will start at the @code{@@settitle} and end at
+@code{@@bye} or EOF."
+ (let ((parser (make-dom-parser)))
+ (take-until-settitle port)
+ (postprocess (car (parser 'texinfo port '())))))
+
+(define (car-eq? x y) (and (pair? x) (eq? (car x) y)))
+(define (make-contents tree)
+ (define (lp in out depth)
+ (cond
+ ((null? in) (values in (cons 'enumerate (reverse! out))))
+ ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4))
+ => (lambda (new-depth)
+ (let ((node-name (and (car-eq? (car in) 'node)
+ (cadr (assq 'name (cdadar in))))))
+ (cond
+ ((< new-depth depth)
+ (values in (cons 'enumerate (reverse! out))))
+ ((> new-depth depth)
+ (let ((out-cdr (if (null? out) '() (cdr out)))
+ (out-car (if (null? out) (list 'item) (car out))))
+ (let*-values (((new-in new-out) (lp in '() (1+ depth))))
+ (lp new-in
+ (cons (append out-car (list new-out)) out-cdr)
+ depth))))
+ (else ;; same depth
+ (lp (cddr in)
+ (cons
+ `(item (para
+ ,@(if node-name
+ `((ref (% (node ,node-name))))
+ (cdadr in))))
+ out)
+ depth))))))
+ (else (lp (cdr in) out depth))))
+ (let*-values (((_ contents) (lp tree '() 1)))
+ `((chapheading "Table of Contents") ,contents)))
+
+(define (trim-whitespace str trim-left? trim-right?)
+ (let* ((left-space? (and (not trim-left?)
+ (string-prefix? " " str)))
+ (right-space? (and (not trim-right?)
+ (string-suffix? " " str)))
+ (tail (append! (string-tokenize str)
+ (if right-space? '("") '()))))
+ (string-join (if left-space? (cons "" tail) tail))))
+
+(define (postprocess tree)
+ (define (loop in out state first? sig-ws?)
+ (cond
+ ((null? in)
+ (values (reverse! out) state))
+ ((string? (car in))
+ (loop (cdr in)
+ (cons (if sig-ws? (car in)
+ (trim-whitespace (car in) first? (null? (cdr in))))
+ out)
+ state #f sig-ws?))
+ ((pair? (car in))
+ (case (caar in)
+ ((set)
+ (if (null? (cdar in)) (error "@set missing arguments" in))
+ (if (string? (cadar in))
+ (let ((i (string-index (cadar in) #\space)))
+ (if i
+ (loop (cdr in) out
+ (acons (substring (cadar in) 0 i)
+ (cons (substring (cadar in) (1+ i)) (cddar in))
+ state)
+ #f sig-ws?)
+ (loop (cdr in) out (acons (cadar in) (cddar in) state)
+ #f sig-ws?)))
+ (error "expected a constant to define for @set" in)))
+ ((value)
+ (loop (fold-right cons (cdr in)
+ (or (and=>
+ (assoc (cadr (assq 'key (cdadar in))) state) cdr)
+ (error "unknown value" (cdadar in) state)))
+ out
+ state #f sig-ws?))
+ ((copying)
+ (loop (cdr in) out (cons (car in) state) #f sig-ws?))
+ ((insertcopying)
+ (loop (fold-right cons (cdr in)
+ (or (cdr (assoc 'copying state))
+ (error "copying isn't set yet")))
+ out
+ state #f sig-ws?))
+ ((contents)
+ (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?))
+ (else
+ (let*-values (((kid-out state)
+ (loop (car in) '() state #t
+ (or sig-ws? (space-significant? (caar in))))))
+ (loop (cdr in) (cons kid-out out) state #f sig-ws?)))))
+ (else ; a symbol
+ (loop (cdr in) (cons (car in) out) state #t sig-ws?))))
+
+ (call-with-values
+ (lambda () (loop tree '() '() #t #f))
+ (lambda (out state) out)))
+
+;; Replace % with texinfo-arguments.
+(define (stexi->sxml tree)
+ "Transform the stexi tree @var{tree} into sxml. This involves
+replacing the @code{%} element that keeps the texinfo arguments with an
+element for each argument.
+
+FIXME: right now it just changes % to @code{texinfo-arguments} -- that
+doesn't hang with the idea of making a dtd at some point"
+ (pre-post-order
+ tree
+ `((% . ,(lambda (x . t) (cons 'texinfo-arguments t)))
+ (*text* . ,(lambda (x t) t))
+ (*default* . ,(lambda (x . t) (cons x t))))))
+
+;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5
+;;; texinfo.scm ends here
diff --git a/module/texinfo/docbook.scm b/module/texinfo/docbook.scm
new file mode 100644
index 000000000..f760e5bd6
--- /dev/null
+++ b/module/texinfo/docbook.scm
@@ -0,0 +1,233 @@
+;;;; (texinfo docbook) -- translating sdocbook into stexinfo
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2007, 2009 Andy Wingo <wingo at pobox dot com>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;; @c
+;; This module exports procedures for transforming a limited subset of
+;; the SXML representation of docbook into stexi. It is not complete by
+;; any means. The intention is to gather a number of routines and
+;; stylesheets so that external modules can parse specific subsets of
+;; docbook, for example that set generated by certain tools.
+;;
+;;; Code:
+
+(define-module (texinfo docbook)
+ :use-module (sxml fold)
+ :export (*sdocbook->stexi-rules*
+ *sdocbook-block-commands*
+ sdocbook-flatten
+ filter-empty-elements
+ replace-titles))
+
+(define (identity . args)
+ args)
+
+(define (identity-deattr tag . body)
+ `(,tag ,@(if (and (pair? body) (pair? (car body))
+ (eq? (caar body) '@))
+ (cdr body)
+ body)))
+
+(define (detag-one tag body)
+ body)
+
+(define tag-replacements
+ '((parameter var)
+ (replaceable var)
+ (type code)
+ (function code)
+ (literal samp)
+ (emphasis emph)
+ (simpara para)
+ (programlisting example)
+ (firstterm dfn)
+ (filename file)
+ (quote cite)
+ (application cite)
+ (symbol code)
+ (note cartouche)
+ (envar env)))
+
+(define ignore-list '())
+
+(define (stringify exp)
+ (with-output-to-string (lambda () (write exp))))
+
+(define *sdocbook->stexi-rules*
+ #;
+ "A stylesheet for use with SSAX's @code{pre-post-order}, which defines
+a number of generic rules for transforming docbook into texinfo."
+ `((@ *preorder* . ,identity)
+ (% *preorder* . ,identity)
+ (para . ,identity-deattr)
+ (orderedlist ((listitem
+ . ,(lambda (tag . body)
+ `(item ,@body))))
+ . ,(lambda (tag . body)
+ `(enumerate ,@body)))
+ (itemizedlist ((listitem
+ . ,(lambda (tag . body)
+ `(item ,@body))))
+ . ,(lambda (tag . body)
+ `(itemize ,@body)))
+ (term . ,detag-one)
+ (informalexample . ,detag-one)
+ (section . ,identity)
+ (subsection . ,identity)
+ (subsubsection . ,identity)
+ (ulink . ,(lambda (tag attrs . body)
+ `(uref (% ,(assq 'url (cdr attrs))
+ (title ,@body)))))
+ (*text* . ,detag-one)
+ (*default* . ,(lambda (tag . body)
+ (let ((subst (assq tag tag-replacements)))
+ (cond
+ (subst
+ (if (and (pair? body) (pair? (car body)) (eq? (caar body) '@))
+ (begin
+ (warn "Ignoring" tag "attributes" (car body))
+ (append (cdr subst) (cdr body)))
+ (append (cdr subst) body)))
+ ((memq tag ignore-list) #f)
+ (else
+ (warn "Don't know how to convert" tag "to stexi")
+ `(c (% (all ,(stringify (cons tag body))))))))))))
+
+;; (variablelist
+;; ((varlistentry
+;; . ,(lambda (tag term . body)
+;; `(entry (% (heading ,@(cdr term))) ,@body)))
+;; (listitem
+;; . ,(lambda (tag simpara)
+;; simpara)))
+;; . ,(lambda (tag attrs . body)
+;; `(table (% (formatter (var))) ,@body)))
+
+(define *sdocbook-block-commands*
+ #;
+ "The set of sdocbook element tags that should not be nested inside
+each other. @xref{texinfo docbook sdocbook-flatten,,sdocbook-flatten},
+for more information."
+ '(para programlisting informalexample indexterm variablelist
+ orderedlist refsect1 refsect2 refsect3 refsect4 title example
+ note itemizedlist))
+
+(define (inline-command? command)
+ (not (memq command *sdocbook-block-commands*)))
+
+(define (sdocbook-flatten sdocbook)
+ "\"Flatten\" a fragment of sdocbook so that block elements do not nest
+inside each other.
+
+Docbook is a nested format, where e.g. a @code{refsect2} normally
+appears inside a @code{refsect1}. Logical divisions in the document are
+represented via the tree topology; a @code{refsect2} element
+@emph{contains} all of the elements in its section.
+
+On the contrary, texinfo is a flat format, in which sections are marked
+off by standalone section headers like @code{@@chapter}, and block
+elements do not nest inside each other.
+
+This function takes a nested sdocbook fragment @var{sdocbook} and
+flattens all of the sections, such that e.g.
+@example
+ (refsect1 (refsect2 (para \"Hello\")))
+@end example
+becomes
+@example
+ ((refsect1) (refsect2) (para \"Hello\"))
+@end example
+
+Oftentimes (always?) sectioning elements have @code{<title>} as their
+first element child; users interested in processing the @code{refsect*}
+elements into proper sectioning elements like @code{chapter} might be
+interested in @code{replace-titles} and @code{filter-empty-elements}.
+@xref{texinfo docbook replace-titles,,replace-titles}, and @ref{texinfo
+docbook filter-empty-elements,,filter-empty-elements}.
+
+Returns a nodeset, as described in @ref{sxml xpath}. That is to say,
+this function returns an untagged list of stexi elements."
+ (define (fhere str accum block cont)
+ (values (cons str accum)
+ block
+ cont))
+ (define (fdown node accum block cont)
+ (let ((command (car node))
+ (attrs (and (pair? (cdr node)) (pair? (cadr node))
+ (eq? (caadr node) '%)
+ (cadr node))))
+ (values (if attrs (cddr node) (cdr node))
+ '()
+ '()
+ (lambda (accum block)
+ (values
+ `(,command ,@(if attrs (list attrs) '())
+ ,@(reverse accum))
+ block)))))
+ (define (fup node paccum pblock pcont kaccum kblock kcont)
+ (call-with-values (lambda () (kcont kaccum kblock))
+ (lambda (ret block)
+ (if (inline-command? (car ret))
+ (values (cons ret paccum) (append kblock pblock) pcont)
+ (values paccum (append kblock (cons ret pblock)) pcont)))))
+ (call-with-values
+ (lambda () (foldts*-values fdown fup fhere sdocbook '() '() #f))
+ (lambda (accum block cont)
+ (reverse block))))
+
+(define (filter-empty-elements sdocbook)
+ "Filters out empty elements in an sdocbook nodeset. Mostly useful
+after running @code{sdocbook-flatten}."
+ (reverse
+ (fold
+ (lambda (x rest)
+ (if (and (pair? x) (null? (cdr x)))
+ rest
+ (cons x rest)))
+ '()
+ sdocbook)))
+
+(define (replace-titles sdocbook-fragment)
+ "Iterate over the sdocbook nodeset @var{sdocbook-fragment},
+transforming contiguous @code{refsect} and @code{title} elements into
+the appropriate texinfo sectioning command. Most useful after having run
+@code{sdocbook-flatten}.
+
+For example:
+@example
+ (replace-titles '((refsect1) (title \"Foo\") (para \"Bar.\")))
+ @result{} '((chapter \"Foo\") (para \"Bar.\"))
+@end example
+"
+ (define sections '((refsect1 . chapter)
+ (refsect2 . section)
+ (refsect3 . subsection)
+ (refsect4 . subsubsection)))
+ (let lp ((in sdocbook-fragment) (out '()))
+ (cond
+ ((null? in)
+ (reverse out))
+ ((and (pair? (car in)) (assq (caar in) sections))
+ ;; pull out the title
+ => (lambda (pair)
+ (lp (cddr in) (cons `(,(cdr pair) ,@(cdadr in)) out))))
+ (else
+ (lp (cdr in) (cons (car in) out))))))
diff --git a/module/texinfo/html.scm b/module/texinfo/html.scm
new file mode 100644
index 000000000..f9faf6a36
--- /dev/null
+++ b/module/texinfo/html.scm
@@ -0,0 +1,259 @@
+;;;; (texinfo html) -- translating stexinfo into shtml
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;This module implements transformation from @code{stexi} to HTML. Note
+;;that the output of @code{stexi->shtml} is actually SXML with the HTML
+;;vocabulary. This means that the output can be further processed, and
+;;that it must eventually be serialized by
+;;@ref{sxml simple sxml->xml,sxml->xml}.
+;;
+;;References (i.e., the @code{@@ref} family of commands) are resolved by
+;;a @dfn{ref-resolver}.
+;;@xref{texinfo html add-ref-resolver!,add-ref-resolver!}, for more
+;;information.
+;;
+;;; Code:
+
+;; TODO: nice ref resolving API, default CSS stylesheet (esp. to remove
+;; margin-top on dd > p)
+
+(define-module (texinfo html)
+ :use-module (texinfo)
+ :use-module (sxml transform)
+ :use-module (srfi srfi-13)
+ :export (stexi->shtml add-ref-resolver! urlify))
+
+;; The caller is responsible for carring the returned list.
+(define (arg-ref key %-args)
+ (and=> (assq key (cdr %-args)) (lambda (x) (stexi->shtml (cdr x)))))
+(define (arg-req key %-args)
+ (or (arg-ref key %-args)
+ (error "Missing argument:" key %-args)))
+(define (car* x) (and x (car x)))
+
+(define (urlify str)
+ (string-downcase
+ (string-map
+ (lambda (c)
+ (case c
+ ((#\space #\/ #\:) #\-)
+ (else c)))
+ str)))
+
+(define ref-resolvers
+ (list
+ (lambda (node-name manual-name) ;; the default
+ (urlify (string-append (or manual-name "") "#" node-name)))))
+
+(define (add-ref-resolver! proc)
+ "Add @var{proc} to the head of the list of ref-resolvers. @var{proc}
+will be expected to take the name of a node and the name of a manual and
+return the URL of the referent, or @code{#f} to pass control to the next
+ref-resolver in the list.
+
+The default ref-resolver will return the concatenation of the manual
+name, @code{#}, and the node name."
+ (set! ref-resolvers (cons proc ref-resolvers)))
+
+(define (resolve-ref node manual)
+ (or (or-map (lambda (x) (x node manual)) ref-resolvers)
+ (error "Could not resolve reference" node manual)))
+
+(define (ref tag args)
+ (let* ((node (car (arg-req 'node args)))
+ (section (or (car* (arg-ref 'section args)) node))
+ (manual (car* (arg-ref 'manual args)))
+ (target (resolve-ref node manual)))
+ `(span ,(and=> (assq tag '((xref "See ") (pxref "see "))) cdr)
+ (a (@ (href ,target)) ,section))))
+
+(define (uref tag args)
+ (let ((url (car (arg-req 'url args))))
+ `(a (@ (href ,url)) ,(or (car* (arg-ref 'title args)) url))))
+
+;; @!*&%( Mozilla gets confused at an empty ("<a .. />") a tag. Put an
+;; empty string here to placate the reptile.
+(define (node tag args)
+ `(a (@ (name ,(urlify (car (arg-req 'name args))))) ""))
+
+(define (def tag args . body)
+ (define (code x) (and x (cons 'code x)))
+ (define (var x) (and x (cons 'var x)))
+ (define (b x) (and x (cons 'b x)))
+ (define (list/spaces . elts)
+ (let lp ((in elts) (out '()))
+ (cond ((null? in) (reverse! out))
+ ((null? (car in)) (lp (cdr in) out))
+ (else (lp (cdr in)
+ (cons (car in)
+ (if (null? out) out (cons " " out))))))))
+ (define (left-td-contents)
+ (list/spaces (code (arg-ref 'data-type args))
+ (b (list (code (arg-ref 'class args)))) ;; is this right?
+ (b (list (code (arg-ref 'name args))))
+ (if (memq tag '(deftypeop deftypefn deftypefun))
+ (code (arg-ref 'arguments args))
+ (var (list (code (arg-ref 'arguments args)))))))
+
+ (let* ((category (case tag
+ ((defun) "Function")
+ ((defspec) "Special Form")
+ ((defvar) "Variable")
+ (else (car (arg-req 'category args))))))
+ `(div
+ (table
+ (@ (cellpadding "0") (cellspacing "0") (width "100%") (class "def"))
+ (tr (td ,@(left-td-contents))
+ (td (div (@ (class "right")) "[" ,category "]"))))
+ (div (@ (class "description")) ,@body))))
+
+(define (enumerate tag . elts)
+ (define (tonumber start)
+ (let ((c (string-ref start 0)))
+ (cond ((number? c) (string->number start))
+ (else (1+ (- (char->integer c)
+ (char->integer (if (char-upper-case? c) #\A #\a))))))))
+ `(ol ,@(if (and (pair? elts) (pair? (car elts)) (eq? (caar elts) '%))
+ (cons `(@ (start ,@(tonumber (arg-req 'start (car elts)))))
+ ;; (type ,(type (arg-ref 'start (car elts)))))
+ (cdr elts))
+ elts)))
+
+(define (table tag args . body)
+ (let ((formatter (caar (arg-req 'formatter args))))
+ (cons 'dl
+ (map (lambda (x)
+ (cond ((and (pair? x) (eq? (car x) 'dt))
+ (list (car x) (cons formatter (cdr x))))
+ (else x)))
+ (apply append body)))))
+
+(define (entry tag args . body)
+ `((dt ,@(arg-req 'heading args))
+ (dd ,@body)))
+
+(define tag-replacements
+ '((titlepage div (@ (class "titlepage")))
+ (title h2 (@ (class "title")))
+ (subtitle h3 (@ (class "subtitle")))
+ (author h3 (@ (class "author")))
+ (example pre)
+ (lisp pre)
+ (smallexample pre (@ (class "smaller")))
+ (smalllisp pre (@ (class "smaller")))
+ (cartouche div (@ (class "cartouche")))
+ (verbatim pre (@ (class "verbatim")))
+ (chapter h2)
+ (section h3)
+ (subsection h4)
+ (subsubsection h5)
+ (appendix h2)
+ (appendixsec h3)
+ (appendixsubsec h4)
+ (appendixsubsubsec h5)
+ (unnumbered h2)
+ (unnumberedsec h3)
+ (unnumberedsubsec h4)
+ (unnumberedsubsubsec h5)
+ (majorheading h2)
+ (chapheading h2)
+ (heading h3)
+ (subheading h4)
+ (subsubheading h5)
+ (quotation blockquote)
+ (itemize ul)
+ (item li) ;; itemx ?
+ (para p)
+ (*fragment* div) ;; should be ok
+
+ (asis span)
+ (bold b)
+ (sample samp)
+ (samp samp)
+ (code code)
+ (kbd kbd)
+ (key code (@ (class "key")))
+ (var var)
+ (env code (@ (class "env")))
+ (file code (@ (class "file")))
+ (command code (@ (class "command")))
+ (option code (@ (class "option")))
+ (url code (@ (class "url")))
+ (dfn dfn)
+ (cite cite)
+ (acro acronym)
+ (email code (@ (class "email")))
+ (emph em)
+ (strong strong)
+ (sc span (@ (class "small-caps")))))
+
+(define ignore-list
+ '(page setfilename setchapternewpage iftex ifinfo ifplaintext ifxml sp vskip
+ menu ignore syncodeindex comment c dircategory direntry top shortcontents
+ cindex printindex))
+(define (ignored? tag)
+ (memq tag ignore-list))
+
+(define rules
+ `((% *preorder* . ,(lambda args args)) ;; Keep these around...
+ (texinfo . ,(lambda (tag args . body)
+ (pre-post-order
+ `(html
+ (@ (xmlns "http://www.w3.org/1999/xhtml"))
+ (head (title ,(car (arg-req 'title args))))
+ (body ,@body))
+ `((% *preorder* . ,(lambda args #f)) ;; ... filter out.
+ (*text* . ,(lambda (tag x) x))
+ (*default* . ,(lambda (tag . body)
+ (cons tag body)))))))
+ (copyright . ,(lambda args '(*ENTITY* "copy")))
+ (result . ,(lambda args '(*ENTITY* "rArr")))
+ (xref . ,ref) (ref . ,ref) (pxref . ,ref)
+ (uref . ,uref)
+ (node . ,node) (anchor . ,node)
+ (table . ,table)
+ (enumerate . ,enumerate)
+ (entry . ,entry)
+
+ (deftp . ,def) (defcv . ,def) (defivar . ,def) (deftypeivar . ,def)
+ (defop . ,def) (deftypeop . ,def) (defmethod . ,def)
+ (deftypemethod . ,def) (defopt . ,def) (defvr . ,def) (defvar . ,def)
+ (deftypevr . ,def) (deftypevar . ,def) (deffn . ,def)
+ (deftypefn . ,def) (defmac . ,def) (defspec . ,def) (defun . ,def)
+ (deftypefun . ,def)
+ (ifnottex . ,(lambda (tag . body) body))
+ (*text* . ,(lambda (tag x) x))
+ (*default* . ,(lambda (tag . body)
+ (let ((subst (assq tag tag-replacements)))
+ (cond
+ (subst (append (cdr subst) body))
+ ((memq tag ignore-list) #f)
+ (else
+ (warn "Don't know how to convert" tag "to HTML")
+ body)))))))
+
+(define (stexi->shtml tree)
+ "Transform the stexi @var{tree} into shtml, resolving references via
+ref-resolvers. See the module commentary for more details."
+ (pre-post-order tree rules))
+
+;;; arch-tag: ab05f3fe-9981-4a78-b64c-48efcd9983a6
diff --git a/module/texinfo/indexing.scm b/module/texinfo/indexing.scm
new file mode 100644
index 000000000..bc3d7ab74
--- /dev/null
+++ b/module/texinfo/indexing.scm
@@ -0,0 +1,78 @@
+;;;; (texinfo indexing) -- indexing stexinfo
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;@c texinfo formatting
+;;Given a piece of stexi, return an index of a specified variety.
+;;
+;;Note that currently, @code{stexi-extract-index} doesn't differentiate
+;;between different kinds of index entries. That's a bug ;)
+;;; Code:
+
+(define-module (texinfo indexing)
+ #:use-module (sxml simple)
+ #:use-module (srfi srfi-13)
+ #:export (stexi-extract-index))
+
+(define (def-name def)
+ (cadr (assq 'name (cdadr def))))
+
+(define defines
+ '(deftp defcv defivar deftypeivar defop deftypeop defmethod
+ deftypemethod defopt defvr defvar deftypevr deftypevar deffn
+ deftypefn defspec defmac defun deftypefun))
+
+(define indices
+ '(cindex findex vindex kindex pindex tindex))
+
+(define (stexi-extract-index tree manual-name kind)
+ "Given an stexi tree @var{tree}, index all of the entries of type
+@var{kind}. @var{kind} can be one of the predefined texinfo indices
+(@code{concept}, @code{variable}, @code{function}, @code{key},
+@code{program}, @code{type}) or one of the special symbols @code{auto}
+or @code{all}. @code{auto} will scan the stext for a @code{(printindex)}
+statement, and @code{all} will generate an index from all entries,
+regardless of type.
+
+The returned index is a list of pairs, the @sc{car} of which is the
+entry (a string) and the @sc{cdr} of which is a node name (a string)."
+ (let loop ((in tree) (entries '()))
+ (cond
+ ((null? in)
+ entries)
+ ((pair? (car in))
+ (cond
+ ((and (pair? (cdr in)) (pair? (cadr in))
+ (eq? (caar in) 'anchor) (memq (caadr in) defines))
+ (loop (cddr in) (acons (cadr (assq 'name (cdr (cadadr in))))
+ (cadr (assq 'name (cdadar in)))
+ entries)))
+ ((and (pair? (cdr in)) (pair? (cadr in))
+ (eq? (caar in) 'anchor) (memq (caadr in) indices))
+ (loop (cddr in) (acons (sxml->string (cadr in))
+ (cadr (assq 'name (cdadar in)))
+ entries)))
+ (else
+ (loop (cdr in) (loop (car in) entries)))))
+ (else
+ (loop (cdr in) entries)))))
+
+;;; arch-tag: 216d29d3-1ed9-433f-9c19-0dc4d6b439b6
diff --git a/module/texinfo/plain-text.scm b/module/texinfo/plain-text.scm
new file mode 100644
index 000000000..dfa4c9acc
--- /dev/null
+++ b/module/texinfo/plain-text.scm
@@ -0,0 +1,319 @@
+;;;; (texinfo plain-text) -- rendering stexinfo as plain text
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;Transformation from stexi to plain-text. Strives to re-create the
+;;output from @code{info}; comes pretty damn close.
+;;
+;;; Code:
+
+(define-module (texinfo plain-text)
+ #:use-module (texinfo)
+ #:use-module (texinfo string-utils)
+ #:use-module (sxml transform)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-13)
+ #:export (stexi->plain-text))
+
+;; The return value is a string.
+(define (arg-ref key %-args)
+ (and=> (and=> (assq key (cdr %-args)) cdr)
+ stexi->plain-text))
+(define (arg-req key %-args)
+ (or (arg-ref key %-args)
+ (error "Missing argument:" key %-args)))
+
+(define *indent* (make-fluid))
+(define *itemizer* (make-fluid))
+
+(define (make-ticker str)
+ (lambda () str))
+(define (make-enumerator n)
+ (lambda ()
+ (let ((last n))
+ (set! n (1+ n))
+ (format #f "~A. " last))))
+
+(fluid-set! *indent* "")
+;; Shouldn't be necessary to do this, but just in case.
+(fluid-set! *itemizer* (make-ticker "* "))
+
+(define-macro (with-indent n . body)
+ `(with-fluids ((*indent* (string-append (fluid-ref *indent*)
+ (make-string ,n #\space))))
+ ,@body))
+
+(define (make-indenter n proc)
+ (lambda args (with-indent n (apply proc args))))
+
+(define (string-indent str)
+ (string-append (fluid-ref *indent*) str "\n"))
+
+(define-macro (with-itemizer itemizer . body)
+ `(with-fluids ((*itemizer* ,itemizer))
+ ,@body))
+
+(define (wrap* . strings)
+ (let ((indent (fluid-ref *indent*)))
+ (fill-string (string-concatenate strings)
+ #:line-width 72 #:initial-indent indent
+ #:subsequent-indent indent)))
+(define (wrap . strings)
+ (string-append (apply wrap* strings) "\n\n"))
+(define (wrap-heading . strings)
+ (string-append (apply wrap* strings) "\n"))
+
+(define (ref tag args)
+ (let* ((node (arg-req 'node args))
+ (name (or (arg-ref 'name args) node))
+ (manual (arg-ref 'manual args)))
+ (string-concatenate
+ (cons*
+ (or (and=> (assq tag '((xref "See ") (pxref "see "))) cadr) "")
+ name
+ (if manual `(" in manual " ,manual) '())))))
+
+(define (uref tag args)
+ (let ((url (arg-req 'url args))
+ (title (arg-ref 'title args)))
+ (if title
+ (string-append title " (" url ")")
+ (string-append "`" url "'"))))
+
+(define (def tag args . body)
+ (define (list/spaces . elts)
+ (let lp ((in elts) (out '()))
+ (cond ((null? in) (reverse! out))
+ ((null? (car in)) (lp (cdr in) out))
+ (else (lp (cdr in)
+ (cons (car in)
+ (if (null? out) out (cons " " out))))))))
+ (define (first-line)
+ (string-join
+ (filter identity
+ (map (lambda (x) (arg-ref x args))
+ '(data-type class name arguments)))
+ " "))
+
+ (let* ((category (case tag
+ ((defun) "Function")
+ ((defspec) "Special Form")
+ ((defvar) "Variable")
+ (else (arg-req 'category args)))))
+ (string-append
+ (wrap-heading (string-append " - " category ": " (first-line)))
+ (with-indent 5 (stexi->plain-text body)))))
+
+(define (enumerate tag . elts)
+ (define (tonumber start)
+ (let ((c (string-ref start 0)))
+ (cond ((number? c) (string->number start))
+ (else (1+ (- (char->integer c)
+ (char->integer (if (char-upper-case? c) #\A #\a))))))))
+ (let* ((args? (and (pair? elts) (pair? (car elts))
+ (eq? (caar elts) '%)))
+ (start (and args? (arg-ref 'start (car elts)))))
+ (with-itemizer (make-enumerator (if start (tonumber start) 1))
+ (with-indent 5
+ (stexi->plain-text (if start (cdr elts) elts))))))
+
+(define (itemize tag args . elts)
+ (with-itemizer (make-ticker "* ")
+ (with-indent 5
+ (stexi->plain-text elts))))
+
+(define (item tag . elts)
+ (let* ((ret (stexi->plain-text elts))
+ (tick ((fluid-ref *itemizer*)))
+ (tick-pos (- (string-length (fluid-ref *indent*))
+ (string-length tick))))
+ (if (and (not (string-null? ret)) (not (negative? tick-pos)))
+ (string-copy! ret tick-pos tick))
+ ret))
+
+(define (table tag args . body)
+ (stexi->plain-text body))
+
+(define (entry tag args . body)
+ (let ((heading (wrap-heading
+ (stexi->plain-text (arg-req 'heading args)))))
+ (string-append heading
+ (with-indent 5 (stexi->plain-text body)))))
+
+(define (make-underliner char)
+ (lambda (tag . body)
+ (let ((str (stexi->plain-text body)))
+ (string-append
+ "\n"
+ (string-indent str)
+ (string-indent (make-string (string-length str) char))
+ "\n"))))
+
+(define chapter (make-underliner #\*))
+(define section (make-underliner #\=))
+(define subsection (make-underliner #\-))
+(define subsubsection (make-underliner #\.))
+
+(define (example tag . body)
+ (let ((ret (stexi->plain-text body)))
+ (string-append
+ (string-concatenate
+ (with-indent 5 (map string-indent (string-split ret #\newline))))
+ "\n")))
+
+(define (verbatim tag . body)
+ (let ((ret (stexi->plain-text body)))
+ (string-append
+ (string-concatenate
+ (map string-indent (string-split ret #\newline)))
+ "\n")))
+
+(define (fragment tag . body)
+ (string-concatenate (map-in-order stexi->plain-text body)))
+
+(define (para tag . body)
+ (wrap (stexi->plain-text body)))
+
+(define (make-surrounder str)
+ (lambda (tag . body)
+ (string-append str (stexi->plain-text body) str)))
+
+(define (code tag . body)
+ (string-append "`" (stexi->plain-text body) "'"))
+
+(define (key tag . body)
+ (string-append "<" (stexi->plain-text body) ">"))
+
+(define (var tag . body)
+ (string-upcase (stexi->plain-text body)))
+
+(define (passthrough tag . body)
+ (stexi->plain-text body))
+
+(define (ignore . args)
+ "")
+
+(define (texinfo tag args . body)
+ (let ((title (chapter 'foo (arg-req 'title args))))
+ (string-append title (stexi->plain-text body))))
+
+(define ignore-list
+ '(page setfilename setchapternewpage iftex ifinfo ifplaintext ifxml sp vskip
+ menu ignore syncodeindex comment c % node anchor))
+(define (ignored? tag)
+ (memq tag ignore-list))
+
+(define tag-handlers
+ `((title ,chapter)
+ (chapter ,chapter)
+ (section ,section)
+ (subsection ,subsection)
+ (subsubsection ,subsubsection)
+ (appendix ,chapter)
+ (appendixsec ,section)
+ (appendixsubsec ,subsection)
+ (appendixsubsubsec ,subsubsection)
+ (unnumbered ,chapter)
+ (unnumberedsec ,section)
+ (unnumberedsubsec ,subsection)
+ (unnumberedsubsubsec ,subsubsection)
+ (majorheading ,chapter)
+ (chapheading ,chapter)
+ (heading ,section)
+ (subheading ,subsection)
+ (subsubheading ,subsubsection)
+
+ (strong ,(make-surrounder "*"))
+ (sample ,code)
+ (samp ,code)
+ (code ,code)
+ (kbd ,code)
+ (key ,key)
+ (var ,var)
+ (env ,code)
+ (file ,code)
+ (command ,code)
+ (option ,code)
+ (url ,code)
+ (dfn ,(make-surrounder "\""))
+ (cite ,(make-surrounder "\""))
+ (acro ,passthrough)
+ (email ,key)
+ (emph ,(make-surrounder "_"))
+ (sc ,var)
+ (copyright ,(lambda args "(C)"))
+ (result ,(lambda args "==>"))
+ (xref ,ref)
+ (ref ,ref)
+ (pxref ,ref)
+ (uref ,uref)
+
+ (texinfo ,texinfo)
+ (quotation ,(make-indenter 5 para))
+ (itemize ,itemize)
+ (enumerate ,enumerate)
+ (item ,item)
+ (table ,table)
+ (entry ,entry)
+ (example ,example)
+ (lisp ,example)
+ (smallexample ,example)
+ (smalllisp ,example)
+ (verbatim ,verbatim)
+ (*fragment* ,fragment)
+
+ (deftp ,def)
+ (defcv ,def)
+ (defivar ,def)
+ (deftypeivar ,def)
+ (defop ,def)
+ (deftypeop ,def)
+ (defmethod ,def)
+ (deftypemethod ,def)
+ (defopt ,def)
+ (defvr ,def)
+ (defvar ,def)
+ (deftypevr ,def)
+ (deftypevar ,def)
+ (deffn ,def)
+ (deftypefn ,def)
+ (defmac ,def)
+ (defspec ,def)
+ (defun ,def)
+ (deftypefun ,def)))
+
+(define (stexi->plain-text tree)
+ "Transform @var{tree} into plain text. Returns a string."
+ (cond
+ ((null? tree) "")
+ ((string? tree) tree)
+ ((pair? tree)
+ (cond
+ ((symbol? (car tree))
+ (let ((handler (and (not (ignored? (car tree)))
+ (or (and=> (assq (car tree) tag-handlers) cadr)
+ para))))
+ (if handler (apply handler tree) "")))
+ (else
+ (string-concatenate (map-in-order stexi->plain-text tree)))))
+ (else "")))
+
+;;; arch-tag: f966c3f6-3b46-4790-bbf9-3ad27e4917c2
diff --git a/module/texinfo/reflection.scm b/module/texinfo/reflection.scm
new file mode 100644
index 000000000..bda1830ea
--- /dev/null
+++ b/module/texinfo/reflection.scm
@@ -0,0 +1,528 @@
+;;;; (texinfo reflection) -- documenting Scheme as stexinfo
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;Routines to generare @code{stexi} documentation for objects and
+;;modules.
+;;
+;;Note that in this context, an @dfn{object} is just a value associated
+;;with a location. It has nothing to do with GOOPS.
+;;
+;;; Code:
+
+(define-module (texinfo reflection)
+ #:use-module ((srfi srfi-1) #:select (append-map))
+ #:use-module (oop goops)
+ #:use-module (texinfo)
+ #:use-module (texinfo plain-text)
+ #:use-module (srfi srfi-13)
+ #:use-module (ice-9 session)
+ #:use-module (ice-9 documentation)
+ #:use-module (ice-9 optargs)
+ #:use-module ((sxml transform) #:select (pre-post-order))
+ #:export (module-stexi-documentation
+ script-stexi-documentation
+ object-stexi-documentation
+ package-stexi-standard-copying
+ package-stexi-standard-titlepage
+ package-stexi-generic-menu
+ package-stexi-standard-menu
+ package-stexi-extended-menu
+ package-stexi-standard-prologue
+ package-stexi-documentation))
+
+;; List for sorting the definitions in a module
+(define defs
+ '(deftp defcv defivar deftypeivar defop deftypeop defmethod
+ deftypemethod defopt defvr defvar deftypevr deftypevar deffn
+ deftypefn defmac defspec defun deftypefun))
+
+(define (sort-defs ordering a b)
+ (define (def x)
+ ;; a and b are lists of the form ((anchor ...) (def* ...)...)
+ (cadr x))
+ (define (name x)
+ (cadr (assq 'name (cdadr (def x)))))
+ (define (priority x)
+ (list-index defs (car (def x))))
+ (define (order x)
+ (or (list-index ordering (string->symbol (name x)))
+ ;; if the def is not in the list, a big number
+ 1234567890))
+ (define (compare-in-order proc eq? < . args)
+ (if (not (eq? (proc a) (proc b)))
+ (< (proc a) (proc b))
+ (or (null? args)
+ (apply compare-in-order args))))
+ (compare-in-order order = <
+ priority = <
+ name string=? string<=?))
+
+(define (list*-join l infix restfix)
+ (let lp ((in l) (out '()))
+ (cond ((null? in) (reverse! out))
+ ((symbol? in) (reverse! (cons* in restfix out)))
+ (else (lp (cdr in) (if (null? out)
+ (list (car in))
+ (cons* (car in) infix out)))))))
+
+(define (process-args args)
+ (map (lambda (x) (if (symbol? x) (symbol->string x) x))
+ (list*-join (or args '())
+ " " " . ")))
+
+(define (get-proc-args proc)
+ (cond
+ ((procedure-property proc 'arglist)
+ => (lambda (arglist)
+ (let ((required-args (car arglist))
+ (optional-args (cadr arglist))
+ (keyword-args (caddr arglist))
+ (rest-arg (car (cddddr arglist))))
+ (process-args
+ (append
+ ;; start with the required args...
+ (map symbol->string required-args)
+
+ ;; add any optional args if needed...
+ (map (lambda (a)
+ (if (list? a)
+ (format #f "[~a = ~s]" (car a) (cadr a))
+ (format #f "[~a]" a)))
+ optional-args)
+
+ ;; now the keyword args..
+ (map (lambda (a)
+ (if (list? a)
+ (format #f "[#:~a = ~s]" (car a) (cadr a))
+ (format #f "[#:~a]" a)))
+ keyword-args)
+
+ ;; now the rest arg...
+ (if rest-arg
+ (list "." (symbol->string rest-arg))
+ '()))))))
+ (else
+ (process-args (and=> (procedure-source proc) cadr)))))
+
+;; like the normal false-if-exception, but doesn't affect the-last-stack
+(define-macro (false-if-exception exp)
+ `(catch #t
+ (lambda ()
+ (with-fluids ((the-last-stack (fluid-ref the-last-stack)))
+ ,exp))
+ (lambda args #f)))
+
+;; This is really nasty, I wish guile gave a better way to get this...
+(define (get-macro-args macro)
+ (process-args
+ (case (macro-type macro)
+ ((syncase-macro)
+ (case (syncase-macro-type macro)
+ ((macro)
+ (get-proc-args (car (syncase-macro-binding macro))))
+ (else #f)))
+ (else #f))))
+
+(define many-space? (make-regexp "[[:space:]][[:space:]][[:space:]]"))
+(define initial-space? (make-regexp "^[[:space:]]"))
+(define (string->stexi str)
+ (or (and (or (not str) (string-null? str))
+ '(*fragment*))
+ (and (or (string-index str #\@)
+ (and (not (regexp-exec many-space? str))
+ (not (regexp-exec initial-space? str))))
+ (false-if-exception
+ (texi-fragment->stexi str)))
+ `(*fragment* (verbatim ,str))))
+
+(define method-formals
+ (and (defined? 'method-formals) method-formals))
+
+(define (method-stexi-arguments method)
+ (cond
+ (method-formals
+ (let lp ((formals (method-formals method))
+ (specializers (method-specializers method))
+ (out '()))
+ (define (arg-texinfo formal specializer)
+ `(" (" (var ,(symbol->string formal)) " "
+ (code ,(symbol->string (class-name specializer))) ")"))
+ (cond
+ ((null? formals) (reverse out))
+ ((pair? formals)
+ (lp (cdr formals) (cdr specializers)
+ (append (reverse (arg-texinfo (car formals) (car specializers)))
+ out)))
+ (else
+ (append (reverse out) (arg-texinfo formals specializers)
+ (list "..."))))))
+ ((method-source method)
+ (let lp ((bindings (cadr (method-source method))) (out '()))
+ (define (arg-texinfo arg)
+ `(" (" (var ,(symbol->string (car arg))) " "
+ (code ,(symbol->string (cadr arg))) ")"))
+ (cond
+ ((null? bindings)
+ (reverse out))
+ ((not (pair? (car bindings)))
+ (append (reverse out) (arg-texinfo bindings) (list "...")))
+ (else
+ (lp (cdr bindings)
+ (append (reverse (arg-texinfo (car bindings))) out))))))
+ (else (warn method) '())))
+
+(define* (object-stexi-documentation object #:optional (name "[unknown]")
+ #:key (force #f))
+ (if (symbol? name)
+ (set! name (symbol->string name)))
+ (let ((stexi ((lambda (x)
+ (cond ((string? x) (string->stexi x))
+ ((and (pair? x) (eq? (car x) '*fragment*)) x)
+ (force `(*fragment*))
+ (else #f)))
+ (object-documentation
+ (if (is-a? object <method>)
+ (method-procedure object)
+ object)))))
+ (define (make-def type args)
+ `(,type (% ,@args) ,@(cdr stexi)))
+ (cond
+ ((not stexi) #f)
+ ;; stexi is now a list, headed by *fragment*.
+ ((and (pair? (cdr stexi)) (pair? (cadr stexi))
+ (memq (caadr stexi) defs))
+ ;; it's already a deffoo.
+ stexi)
+ ((is-a? object <class>)
+ (make-def 'deftp `((name ,name)
+ (category "Class"))))
+ ((is-a? object <macro>)
+ (make-def 'defspec `((name ,name)
+ (arguments ,@(get-macro-args object)))))
+ ((is-a? object <procedure>)
+ (make-def 'defun `((name ,name)
+ (arguments ,@(get-proc-args object)))))
+ ((is-a? object <method>)
+ (make-def 'deffn `((category "Method")
+ (name ,name)
+ (arguments ,@(method-stexi-arguments object)))))
+ ((is-a? object <generic>)
+ `(*fragment*
+ ,(make-def 'deffn `((name ,name)
+ (category "Generic")))
+ ,@(map
+ (lambda (method)
+ (object-stexi-documentation method name #:force force))
+ (generic-function-methods object))))
+ (else
+ (make-def 'defvar `((name ,name)))))))
+
+(define (module-name->node-name sym-name)
+ (string-join (map symbol->string sym-name) " "))
+
+;; this copied from (ice-9 session); need to find a better way
+(define (module-filename name)
+ (let* ((name (map symbol->string name))
+ (reverse-name (reverse name))
+ (leaf (car reverse-name))
+ (dir-hint-module-name (reverse (cdr reverse-name)))
+ (dir-hint (apply string-append
+ (map (lambda (elt)
+ (string-append elt "/"))
+ dir-hint-module-name))))
+ (%search-load-path (in-vicinity dir-hint leaf))))
+
+(define (read-module name)
+ (let ((filename (module-filename name)))
+ (if filename
+ (let ((port (open-input-file filename)))
+ (let lp ((out '()) (form (read port)))
+ (if (eof-object? form)
+ (reverse out)
+ (lp (cons form out) (read port)))))
+ '())))
+
+(define (module-export-list sym-name)
+ (define (module-form-export-list form)
+ (and (pair? form)
+ (eq? (car form) 'define-module)
+ (equal? (cadr form) sym-name)
+ (and=> (memq #:export (cddr form)) cadr)))
+ (let lp ((forms (read-module sym-name)))
+ (cond ((null? forms) '())
+ ((module-form-export-list (car forms)) => identity)
+ (else (lp (cdr forms))))))
+
+(define* (module-stexi-documentation sym-name
+ #:optional (docs-resolver
+ (lambda (name def) def)))
+ "Return documentation for the module named @var{sym-name}. The
+documentation will be formatted as @code{stexi}
+ (@pxref{texinfo,texinfo})."
+ (let* ((commentary (and=> (module-commentary sym-name)
+ (lambda (x) (string-trim-both x #\newline))))
+ (stexi (string->stexi commentary))
+ (node-name (module-name->node-name sym-name))
+ (name-str (with-output-to-string
+ (lambda () (display sym-name))))
+ (module (resolve-interface sym-name))
+ (export-list (module-export-list sym-name)))
+ (define (anchor-name sym)
+ (string-append node-name " " (symbol->string sym)))
+ (define (make-defs)
+ (sort!
+ (module-map
+ (lambda (sym var)
+ `((anchor (% (name ,(anchor-name sym))))
+ ,@((lambda (x)
+ (if (eq? (car x) '*fragment*)
+ (cdr x)
+ (list x)))
+ (if (variable-bound? var)
+ (docs-resolver
+ sym
+ (object-stexi-documentation (variable-ref var) sym
+ #:force #t))
+ (begin
+ (warn "variable unbound!" sym)
+ `(defvar (% (name ,(symbol->string sym)))
+ "[unbound!]"))))))
+ module)
+ (lambda (a b) (sort-defs export-list a b))))
+
+ `(texinfo (% (title ,name-str))
+ (node (% (name ,node-name)))
+ (section "Overview")
+ ,@(cdr stexi)
+ (section "Usage")
+ ,@(apply append! (make-defs)))))
+
+(define (script-stexi-documentation scriptpath)
+ "Return documentation for given script. The documentation will be
+taken from the script's commentary, and will be returned in the
+@code{stexi} format (@pxref{texinfo,texinfo})."
+ (let ((commentary (file-commentary scriptpath)))
+ `(texinfo (% (title ,(basename scriptpath)))
+ (node (% (name ,(basename scriptpath))))
+ ,@(if commentary
+ (cdr
+ (string->stexi
+ (string-trim-both commentary #\newline)))
+ '()))))
+
+(cond
+ ((defined? 'add-value-help-handler!)
+ (add-value-help-handler!
+ (lambda (name value)
+ (stexi->plain-text
+ (object-stexi-documentation value name #:force #t))))
+ (add-name-help-handler!
+ (lambda (name)
+ (and (list? name)
+ (and-map symbol? name)
+ (stexi->plain-text (module-stexi-documentation name)))))))
+
+;; we could be dealing with an old (ice-9 session); fondle it to get
+;; module-commentary
+(define module-commentary (@@ (ice-9 session) module-commentary))
+
+(define (package-stexi-standard-copying name version updated years
+ copyright-holder permissions)
+ "Create a standard texinfo @code{copying} section.
+
+@var{years} is a list of years (as integers) in which the modules
+being documented were released. All other arguments are strings."
+ `(copying
+ (para "This manual is for " ,name
+ " (version " ,version ", updated " ,updated ")")
+ (para "Copyright " ,(string-join (map number->string years) ",")
+ " " ,copyright-holder)
+ (quotation
+ (para ,permissions))))
+
+(define (package-stexi-standard-titlepage name version updated authors)
+ "Create a standard GNU title page.
+
+@var{authors} is a list of @code{(@var{name} . @var{email})}
+pairs. All other arguments are strings.
+
+Here is an example of the usage of this procedure:
+
+@smallexample
+ (package-stexi-standard-titlepage
+ \"Foolib\"
+ \"3.2\"
+ \"26 September 2006\"
+ '((\"Alyssa P Hacker\" . \"alyssa@@example.com\"))
+ '(2004 2005 2006)
+ \"Free Software Foundation, Inc.\"
+ \"Standard GPL permissions blurb goes here\")
+@end smallexample
+"
+ `(;(setchapternewpage (% (all "odd"))) makes manuals too long
+ (titlepage
+ (title ,name)
+ (subtitle "version " ,version ", updated " ,updated)
+ ,@(map (lambda (pair)
+ `(author ,(car pair)
+ " (" (email ,(cdr pair)) ")"))
+ authors)
+ (page)
+ (vskip (% (all "0pt plus 1filll")))
+ (insertcopying))))
+
+(define (package-stexi-generic-menu name entries)
+ "Create a menu from a generic alist of entries, the car of which
+should be the node name, and the cdr the description. As an exception,
+an entry of @code{#f} will produce a separator."
+ (define (make-entry node description)
+ `("* " ,node "::"
+ ,(make-string (max (- 21 (string-length node)) 2) #\space)
+ ,@description "\n"))
+ `((ifnottex
+ (node (% (name "Top")))
+ (top (% (title ,name)))
+ (insertcopying)
+ (menu
+ ,@(apply
+ append
+ (map
+ (lambda (entry)
+ (if entry
+ (make-entry (car entry) (cdr entry))
+ '("\n")))
+ entries))))
+ (iftex
+ (shortcontents))))
+
+
+(define (package-stexi-standard-menu name modules module-descriptions
+ extra-entries)
+ "Create a standard top node and menu, suitable for processing
+by makeinfo."
+ (package-stexi-generic-menu
+ name
+ (let ((module-entries (map cons
+ (map module-name->node-name modules)
+ module-descriptions))
+ (separate-sections (lambda (x) (if (null? x) x (cons #f x)))))
+ `(,@module-entries
+ ,@(separate-sections extra-entries)))))
+
+(define (package-stexi-extended-menu name module-pairs script-pairs
+ extra-entries)
+ "Create an \"extended\" menu, like the standard menu but with a
+section for scripts."
+ (package-stexi-generic-menu
+ name
+ (let ((module-entries (map cons
+ (map module-name->node-name
+ (map car module-pairs))
+ (map cdr module-pairs)))
+ (script-entries (map cons
+ (map basename (map car script-pairs))
+ (map cdr script-pairs)))
+ (separate-sections (lambda (x) (if (null? x) x (cons #f x)))))
+ `(,@module-entries
+ ,@(separate-sections script-entries)
+ ,@(separate-sections extra-entries)))))
+
+(define (package-stexi-standard-prologue name filename category
+ description copying titlepage
+ menu)
+ "Create a standard prologue, suitable for later serialization
+to texinfo and .info creation with makeinfo.
+
+Returns a list of stexinfo forms suitable for passing to
+@code{package-stexi-documentation} as the prologue. @xref{texinfo
+reflection package-stexi-documentation}, @ref{texinfo reflection
+package-stexi-standard-titlepage,package-stexi-standard-titlepage},
+@ref{texinfo reflection
+package-stexi-standard-copying,package-stexi-standard-copying},
+and @ref{texinfo reflection
+package-stexi-standard-menu,package-stexi-standard-menu}."
+ `(,copying
+ (dircategory (% (category ,category)))
+ (direntry
+ "* " ,name ": (" ,filename "). " ,description ".")
+ ,@titlepage
+ ,@menu))
+
+(define (stexi->chapter stexi)
+ (pre-post-order
+ stexi
+ `((texinfo . ,(lambda (tag attrs node . body)
+ `(,node
+ (chapter ,@(assq-ref (cdr attrs) 'title))
+ ,@body)))
+ (*text* . ,(lambda (tag text) text))
+ (*default* . ,(lambda args args)))))
+
+(define* (package-stexi-documentation modules name filename
+ prologue epilogue
+ #:key
+ (module-stexi-documentation-args
+ '())
+ (scripts '()))
+ "Create stexi documentation for a @dfn{package}, where a
+package is a set of modules that is released together.
+
+@var{modules} is expected to be a list of module names, where a
+module name is a list of symbols. The stexi that is returned will
+be titled @var{name} and a texinfo filename of @var{filename}.
+
+@var{prologue} and @var{epilogue} are lists of stexi forms that
+will be spliced into the output document before and after the
+generated modules documentation, respectively.
+@xref{texinfo reflection package-stexi-standard-prologue}, to
+create a conventional GNU texinfo prologue.
+
+@var{module-stexi-documentation-args} is an optional argument that, if
+given, will be added to the argument list when
+@code{module-texi-documentation} is called. For example, it might be
+useful to define a @code{#:docs-resolver} argument."
+ (define (verify-modules-list l)
+ (define (all pred l)
+ (and (pred (car l))
+ (or (null? (cdr l)) (all pred (cdr l)))))
+ (false-if-exception
+ (all (lambda (x) (all symbol? x)) modules)))
+ (if (not (verify-modules-list modules))
+ (error "expected modules to be a list of a list of symbols"
+ modules))
+
+ `(texinfo
+ (% (title ,name)
+ (filename ,filename))
+ ,@prologue
+ ,@(append-map (lambda (mod)
+ (stexi->chapter
+ (apply module-stexi-documentation
+ mod module-stexi-documentation-args)))
+ modules)
+ ,@(append-map (lambda (script)
+ (stexi->chapter
+ (script-stexi-documentation script)))
+ scripts)
+ ,@epilogue))
+
+;;; arch-tag: bbe2bc03-e16d-4a9e-87b9-55225dc9836c
diff --git a/module/texinfo/serialize.scm b/module/texinfo/serialize.scm
new file mode 100644
index 000000000..6a32d2346
--- /dev/null
+++ b/module/texinfo/serialize.scm
@@ -0,0 +1,263 @@
+;;;; (texinfo serialize) -- rendering stexinfo as texinfo
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com>
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;;
+;;Serialization of @code{stexi} to plain texinfo.
+;;
+;;; Code:
+
+(define-module (texinfo serialize)
+ #:use-module (texinfo)
+ #:use-module (texinfo string-utils)
+ #:use-module (sxml transform)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-13)
+ #:export (stexi->texi))
+
+(define (list-intersperse src-l elem)
+ (if (null? src-l) src-l
+ (let loop ((l (cdr src-l)) (dest (cons (car src-l) '())))
+ (if (null? l) (reverse dest)
+ (loop (cdr l) (cons (car l) (cons elem dest)))))))
+
+;; converts improper lists to proper lists.
+(define (filter* pred l)
+ (let lp ((in l) (out '()))
+ (cond ((null? in)
+ (reverse! out))
+ ((pair? in)
+ (lp (cdr in) (if (pred (car in)) (cons (car in) out) out)))
+ (else
+ (lp '() (if (pred in) (cons in out) out))))))
+
+;; (list* 'a '(b c) 'd '(e f g)) => '(a b c d e f g)
+(define (list* . args)
+ (let* ((args (reverse args))
+ (tail (car args)))
+ (let lp ((in (cdr args)) (out tail))
+ (cond ((null? in) out)
+ ((pair? (car in)) (lp (cdr in) (append (car in) out)))
+ ((null? (car in)) (lp (cdr in) out))
+ (else (lp (cdr in) (cons (car in) out)))))))
+
+;; Why? Well, because syntax-case defines `include', and carps about its
+;; wrong usage below...
+(eval-when (eval load compile)
+ (define (include exp lp command type formals args accum)
+ (list* "\n"
+ (list-intersperse
+ args
+ " ")
+ " " command "@" accum)))
+
+(define (empty-command exp lp command type formals args accum)
+ (list* " " command "@" accum))
+
+(define (inline-text exp lp command type formals args accum)
+ (if (not (string=? command "*braces*")) ;; fixme :(
+ (list* "}"
+ (append-map (lambda (x) (lp x '())) (reverse (cdr exp)))
+ "{" command "@" accum)
+ (list* "@}"
+ (append-map (lambda (x) (lp x '())) (reverse (cdr exp)))
+ "@{" accum)))
+
+(define (inline-args exp lp command type formals args accum)
+ (list* "}"
+ (if (not args) ""
+ (list-intersperse
+ (map
+ (lambda (x)
+ (cond ((not x) "")
+ ((pair? x)
+ (if (pair? (cdr x))
+ (warn "Strange inline-args!" args))
+ (car x))
+ (else (error "Invalid inline-args" args))))
+ (drop-while not
+ (map (lambda (x) (assq-ref args x))
+ (reverse formals))))
+ ","))
+ "{" command "@" accum))
+
+(define (serialize-text-args lp formals args)
+ (apply
+ append
+ (list-intersperse
+ (map (lambda (arg) (append-map (lambda (x) (lp x '())) arg))
+ (map
+ reverse
+ (drop-while
+ not (map (lambda (x) (assq-ref args x))
+ (reverse formals)))))
+ '(" "))))
+
+(define (eol-text-args exp lp command type formals args accum)
+ (list* "\n"
+ (serialize-text-args lp formals args)
+ " " command "@" accum))
+
+(define (eol-text exp lp command type formals args accum)
+ (list* "\n"
+ (append-map (lambda (x) (lp x '()))
+ (reverse (if args (cddr exp) (cdr exp))))
+ " " command "@" accum))
+
+(define (eol-args exp lp command type formals args accum)
+ (list* "\n"
+ (list-intersperse
+ (apply append
+ (drop-while not
+ (map (lambda (x) (assq-ref args x))
+ (reverse formals))))
+ ", ")
+ " " command "@" accum))
+
+(define (environ exp lp command type formals args accum)
+ (case (car exp)
+ ((texinfo)
+ (list* "@bye\n"
+ (append-map (lambda (x) (lp x '())) (reverse (cddr exp)))
+ "\n@c %**end of header\n\n"
+ (reverse (assq-ref args 'title)) "@settitle "
+ (or (and=> (assq-ref args 'filename)
+ (lambda (filename)
+ (cons "\n" (reverse (cons "@setfilename " filename)))))
+ "")
+ "\\input texinfo @c -*-texinfo-*-\n@c %**start of header\n"
+ accum))
+ (else
+ (list* "\n\n" command "@end "
+ (let ((body (append-map (lambda (x) (lp x '()))
+ (reverse (if args (cddr exp) (cdr exp))))))
+ (if (or (null? body)
+ (eqv? (string-ref (car body)
+ (1- (string-length (car body))))
+ #\newline))
+ body
+ (cons "\n" body)))
+ "\n"
+ (serialize-text-args lp formals args)
+ " " command "@" accum))))
+
+(define (table-environ exp lp command type formals args accum)
+ (list* "\n\n" command "@end "
+ (append-map (lambda (x) (lp x '()))
+ (reverse (if args (cddr exp) (cdr exp))))
+ "\n"
+ (let* ((arg (if args (cadar args) ""))) ;; zero or one args
+ (if (pair? arg)
+ (list (symbol->string (car arg)) "@")
+ arg))
+ " " command "@" accum))
+
+(define (wrap strings)
+ (fill-string (string-concatenate strings)
+ #:line-width 72))
+
+(define (paragraph exp lp command type formals args accum)
+ (list* "\n\n"
+ (wrap
+ (reverse
+ (append-map (lambda (x) (lp x '())) (reverse (cdr exp)))))
+ accum))
+
+(define (item exp lp command type formals args accum)
+ (list* (append-map (lambda (x) (lp x '())) (reverse (cdr exp)))
+ "@item\n"
+ accum))
+
+(define (entry exp lp command type formals args accum)
+ (list* (append-map (lambda (x) (lp x '())) (reverse (cddr exp)))
+ "\n"
+ (append-map (lambda (x) (lp x '())) (reverse (cdar args)))
+ "@item "
+ accum))
+
+(define (fragment exp lp command type formals args accum)
+ (list* "\n@c %end of fragment\n"
+ (append-map (lambda (x) (lp x '())) (reverse (cdr exp)))
+ "\n@c %start of fragment\n\n"
+ accum))
+
+(define serializers
+ `((EMPTY-COMMAND . ,empty-command)
+ (INLINE-TEXT . ,inline-text)
+ (INLINE-ARGS . ,inline-args)
+ (EOL-TEXT . ,eol-text)
+ (EOL-TEXT-ARGS . ,eol-text-args)
+ (INDEX . ,eol-text-args)
+ (EOL-ARGS . ,eol-args)
+ (ENVIRON . ,environ)
+ (TABLE-ENVIRON . ,table-environ)
+ (ENTRY . ,entry)
+ (ITEM . ,item)
+ (PARAGRAPH . ,paragraph)
+ (FRAGMENT . ,fragment)
+ (#f . ,include))) ; support writing include statements
+
+(define (serialize exp lp command type formals args accum)
+ ((or (assq-ref serializers type)
+ (error "Unknown command type" exp type))
+ exp lp command type formals args accum))
+
+(define escaped-chars '(#\} #\{ #\@))
+(define (escape str)
+ "Escapes any illegal texinfo characters (currently @{, @}, and @@)."
+ (let loop ((in (string->list str)) (out '()))
+ (if (null? in)
+ (apply string (reverse out))
+ (if (memq (car in) escaped-chars)
+ (loop (cdr in) (cons* (car in) #\@ out))
+ (loop (cdr in) (cons (car in) out))))))
+
+(define (stexi->texi tree)
+ "Serialize the stexi @var{tree} into plain texinfo."
+ (string-concatenate-reverse
+ (let lp ((in tree) (out '()))
+ (cond
+ ((or (not in) (null? in)) out)
+ ((string? in) (cons (escape in) out))
+ ((pair? in)
+ (let ((command-spec (assq (car in) texi-command-specs)))
+ (if (not command-spec)
+ (begin
+ (warn "Unknown stexi command, not rendering" in)
+ out)
+ (serialize in
+ lp
+ (symbol->string (car in))
+ (cadr command-spec)
+ (filter* symbol? (cddr command-spec))
+ (cond
+ ((and (pair? (cdr in)) (pair? (cadr in))
+ (eq? (caadr in) '%))
+ (cdadr in))
+ ((not (cadr command-spec))
+ ;; include
+ (cdr in))
+ (else
+ #f))
+ out))))
+ (else
+ (error "Invalid stexi" in))))))
+
+;;; arch-tag: d3fa16ea-0bf7-4ec5-ab9f-3f08490f77f5
diff --git a/module/texinfo/string-utils.scm b/module/texinfo/string-utils.scm
new file mode 100644
index 000000000..eff914369
--- /dev/null
+++ b/module/texinfo/string-utils.scm
@@ -0,0 +1,400 @@
+;;;; (texinfo string-utils) -- text filling and wrapping
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Copyright (C) 2003 Richard Todd
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;;
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;;; Lesser General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;;;;
+
+;;; Commentary:
+;; Module @samp{(texinfo string-utils)} provides various string-related
+;; functions useful to Guile's texinfo support.
+;;; Code:
+
+(define-module (texinfo string-utils)
+ #:use-module (srfi srfi-13)
+ #:use-module (srfi srfi-14)
+ #:use-module (oop goops)
+ #:export (escape-special-chars
+ transform-string
+ expand-tabs
+ center-string
+ left-justify-string
+ right-justify-string
+ collapse-repeated-chars
+ make-text-wrapper
+ fill-string
+ string->wrapped-lines))
+
+(define* (transform-string str match? replace #:optional (start #f) (end #f))
+"Uses @var{match?} against each character in @var{str}, and performs a
+replacement on each character for which matches are found.
+
+@var{match?} may either be a function, a character, a string, or
+@code{#t}. If @var{match?} is a function, then it takes a single
+character as input, and should return @samp{#t} for matches.
+@var{match?} is a character, it is compared to each string character
+using @code{char=?}. If @var{match?} is a string, then any character
+in that string will be considered a match. @code{#t} will cause
+every character to be a match.
+
+If @var{replace} is a function, it is called with the matched
+character as an argument, and the returned value is sent to the output
+string via @samp{display}. If @var{replace} is anything else, it is
+sent through the output string via @samp{display}.
+
+Note that te replacement for the matched characters does not need to
+be a single character. That is what differentiates this function from
+@samp{string-map}, and what makes it useful for applications such as
+converting @samp{#\\&} to @samp{\"&amp;\"} in web page text. Some other
+functions in this module are just wrappers around common uses of
+@samp{transform-string}. Transformations not possible with this
+function should probably be done with regular expressions.
+
+If @var{start} and @var{end} are given, they control which portion
+of the string undergoes transformation. The entire input string
+is still output, though. So, if @var{start} is @samp{5}, then the
+first five characters of @var{str} will still appear in the returned
+string.
+
+@lisp
+; these two are equivalent...
+ (transform-string str #\\space #\\-) ; change all spaces to -'s
+ (transform-string str (lambda (c) (char=? #\\space c)) #\\-)
+@end lisp"
+ ;; I had implemented this with string-fold, but it was
+ ;; slower...
+ (let* ((os (open-output-string))
+ (matcher (cond ((char? match?)
+ (lambda (c) (char=? match? c)))
+ ((procedure? match?)
+ match?)
+ ((string? match?)
+ (lambda (c) (string-index match? c)))
+ ((boolean? match?)
+ (lambda (c) match?))
+ (else (throw 'bad-type "expected #t, char, string, or procedure"))))
+ (replacer (if (procedure? replace)
+ (lambda (c) (display (replace c) os))
+ (lambda (c) (display replace os)))))
+
+ ;; put the first part in, un-transformed if they asked for it...
+ (if (and start (<= start (string-length str)))
+ (display (substring str 0 start) os))
+
+ ;; process the portion they want processed....
+ (string-for-each
+ (lambda (c)
+ (if (matcher c)
+ ;; we have a match! replace the char as directed...
+ (replacer c)
+
+ ;; not a match, just insert the character itself...
+ (write-char c os)))
+ str
+ (or start 0)
+ (or end (string-length str)))
+
+ ;; if there was any at the end, tack it on...
+ (if (and end (< end (string-length str)))
+ (display (substring str end) os))
+
+ (get-output-string os)))
+
+(define* (expand-tabs str #:optional (tab-size 8))
+"Returns a copy of @var{str} with all tabs expanded to spaces. @var{tab-size} defaults to 8.
+
+Assuming tab size of 8, this is equivalent to: @lisp
+ (transform-string str #\\tab \" \")
+@end lisp"
+ (transform-string str
+ #\tab
+ (make-string tab-size #\space)))
+
+(define (escape-special-chars str special-chars escape-char)
+"Returns a copy of @var{str} with all given special characters preceded
+by the given @var{escape-char}.
+
+@var{special-chars} can either be a single character, or a string consisting
+of all the special characters.
+
+@lisp
+;; make a string regexp-safe...
+ (escape-special-chars \"***(Example String)***\"
+ \"[]()/*.\"
+ #\\\\)
+=> \"\\\\*\\\\*\\\\*\\\\(Example String\\\\)\\\\*\\\\*\\\\*\"
+
+;; also can escape a singe char...
+ (escape-special-chars \"richardt@@vzavenue.net\"
+ #\\@@
+ #\\@@)
+=> \"richardt@@@@vzavenue.net\"
+@end lisp"
+ (transform-string str
+ (if (char? special-chars)
+ ;; if they gave us a char, use char=?
+ (lambda (c) (char=? c special-chars))
+
+ ;; if they gave us a string, see if our character is in it
+ (lambda (c) (string-index special-chars c)))
+
+ ;; replace matches with the character preceded by the escape character
+ (lambda (c) (string escape-char c))))
+
+(define* (center-string str #:optional (width 80) (chr #\space) (rchr #f))
+"Returns a copy of @var{str} centered in a field of @var{width}
+characters. Any needed padding is done by character @var{chr}, which
+defaults to @samp{#\\space}. If @var{rchr} is provided, then the
+padding to the right will use it instead. See the examples below.
+left and @var{rchr} on the right. The default @var{width} is 80. The
+default @var{lchr} and @var{rchr} is @samp{#\\space}. The string is
+never truncated.
+@lisp
+ (center-string \"Richard Todd\" 24)
+=> \" Richard Todd \"
+
+ (center-string \" Richard Todd \" 24 #\\=)
+=> \"===== Richard Todd =====\"
+
+ (center-string \" Richard Todd \" 24 #\\< #\\>)
+=> \"<<<<< Richard Todd >>>>>\"
+@end lisp"
+ (let* ((len (string-length str))
+ (lpad (make-string (max (quotient (- width len) 2) 0) chr))
+ ;; right-char == char unless it has been provided by the user
+ (right-chr (or rchr chr))
+ (rpad (if (char=? right-chr chr)
+ lpad
+ (make-string (max (quotient (- width len) 2) 0) right-chr))))
+ (if (>= len width)
+ str
+ (string-append lpad str rpad (if (odd? (- width len)) (string right-chr) "")))))
+
+(define* (left-justify-string str #:optional (width 80) (chr #\space))
+"@code{left-justify-string str [width chr]}.
+Returns a copy of @var{str} padded with @var{chr} such that it is left
+justified in a field of @var{width} characters. The default
+@var{width} is 80. Unlike @samp{string-pad} from srfi-13, the string
+is never truncated."
+ (let* ((len (string-length str))
+ (pad (make-string (max (- width len) 0) chr)))
+ (if (>= len width)
+ str
+ (string-append str pad))))
+
+(define* (right-justify-string str #:optional (width 80) (chr #\space))
+"Returns a copy of @var{str} padded with @var{chr} such that it is
+right justified in a field of @var{width} characters. The default
+@var{width} is 80. The default @var{chr} is @samp{#\\space}. Unlike
+@samp{string-pad} from srfi-13, the string is never truncated."
+ (let* ((len (string-length str))
+ (pad (make-string (max (- width len) 0) chr)))
+ (if (>= len width)
+ str
+ (string-append pad str))))
+
+ (define* (collapse-repeated-chars str #:optional (chr #\space) (num 1))
+"Returns a copy of @var{str} with all repeated instances of
+@var{chr} collapsed down to at most @var{num} instances.
+The default value for @var{chr} is @samp{#\\space}, and
+the default value for @var{num} is 1.
+
+@lisp
+ (collapse-repeated-chars \"H e l l o\")
+=> \"H e l l o\"
+ (collapse-repeated-chars \"H--e--l--l--o\" #\\-)
+=> \"H-e-l-l-o\"
+ (collapse-repeated-chars \"H-e--l---l----o\" #\\- 2)
+=> \"H-e--l--l--o\"
+@end lisp"
+ ;; define repeat-locator as a stateful match? function which remembers
+ ;; the last character it had seen.
+ (let ((repeat-locator
+ ;; initialize prev-chr to something other than what we're seeking...
+ (let ((prev-chr (if (char=? chr #\space) #\A #\space))
+ (match-count 0))
+ (lambda (c)
+ (if (and (char=? c prev-chr)
+ (char=? prev-chr chr))
+ ;; found enough duplicates if the match-count is high enough
+ (begin
+ (set! match-count (+ 1 match-count))
+ (>= match-count num))
+
+ ;; did not find a duplicate
+ (begin (set! match-count 0)
+ (set! prev-chr c)
+ #f))))))
+
+ ;; transform the string with our stateful matcher...
+ ;; deleting matches...
+ (transform-string str repeat-locator "")))
+
+;; split a text string into segments that have the form...
+;; <ws non-ws> <ws non-ws> etc..
+(define (split-by-single-words str)
+ (let ((non-wschars (char-set-complement char-set:whitespace)))
+ (let loop ((ans '())
+ (index 0))
+ (let ((next-non-ws (string-index str non-wschars index)))
+ (if next-non-ws
+ ;; found non-ws...look for ws following...
+ (let ((next-ws (string-index str char-set:whitespace next-non-ws)))
+ (if next-ws
+ ;; found the ws following...
+ (loop (cons (substring str index next-ws) ans)
+ next-ws)
+ ;; did not find ws...must be the end...
+ (reverse (cons (substring str index) ans))))
+ ;; did not find non-ws... only ws at end of the string...
+ (reverse ans))))))
+
+(define* (make-text-wrapper #:key
+ (line-width 80)
+ (expand-tabs? #t)
+ (tab-width 8)
+ (collapse-whitespace? #t)
+ (subsequent-indent "")
+ (initial-indent "")
+ (break-long-words? #t))
+ "Returns a procedure that will split a string into lines according to the
+given parameters.
+
+@table @code
+@item #:line-width
+This is the target length used when deciding where to wrap lines.
+Default is 80.
+
+@item #:expand-tabs?
+Boolean describing whether tabs in the input should be expanded. Default
+is #t.
+
+@item #:tab-width
+If tabs are expanded, this will be the number of spaces to which they
+expand. Default is 8.
+
+@item #:collapse-whitespace?
+Boolean describing whether the whitespace inside the existing text
+should be removed or not. Default is #t.
+
+If text is already well-formatted, and is just being wrapped to fit in a
+different width, then set this to @samp{#f}. This way, many common text
+conventions (such as two spaces between sentences) can be preserved if
+in the original text. If the input text spacing cannot be trusted, then
+leave this setting at the default, and all repeated whitespace will be
+collapsed down to a single space.
+
+@item #:initial-indent
+Defines a string that will be put in front of the first line of wrapped
+text. Default is the empty string, ``''.
+
+@item #:subsequent-indent
+Defines a string that will be put in front of all lines of wrapped
+text, except the first one. Default is the empty string, ``''.
+
+@item #:break-long-words?
+If a single word is too big to fit on a line, this setting tells the
+wrapper what to do. Defaults to #t, which will break up long words.
+When set to #f, the line will be allowed, even though it is longer
+than the defined @code{#:line-width}.
+@end table
+
+The return value is a procedure of one argument, the input string, which
+returns a list of strings, where each element of the list is one line."
+ (lambda (str)
+ ;; replace newlines with spaces
+ (set! str (transform-string str (lambda (c) (char=? c #\nl)) #\space))
+
+ ;; expand tabs if they wanted us to...
+ (if expand-tabs?
+ (set! str (expand-tabs str tab-width)))
+
+ ;; collapse whitespace if they wanted us to...
+ (if collapse-whitespace?
+ (set! str (collapse-repeated-chars str)))
+
+ ;; drop any whitespace from the front...
+ (set! str (string-trim str))
+
+ ;; now start breaking the text into lines...
+ (let loop ((ans '())
+ (words (split-by-single-words str))
+ (line initial-indent)
+ (count 0))
+ (if (null? words)
+ ;; out of words? ...done!
+ (reverse (if (> count 0)
+ (cons line ans)
+ ans))
+
+ ;; not out of words...keep going...
+ (let ((length-left (- line-width
+ (string-length line)))
+ (next-word (if (= count 0)
+ (string-trim (car words))
+ (car words))))
+ (cond
+ ;; does the next entry fit?
+ ((<= (string-length next-word)
+ length-left)
+ (loop ans
+ (cdr words)
+ (string-append line next-word)
+ (+ count 1)))
+
+ ;; ok, it didn't fit...is there already at least one word on the line?
+ ((> count 0)
+ ;; try to use it for the next line, then...
+ (loop (cons line ans)
+ words
+ subsequent-indent
+ 0))
+
+ ;; ok, it didn't fit...and it's the first word.
+ ;; were we told to break up long words?
+ (break-long-words?
+ ;; break the like at the limit, since the user wants us to...
+ (loop (cons (string-append line (substring next-word 0 length-left))
+ ans)
+ (cons (substring next-word length-left)
+ (cdr words))
+ subsequent-indent
+ 0))
+
+ ;; well, then is it the first word and we *shouldn't* break long words, then...
+ (else
+ (loop (cons (string-append line next-word)
+ ans)
+ (cdr words)
+ subsequent-indent
+ 0))))))))
+
+(define (string->wrapped-lines str . kwargs)
+ "@code{string->wrapped-lines str keywds ...}. Wraps the text given in
+string @var{str} according to the parameters provided in @var{keywds},
+or the default setting if they are not given. Returns a list of strings
+representing the formatted lines. Valid keyword arguments are discussed
+in @code{make-text-wrapper}."
+ ((apply make-text-wrapper kwargs) str))
+
+(define (fill-string str . kwargs)
+ "Wraps the text given in string @var{str} according to the parameters
+provided in @var{keywds}, or the default setting if they are not
+given. Returns a single string with the wrapped text. Valid keyword
+arguments are discussed in @code{make-text-wrapper}."
+ (string-join (apply string->wrapped-lines str kwargs)
+ "\n"
+ 'infix))