diff options
Diffstat (limited to 'module')
-rw-r--r-- | module/Makefile.am | 54 | ||||
-rw-r--r-- | module/statprof.scm | 688 | ||||
-rw-r--r-- | module/sxml/apply-templates.scm | 102 | ||||
-rw-r--r-- | module/sxml/fold.scm | 259 | ||||
-rw-r--r-- | module/sxml/simple.scm | 169 | ||||
-rw-r--r-- | module/sxml/ssax.scm | 246 | ||||
-rw-r--r-- | module/sxml/ssax/input-parse.scm | 180 | ||||
-rw-r--r-- | module/sxml/transform.scm | 298 | ||||
-rw-r--r-- | module/sxml/upstream/COPYING.SSAX | 2 | ||||
-rw-r--r-- | module/sxml/upstream/SSAX.scm | 3212 | ||||
-rw-r--r-- | module/sxml/upstream/SXML-tree-trans.scm | 249 | ||||
-rw-r--r-- | module/sxml/upstream/SXPath-old.scm | 1216 | ||||
-rw-r--r-- | module/sxml/upstream/assert.scm | 35 | ||||
-rw-r--r-- | module/sxml/upstream/input-parse.scm | 326 | ||||
-rw-r--r-- | module/sxml/xpath.scm | 493 | ||||
-rw-r--r-- | module/texinfo.scm | 1217 | ||||
-rw-r--r-- | module/texinfo/docbook.scm | 233 | ||||
-rw-r--r-- | module/texinfo/html.scm | 259 | ||||
-rw-r--r-- | module/texinfo/indexing.scm | 78 | ||||
-rw-r--r-- | module/texinfo/plain-text.scm | 319 | ||||
-rw-r--r-- | module/texinfo/reflection.scm | 528 | ||||
-rw-r--r-- | module/texinfo/serialize.scm | 263 | ||||
-rw-r--r-- | module/texinfo/string-utils.scm | 400 |
23 files changed, 10811 insertions, 15 deletions
diff --git a/module/Makefile.am b/module/Makefile.am index 21c36243a..54ceca917 100644 --- a/module/Makefile.am +++ b/module/Makefile.am @@ -63,7 +63,8 @@ SOURCES = \ $(SCRIPTS_SOURCES) \ $(ECMASCRIPT_LANG_SOURCES) \ $(ELISP_LANG_SOURCES) \ - $(BRAINFUCK_LANG_SOURCES) + $(BRAINFUCK_LANG_SOURCES) \ + $(LIB_SOURCES) ## test.scm is not currently installed. EXTRA_DIST += \ @@ -283,20 +284,43 @@ SYSTEM_SOURCES = \ system/repl/repl.scm system/repl/common.scm \ system/repl/command.scm +LIB_SOURCES = \ + statprof.scm \ + sxml/apply-templates.scm \ + sxml/fold.scm \ + sxml/simple.scm \ + sxml/ssax/input-parse.scm \ + sxml/ssax.scm \ + sxml/transform.scm \ + sxml/xpath.scm \ + texinfo.scm \ + texinfo/docbook.scm \ + texinfo/html.scm \ + texinfo/indexing.scm \ + texinfo/string-utils.scm \ + texinfo/plain-text.scm \ + texinfo/reflection.scm \ + texinfo/serialize.scm + EXTRA_DIST += oop/ChangeLog-2008 NOCOMP_SOURCES = \ - ice-9/gds-client.scm \ - ice-9/psyntax.scm \ - ice-9/quasisyntax.scm \ - system/repl/describe.scm \ - ice-9/debugger/command-loop.scm \ - ice-9/debugger/commands.scm \ - ice-9/debugger/state.scm \ - ice-9/debugger/trc.scm \ - ice-9/debugger/utils.scm \ - ice-9/debugging/example-fns.scm \ - ice-9/debugging/steps.scm \ - ice-9/debugging/trace.scm \ - ice-9/debugging/traps.scm \ - ice-9/debugging/trc.scm + ice-9/gds-client.scm \ + ice-9/psyntax.scm \ + ice-9/quasisyntax.scm \ + system/repl/describe.scm \ + ice-9/debugger/command-loop.scm \ + ice-9/debugger/commands.scm \ + ice-9/debugger/state.scm \ + ice-9/debugger/trc.scm \ + ice-9/debugger/utils.scm \ + ice-9/debugging/example-fns.scm \ + ice-9/debugging/steps.scm \ + ice-9/debugging/trace.scm \ + ice-9/debugging/traps.scm \ + ice-9/debugging/trc.scm \ + sxml/upstream/SSAX.scm \ + sxml/upstream/SXML-tree-trans.scm \ + sxml/upstream/SXPath-old.scm \ + sxml/upstream/assert.scm \ + sxml/upstream/input-parse.scm diff --git a/module/statprof.scm b/module/statprof.scm new file mode 100644 index 000000000..f021778a9 --- /dev/null +++ b/module/statprof.scm @@ -0,0 +1,688 @@ +;;;; (statprof) -- a statistical profiler for Guile +;;;; -*-scheme-*- +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> +;;;; Copyright (C) 2001 Rob Browning <rlb at defaultvalue dot org> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + + +;;; Commentary: +;; +;;@code{(statprof)} is intended to be a fairly simple +;;statistical profiler for guile. It is in the early stages yet, so +;;consider its output still suspect, and please report any bugs to +;;@email{guile-devel at gnu.org}, or to me directly at @email{rlb at +;;defaultvalue.org}. +;; +;;A simple use of statprof would look like this: +;; +;;@example +;; (statprof-reset 0 50000 #t) +;; (statprof-start) +;; (do-something) +;; (statprof-stop) +;; (statprof-display) +;;@end example +;; +;;This would reset statprof, clearing all accumulated statistics, then +;;start profiling, run some code, stop profiling, and finally display a +;;gprof flat-style table of statistics which will look something like +;;this: +;; +;;@example +;; % cumulative self self total +;; time seconds seconds calls ms/call ms/call name +;; 35.29 0.23 0.23 2002 0.11 0.11 - +;; 23.53 0.15 0.15 2001 0.08 0.08 positive? +;; 23.53 0.15 0.15 2000 0.08 0.08 + +;; 11.76 0.23 0.08 2000 0.04 0.11 do-nothing +;; 5.88 0.64 0.04 2001 0.02 0.32 loop +;; 0.00 0.15 0.00 1 0.00 150.59 do-something +;; ... +;;@end example +;; +;;All of the numerical data with the exception of the calls column is +;;statistically approximate. In the following column descriptions, and +;;in all of statprof, "time" refers to execution time (both user and +;;system), not wall clock time. +;; +;;@table @asis +;;@item % time +;;The percent of the time spent inside the procedure itself +;;(not counting children). +;;@item cumulative seconds +;;The total number of seconds spent in the procedure, including +;;children. +;;@item self seconds +;;The total number of seconds spent in the procedure itself (not counting +;;children). +;;@item calls +;;The total number of times the procedure was called. +;;@item self ms/call +;;The average time taken by the procedure itself on each call, in ms. +;;@item total ms/call +;;The average time taken by each call to the procedure, including time +;;spent in child functions. +;;@item name +;;The name of the procedure. +;;@end table +;; +;;The profiler uses @code{eq?} and the procedure object itself to +;;identify the procedures, so it won't confuse different procedures with +;;the same name. They will show up as two different rows in the output. +;; +;;Right now the profiler is quite simplistic. I cannot provide +;;call-graphs or other higher level information. What you see in the +;;table is pretty much all there is. Patches are welcome :-) +;; +;;@section Implementation notes +;; +;;The profiler works by setting the unix profiling signal +;;@code{ITIMER_PROF} to go off after the interval you define in the call +;;to @code{statprof-reset}. When the signal fires, a sampling routine is +;;run which looks at the current procedure that's executing, and then +;;crawls up the stack, and for each procedure encountered, increments +;;that procedure's sample count. Note that if a procedure is encountered +;;multiple times on a given stack, it is only counted once. After the +;;sampling is complete, the profiler resets profiling timer to fire +;;again after the appropriate interval. +;; +;;Meanwhile, the profiler keeps track, via @code{get-internal-run-time}, +;;how much CPU time (system and user -- which is also what +;;@code{ITIMER_PROF} tracks), has elapsed while code has been executing +;;within a statprof-start/stop block. +;; +;;The profiler also tries to avoid counting or timing its own code as +;;much as possible. +;; +;;; Code: + +;; When you add new features, please also add tests to ./tests/ if you +;; have time, and then add the new files to ./run-tests. Also, if +;; anyone's bored, there are a lot of existing API bits that don't +;; have tests yet. + +;; TODO +;; +;; Check about profiling C functions -- does profiling primitives work? +;; Also look into stealing code from qprof so we can sample the C stack +;; Call graphs? + +(define-module (statprof) + #:use-module (srfi srfi-1) + #:autoload (ice-9 format) (format) + #:export (statprof-active? + statprof-start + statprof-stop + statprof-reset + + statprof-accumulated-time + statprof-sample-count + statprof-fold-call-data + statprof-proc-call-data + statprof-call-data-name + statprof-call-data-calls + statprof-call-data-cum-samples + statprof-call-data-self-samples + statprof-call-data->stats + + statprof-stats-proc-name + statprof-stats-%-time-in-proc + statprof-stats-cum-secs-in-proc + statprof-stats-self-secs-in-proc + statprof-stats-calls + statprof-stats-self-secs-per-call + statprof-stats-cum-secs-per-call + + statprof-display + statprof-display-anomolies + + statprof-fetch-stacks + statprof-fetch-call-tree + + with-statprof)) + + +;; This profiler tracks two numbers for every function called while +;; it's active. It tracks the total number of calls, and the number +;; of times the function was active when the sampler fired. +;; +;; Globally the profiler tracks the total time elapsed and the number +;; of times the sampler was fired. +;; +;; Right now, this profiler is not per-thread and is not thread safe. + +(define accumulated-time #f) ; total so far. +(define last-start-time #f) ; start-time when timer is active. +(define sample-count #f) ; total count of sampler calls. +(define sampling-frequency #f) ; in (seconds . microseconds) +(define remaining-prof-time #f) ; time remaining when prof suspended. +(define profile-level 0) ; for user start/stop nesting. +(define %count-calls? #t) ; whether to catch apply-frame. +(define gc-time-taken 0) ; gc time between statprof-start and + ; statprof-stop. +(define record-full-stacks? #f) ; if #t, stash away the stacks + ; for later analysis. +(define stacks '()) + +;; procedure-data will be a hash where the key is the function object +;; itself and the value is the data. The data will be a vector like +;; this: #(name call-count cum-sample-count self-sample-count) +(define procedure-data #f) + +;; If you change the call-data data structure, you need to also change +;; sample-uncount-frame. +(define (make-call-data name call-count cum-sample-count self-sample-count) + (vector (or name (error "internal error (we don't count anonymous procs)")) + call-count cum-sample-count self-sample-count)) +(define (call-data-name cd) (vector-ref cd 0)) +(define (call-data-call-count cd) (vector-ref cd 1)) +(define (call-data-cum-sample-count cd) (vector-ref cd 2)) +(define (call-data-self-sample-count cd) (vector-ref cd 3)) + +(define (set-call-data-name! cd name) + (vector-set! cd 0 name)) +(define (inc-call-data-call-count! cd) + (vector-set! cd 1 (1+ (vector-ref cd 1)))) +(define (inc-call-data-cum-sample-count! cd) + (vector-set! cd 2 (1+ (vector-ref cd 2)))) +(define (inc-call-data-self-sample-count! cd) + (vector-set! cd 3 (1+ (vector-ref cd 3)))) + +(define-macro (accumulate-time stop-time) + `(set! accumulated-time + (+ accumulated-time 0.0 (- ,stop-time last-start-time)))) + +(define (get-call-data proc) + (or (hashq-ref procedure-data proc) + (let ((call-data (make-call-data (procedure-name proc) 0 0 0))) + (hashq-set! procedure-data proc call-data) + call-data))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; SIGPROF handler + +(define (sample-stack-procs stack) + (let ((stacklen (stack-length stack)) + (hit-count-call? #f)) + + (if record-full-stacks? + (set! stacks (cons stack stacks))) + + (set! sample-count (+ sample-count 1)) + ;; Now accumulate stats for the whole stack. + (let loop ((frame (stack-ref stack 0)) + (procs-seen (make-hash-table 13)) + (self #f)) + (cond + ((not frame) + (hash-fold + (lambda (proc val accum) + (inc-call-data-cum-sample-count! + (get-call-data proc))) + #f + procs-seen) + (and=> (and=> self get-call-data) + inc-call-data-self-sample-count!)) + ((frame-procedure frame) + => (lambda (proc) + (cond + ((eq? proc count-call) + ;; We're not supposed to be sampling count-call and + ;; its sub-functions, so loop again with a clean + ;; slate. + (set! hit-count-call? #t) + (loop (frame-previous frame) (make-hash-table 13) #f)) + ((procedure-name proc) + (hashq-set! procs-seen proc #t) + (loop (frame-previous frame) + procs-seen + (or self proc))) + (else + (loop (frame-previous frame) procs-seen self))))) + (else + (loop (frame-previous frame) procs-seen self)))) + hit-count-call?)) + +(define inside-profiler? #f) + +(define (profile-signal-handler sig) + (set! inside-profiler? #t) + + ;; FIXME: with-statprof should be able to set an outer frame for the + ;; stack cut + (if (positive? profile-level) + (let* ((stop-time (get-internal-run-time)) + ;; cut down to the signal handler, then we rely on + ;; knowledge of guile: it dispatches signal handlers + ;; through a thunk, so cut one more procedure + (stack (make-stack #t profile-signal-handler 0 1)) + (inside-apply-trap? (sample-stack-procs stack))) + + (if (not inside-apply-trap?) + (begin + ;; disabling here is just a little more efficient, but + ;; not necessary given inside-profiler?. We can't just + ;; disable unconditionally at the top of this function + ;; and eliminate inside-profiler? because it seems to + ;; confuse guile wrt re-enabling the trap when + ;; count-call finishes. + (if %count-calls? (trap-disable 'apply-frame)) + (accumulate-time stop-time))) + + (setitimer ITIMER_PROF + 0 0 + (car sampling-frequency) + (cdr sampling-frequency)) + + (if (not inside-apply-trap?) + (begin + (set! last-start-time (get-internal-run-time)) + (if %count-calls? (trap-enable 'apply-frame)))))) + + (set! inside-profiler? #f)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Count total calls. + +(define (count-call trap-name continuation tail) + (if (not inside-profiler?) + (begin + (accumulate-time (get-internal-run-time)) + + (and=> (frame-procedure (last-stack-frame continuation)) + (lambda (proc) + (if (procedure-name proc) + (inc-call-data-call-count! + (get-call-data proc))))) + + (set! last-start-time (get-internal-run-time))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (statprof-active?) + "Returns @code{#t} if @code{statprof-start} has been called more times +than @code{statprof-stop}, @code{#f} otherwise." + (positive? profile-level)) + +;; Do not call this from statprof internal functions -- user only. +(define (statprof-start) + "Start the profiler.@code{}" + ;; After some head-scratching, I don't *think* I need to mask/unmask + ;; signals here, but if I'm wrong, please let me know. + (set! profile-level (+ profile-level 1)) + (if (= profile-level 1) + (let* ((rpt remaining-prof-time) + (use-rpt? (and rpt + (or (positive? (car rpt)) + (positive? (cdr rpt)))))) + (set! remaining-prof-time #f) + (set! last-start-time (get-internal-run-time)) + (set! gc-time-taken + (cdr (assq 'gc-time-taken (gc-stats)))) + (if use-rpt? + (setitimer ITIMER_PROF 0 0 (car rpt) (cdr rpt)) + (setitimer ITIMER_PROF + 0 0 + (car sampling-frequency) + (cdr sampling-frequency))) + (trap-enable 'apply-frame) + #t))) + +;; Do not call this from statprof internal functions -- user only. +(define (statprof-stop) + "Stop the profiler.@code{}" + ;; After some head-scratching, I don't *think* I need to mask/unmask + ;; signals here, but if I'm wrong, please let me know. + (set! profile-level (- profile-level 1)) + (if (zero? profile-level) + (begin + (set! gc-time-taken + (- (cdr (assq 'gc-time-taken (gc-stats))) gc-time-taken)) + (trap-disable 'apply-frame) + ;; I believe that we need to do this before getting the time + ;; (unless we want to make things even more complicated). + (set! remaining-prof-time (setitimer ITIMER_PROF 0 0 0 0)) + (accumulate-time (get-internal-run-time)) + (set! last-start-time #f)))) + +(define (statprof-reset sample-seconds sample-microseconds count-calls? + . full-stacks?) + "Reset the statprof sampler interval to @var{sample-seconds} and +@var{sample-microseconds}. If @var{count-calls?} is true, arrange to +instrument procedure calls as well as collecting statistical profiling +data. If @var{full-stacks?} is true, collect all sampled stacks into a +list for later analysis. + +Enables traps and debugging as necessary." + (if (positive? profile-level) + (error "Can't reset profiler while profiler is running.")) + (set! %count-calls? count-calls?) + (set! accumulated-time 0) + (set! last-start-time #f) + (set! sample-count 0) + (set! sampling-frequency (cons sample-seconds sample-microseconds)) + (set! remaining-prof-time #f) + (set! procedure-data (make-hash-table 131)) + (if %count-calls? + (begin + (trap-set! apply-frame-handler count-call) + (trap-enable 'traps))) + (set! record-full-stacks? (and (pair? full-stacks?) (car full-stacks?))) + (set! stacks '()) + (debug-enable 'debug) + (sigaction SIGPROF profile-signal-handler) + #t) + +(define (statprof-fold-call-data proc init) + "Fold @var{proc} over the call-data accumulated by statprof. Cannot be +called while statprof is active. @var{proc} should take two arguments, +@code{(@var{call-data} @var{prior-result})}. + +Note that a given proc-name may appear multiple times, but if it does, +it represents different functions with the same name." + (if (positive? profile-level) + (error "Can't call statprof-fold-called while profiler is running.")) + + (hash-fold + (lambda (key value prior-result) + (proc value prior-result)) + init + procedure-data)) + +(define (statprof-proc-call-data proc) + "Returns the call-data associated with @var{proc}, or @code{#f} if +none is available." + (if (positive? profile-level) + (error "Can't call statprof-fold-called while profiler is running.")) + + (hashq-ref procedure-data proc)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Stats + +(define (statprof-call-data->stats call-data) + "Returns an object of type @code{statprof-stats}." + ;; returns (vector proc-name + ;; %-time-in-proc + ;; cum-seconds-in-proc + ;; self-seconds-in-proc + ;; num-calls + ;; self-secs-per-call + ;; total-secs-per-call) + + (let* ((proc-name (call-data-name call-data)) + (self-samples (call-data-self-sample-count call-data)) + (cum-samples (call-data-cum-sample-count call-data)) + (all-samples (statprof-sample-count)) + (secs-per-sample (/ (statprof-accumulated-time) + (statprof-sample-count))) + (num-calls (and %count-calls? (statprof-call-data-calls call-data)))) + + (vector proc-name + (* (/ self-samples all-samples) 100.0) + (* cum-samples secs-per-sample 1.0) + (* self-samples secs-per-sample 1.0) + num-calls + (and num-calls ;; maybe we only sampled in children + (if (zero? self-samples) 0.0 + (/ (* self-samples secs-per-sample) 1.0 num-calls))) + (and num-calls ;; cum-samples must be positive + (/ (* cum-samples secs-per-sample) 1.0 num-calls))))) + +(define (statprof-stats-proc-name stats) (vector-ref stats 0)) +(define (statprof-stats-%-time-in-proc stats) (vector-ref stats 1)) +(define (statprof-stats-cum-secs-in-proc stats) (vector-ref stats 2)) +(define (statprof-stats-self-secs-in-proc stats) (vector-ref stats 3)) +(define (statprof-stats-calls stats) (vector-ref stats 4)) +(define (statprof-stats-self-secs-per-call stats) (vector-ref stats 5)) +(define (statprof-stats-cum-secs-per-call stats) (vector-ref stats 6)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (stats-sorter x y) + (let ((diff (- (statprof-stats-self-secs-in-proc x) + (statprof-stats-self-secs-in-proc y)))) + (positive? + (if (= diff 0) + (- (statprof-stats-cum-secs-in-proc x) + (statprof-stats-cum-secs-in-proc y)) + diff)))) + +(define (statprof-display . port) + "Displays a gprof-like summary of the statistics collected. Unless an +optional @var{port} argument is passed, uses the current output port." + (if (null? port) (set! port (current-output-port))) + + (cond + ((zero? (statprof-sample-count)) + (format port "No samples recorded.\n")) + (else + (let* ((stats-list (statprof-fold-call-data + (lambda (data prior-value) + (cons (statprof-call-data->stats data) + prior-value)) + '())) + (sorted-stats (sort stats-list stats-sorter))) + + (define (display-stats-line stats) + (if %count-calls? + (format port "~6,2f ~9,2f ~9,2f ~8r ~8,2f ~8,2f " + (statprof-stats-%-time-in-proc stats) + (statprof-stats-cum-secs-in-proc stats) + (statprof-stats-self-secs-in-proc stats) + (statprof-stats-calls stats) + (* 1000 (statprof-stats-self-secs-per-call stats)) + (* 1000 (statprof-stats-cum-secs-per-call stats))) + (format port "~6,2f ~9,2f ~9,2f " + (statprof-stats-%-time-in-proc stats) + (statprof-stats-cum-secs-in-proc stats) + (statprof-stats-self-secs-in-proc stats))) + (display (statprof-stats-proc-name stats) port) + (newline port)) + + (if %count-calls? + (begin + (format port "~5a ~10a ~7a ~8a ~8a ~8a ~8@a\n" + "% " "cumulative" "self" "" "self" "total" "") + (format port "~5a ~9a ~8a ~8a ~8a ~8a ~8@a\n" + "time" "seconds" "seconds" "calls" "ms/call" "ms/call" "name")) + (begin + (format port "~5a ~10a ~7a ~8@a\n" + "%" "cumulative" "self" "") + (format port "~5a ~10a ~7a ~8@a\n" + "time" "seconds" "seconds" "name"))) + + (for-each display-stats-line sorted-stats) + + (display "---\n" port) + (simple-format #t "Sample count: ~A\n" (statprof-sample-count)) + (simple-format #t "Total time: ~A seconds (~A seconds in GC)\n" + (statprof-accumulated-time) + (/ gc-time-taken internal-time-units-per-second)))))) + +(define (statprof-display-anomolies) + "A sanity check that attempts to detect anomolies in statprof's +statistics.@code{}" + (statprof-fold-call-data + (lambda (data prior-value) + (if (and %count-calls? + (zero? (call-data-call-count data)) + (positive? (call-data-sample-count data))) + (simple-format #t + "==[~A ~A ~A]\n" + (call-data-name data) + (call-data-call-count data) + (call-data-sample-count data)))) + #f) + (simple-format #t "Total time: ~A\n" (statprof-accumulated-time)) + (simple-format #t "Sample count: ~A\n" (statprof-sample-count))) + +(define (statprof-accumulated-time) + "Returns the time accumulated during the last statprof run.@code{}" + (if (positive? profile-level) + (error "Can't get accumulated time while profiler is running.")) + (/ accumulated-time internal-time-units-per-second)) + +(define (statprof-sample-count) + "Returns the number of samples taken during the last statprof run.@code{}" + (if (positive? profile-level) + (error "Can't get accumulated time while profiler is running.")) + sample-count) + +(define statprof-call-data-name call-data-name) +(define statprof-call-data-calls call-data-call-count) +(define statprof-call-data-cum-samples call-data-cum-sample-count) +(define statprof-call-data-self-samples call-data-self-sample-count) + +(define (statprof-fetch-stacks) + "Returns a list of stacks, as they were captured since the last call +to @code{statprof-reset}. + +Note that stacks are only collected if the @var{full-stacks?} argument +to @code{statprof-reset} is true." + stacks) + +(define procedure=? + (if (false-if-exception (resolve-interface '(system base compile))) + (lambda (a b) + (cond + ((eq? a b)) + ((and ((@ (system vm program) program?) a) + ((@ (system vm program) program?) b)) + (eq? ((@ (system vm program) program-objcode) a) + ((@ (system vm program) program-objcode) b))) + ((and (closure? a) (closure? b) + (procedure-source a) (procedure-source b)) + (and (eq? (procedure-name a) (procedure-name b)) + (equal? (procedure-source a) (procedure-source b)))) + (else + #f))) + (lambda (a b) + (cond + ((eq? a b)) + ((and (closure? a) (closure? b) + (procedure-source a) (procedure-source b)) + (and (eq? (procedure-name a) (procedure-name b)) + (equal? (procedure-source a) (procedure-source b)))) + (else + #f))))) + +;; tree ::= (car n . tree*) + +(define (lists->trees lists equal?) + (let lp ((in lists) (n-terminal 0) (tails '())) + (cond + ((null? in) + (let ((trees (map (lambda (tail) + (cons (car tail) + (lists->trees (cdr tail) equal?))) + tails))) + (cons (apply + n-terminal (map cadr trees)) + (sort trees + (lambda (a b) (> (cadr a) (cadr b))))))) + ((null? (car in)) + (lp (cdr in) (1+ n-terminal) tails)) + ((find (lambda (x) (equal? (car x) (caar in))) + tails) + => (lambda (tail) + (lp (cdr in) + n-terminal + (assq-set! tails + (car tail) + (cons (cdar in) (cdr tail)))))) + (else + (lp (cdr in) + n-terminal + (acons (caar in) (list (cdar in)) tails)))))) + +(define (stack->procedures stack) + (filter identity + (unfold-right (lambda (x) (not x)) + frame-procedure + frame-previous + (stack-ref stack 0)))) + +(define (statprof-fetch-call-tree) + "Return a call tree for the previous statprof run. + +The return value is a list of nodes, each of which is of the type: +@code + node ::= (@var{proc} @var{count} . @var{nodes}) +@end code" + (cons #t (lists->trees (map stack->procedures stacks) procedure=?))) + +(define-macro (with-statprof . args) + "Profiles the expressions in its body. + +Keyword arguments: + +@table @code +@item #:loop +Execute the body @var{loop} number of times, or @code{#f} for no looping + +default: @code{#f} +@item #:hz +Sampling rate + +default: @code{20} +@item #:count-calls? +Whether to instrument each function call (expensive) + +default: @code{#f} +@item #:full-stacks? +Whether to collect away all sampled stacks into a list + +default: @code{#f} +@end table" + (define (kw-arg-ref kw args def) + (cond + ((null? args) (error "Invalid macro body")) + ((keyword? (car args)) + (if (eq? (car args) kw) + (cadr args) + (kw-arg-ref kw (cddr args) def))) + ((eq? kw #f def) ;; asking for the body + args) + (else def))) ;; kw not found + (let ((loop (kw-arg-ref #:loop args #f)) + (hz (kw-arg-ref #:hz args 20)) + (count-calls? (kw-arg-ref #:count-calls? args #f)) + (full-stacks? (kw-arg-ref #:full-stacks? args #f)) + (body (kw-arg-ref #f args #f))) + `(dynamic-wind + (lambda () + (statprof-reset (inexact->exact (floor (/ 1 ,hz))) + (inexact->exact (* 1e6 (- (/ 1 ,hz) + (floor (/ 1 ,hz))))) + ,count-calls? + ,full-stacks?) + (statprof-start)) + (lambda () + ,(if loop + (let ((lp (gensym "statprof ")) (x (gensym))) + `(let ,lp ((,x ,loop)) + (if (not (zero? ,x)) + (begin ,@body (,lp (1- ,x)))))) + `(begin ,@body))) + (lambda () + (statprof-stop) + (statprof-display) + (set! (@@ (statprof) procedure-data) #f))))) + +;;; arch-tag: 83969178-b576-4c52-a31c-6a9c2be85d10 diff --git a/module/sxml/apply-templates.scm b/module/sxml/apply-templates.scm new file mode 100644 index 000000000..0ee27477c --- /dev/null +++ b/module/sxml/apply-templates.scm @@ -0,0 +1,102 @@ +;;;; (sxml apply-templates) -- xslt-like transformation for sxml +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright 2004 by Andy Wingo <wingo at pobox dot com>. +;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as apply-templates.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;; Pre-order traversal of a tree and creation of a new tree: +;; +;;@smallexample +;; apply-templates:: tree x <templates> -> <new-tree> +;;@end smallexample +;; where +;;@smallexample +;; <templates> ::= (<template> ...) +;; <template> ::= (<node-test> <node-test> ... <node-test> . <handler>) +;; <node-test> ::= an argument to node-typeof? above +;; <handler> ::= <tree> -> <new-tree> +;;@end smallexample +;; +;; This procedure does a @emph{normal}, pre-order traversal of an SXML +;; tree. It walks the tree, checking at each node against the list of +;; matching templates. +;; +;; If the match is found (which must be unique, i.e., unambiguous), the +;; corresponding handler is invoked and given the current node as an +;; argument. The result from the handler, which must be a @code{<tree>}, +;; takes place of the current node in the resulting tree. +;; +;; The name of the function is not accidental: it resembles rather +;; closely an @code{apply-templates} function of XSLT. +;; +;;; Code: + +(define-module (sxml apply-templates) + #:use-module (sxml ssax) + #:use-module ((sxml xpath) :hide (filter)) + + #:export (apply-templates)) + +(define (apply-templates tree templates) + + ; Filter the list of templates. If a template does not + ; contradict the given node (that is, its head matches + ; the type of the node), chop off the head and keep the + ; rest as the result. All contradicting templates are removed. + (define (filter-templates node templates) + (cond + ((null? templates) templates) + ((not (pair? (car templates))) ; A good template must be a list + (filter-templates node (cdr templates))) + (((node-typeof? (caar templates)) node) + (cons (cdar templates) (filter-templates node (cdr templates)))) + (else + (filter-templates node (cdr templates))))) + + ; Here <templates> ::= [<template> | <handler>] + ; If there is a <handler> in the above list, it must + ; be only one. If found, return it; otherwise, return #f + (define (find-handler templates) + (and (pair? templates) + (cond + ((procedure? (car templates)) + (if (find-handler (cdr templates)) + (error "ambiguous template match")) + (car templates)) + (else (find-handler (cdr templates)))))) + + (let loop ((tree tree) (active-templates '())) + ;(cout "active-templates: " active-templates nl "tree: " tree nl) + (if (nodeset? tree) + (map-union (lambda (a-tree) (loop a-tree active-templates)) tree) + (let ((still-active-templates + (append + (filter-templates tree active-templates) + (filter-templates tree templates)))) + (cond + ;((null? still-active-templates) '()) + ((find-handler still-active-templates) => + (lambda (handler) (handler tree))) + ((not (pair? tree)) '()) + (else + (loop (cdr tree) still-active-templates))))))) + +;;; arch-tag: 88cd87de-8825-4ab3-9721-cf99694fb787 +;;; templates.scm ends here diff --git a/module/sxml/fold.scm b/module/sxml/fold.scm new file mode 100644 index 000000000..4a39da050 --- /dev/null +++ b/module/sxml/fold.scm @@ -0,0 +1,259 @@ +;;;; (sxml fold) -- transformation of sxml via fold operations +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Written 2007 by Andy Wingo <wingo at pobox dot com>. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;; @code{(sxml fold)} defines a number of variants of the @dfn{fold} +;; algorithm for use in transforming SXML trees. Additionally it defines +;; the layout operator, @code{fold-layout}, which might be described as +;; a context-passing variant of SSAX's @code{pre-post-order}. +;; +;;; Code: + +(define-module (sxml fold) + #:export (foldt + fold + foldts + foldts* + fold-values + foldts*-values + fold-layout)) + +(define (atom? x) + (not (pair? x))) + +(define (foldt fup fhere tree) + "The standard multithreaded tree fold. + +@var{fup} is of type [a] -> a. @var{fhere} is of type object -> a. +" + (if (atom? tree) + (fhere tree) + (fup (map (lambda (kid) + (foldt fup fhere kid)) + tree)))) + +(define (fold proc seed list) + "The standard list fold. + +@var{proc} is of type a -> b -> b. @var{seed} is of type b. @var{list} +is of type [a]." + (if (null? list) + seed + (fold proc (proc (car list) seed) (cdr list)))) + +(define (foldts fdown fup fhere seed tree) + "The single-threaded tree fold originally defined in SSAX. +@xref{sxml ssax,,(sxml ssax)}, for more information." + (if (atom? tree) + (fhere seed tree) + (fup seed + (fold (lambda (kid kseed) + (foldts fdown fup fhere kseed kid)) + (fdown seed tree) + tree) + tree))) + +(define (foldts* fdown fup fhere seed tree) + "A variant of @ref{sxml fold foldts,,foldts} that allows pre-order +tree rewrites. Originally defined in Andy Wingo's 2007 paper, +@emph{Applications of fold to XML transformation}." + (if (atom? tree) + (fhere seed tree) + (call-with-values + (lambda () (fdown seed tree)) + (lambda (kseed tree) + (fup seed + (fold (lambda (kid kseed) + (foldts* fdown fup fhere + kseed kid)) + kseed + tree) + tree))))) + +(define (fold-values proc list . seeds) + "A variant of @ref{sxml fold fold,,fold} that allows multi-valued +seeds. Note that the order of the arguments differs from that of +@code{fold}." + (if (null? list) + (apply values seeds) + (call-with-values + (lambda () (apply proc (car list) seeds)) + (lambda seeds + (apply fold-values proc (cdr list) seeds))))) + +(define (foldts*-values fdown fup fhere tree . seeds) + "A variant of @ref{sxml fold foldts*,,foldts*} that allows +multi-valued seeds. Originally defined in Andy Wingo's 2007 paper, +@emph{Applications of fold to XML transformation}." + (if (atom? tree) + (apply fhere tree seeds) + (call-with-values + (lambda () (apply fdown tree seeds)) + (lambda (tree . kseeds) + (call-with-values + (lambda () + (apply fold-values + (lambda (tree . seeds) + (apply foldts*-values + fdown fup fhere tree seeds)) + tree kseeds)) + (lambda kseeds + (apply fup tree (append seeds kseeds)))))))) + +(define (assq-ref alist key default) + (cond ((assq key alist) => cdr) + (else default))) + +(define (fold-layout tree bindings params layout stylesheet) + "A traversal combinator in the spirit of SSAX's @ref{sxml transform +pre-post-order,,pre-post-order}. + +@code{fold-layout} was originally presented in Andy Wingo's 2007 paper, +@emph{Applications of fold to XML transformation}. + +@example +bindings := (<binding>...) +binding := (<tag> <bandler-pair>...) + | (*default* . <post-handler>) + | (*text* . <text-handler>) +tag := <symbol> +handler-pair := (pre-layout . <pre-layout-handler>) + | (post . <post-handler>) + | (bindings . <bindings>) + | (pre . <pre-handler>) + | (macro . <macro-handler>) +@end example + +@table @var +@item pre-layout-handler +A function of three arguments: + +@table @var +@item kids +the kids of the current node, before traversal +@item params +the params of the current node +@item layout +the layout coming into this node +@end table + +@var{pre-layout-handler} is expected to use this information to return a +layout to pass to the kids. The default implementation returns the +layout given in the arguments. + +@item post-handler +A function of five arguments: +@table @var +@item tag +the current tag being processed +@item params +the params of the current node +@item layout +the layout coming into the current node, before any kids were processed +@item klayout +the layout after processing all of the children +@item kids +the already-processed child nodes +@end table + +@var{post-handler} should return two values, the layout to pass to the +next node and the final tree. + +@item text-handler +@var{text-handler} is a function of three arguments: +@table @var +@item text +the string +@item params +the current params +@item layout +the current layout +@end table + +@var{text-handler} should return two values, the layout to pass to the +next node and the value to which the string should transform. +@end table +" + (define (err . args) + (error "no binding available" args)) + (define (fdown tree bindings pcont params layout ret) + (define (fdown-helper new-bindings new-layout cont) + (let ((cont-with-tag (lambda args + (apply cont (car tree) args))) + (bindings (if new-bindings + (append new-bindings bindings) + bindings)) + (style-params (assq-ref stylesheet (car tree) '()))) + (cond + ((null? (cdr tree)) + (values + '() bindings cont-with-tag (cons style-params params) new-layout '())) + ((and (pair? (cadr tree)) (eq? (caadr tree) '@)) + (let ((params (cons (append (cdadr tree) style-params) params))) + (values + (cddr tree) bindings cont-with-tag params new-layout '()))) + (else + (values + (cdr tree) bindings cont-with-tag (cons style-params params) new-layout '()))))) + (define (no-bindings) + (fdown-helper #f layout (assq-ref bindings '*default* err))) + (define (macro macro-handler) + (fdown (apply macro-handler tree) + bindings pcont params layout ret)) + (define (pre pre-handler) + (values '() bindings + (lambda (params layout old-layout kids) + (values layout (reverse kids))) + params layout (apply pre-handler tree))) + (define (have-bindings tag-bindings) + (fdown-helper + (assq-ref tag-bindings 'bindings #f) + ((assq-ref tag-bindings 'pre-layout + (lambda (tag params layout) + layout)) + tree params layout) + (assq-ref tag-bindings 'post + (assq-ref bindings '*default* err)))) + (let ((tag-bindings (assq-ref bindings (car tree) #f))) + (cond + ((not tag-bindings) (no-bindings)) + ((assq-ref tag-bindings 'macro #f) => macro) + ((assq-ref tag-bindings 'pre #f) => pre) + (else (have-bindings tag-bindings))))) + (define (fup tree bindings cont params layout ret + kbindings kcont kparams klayout kret) + (call-with-values + (lambda () + (kcont kparams layout klayout (reverse kret))) + (lambda (klayout kret) + (values bindings cont params klayout (cons kret ret))))) + (define (fhere tree bindings cont params layout ret) + (call-with-values + (lambda () + ((assq-ref bindings '*text* err) tree params layout)) + (lambda (tlayout tret) + (values bindings cont params tlayout (cons tret ret))))) + (call-with-values + (lambda () + (foldts*-values + fdown fup fhere tree bindings #f (cons params '()) layout '())) + (lambda (bindings cont params layout ret) + (values (car ret) layout)))) diff --git a/module/sxml/simple.scm b/module/sxml/simple.scm new file mode 100644 index 000000000..a1b485420 --- /dev/null +++ b/module/sxml/simple.scm @@ -0,0 +1,169 @@ +;;;; (sxml simple) -- a simple interface to the SSAX parser +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. +;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;A simple interface to XML parsing and serialization. +;; +;;; Code: + +(define-module (sxml simple) + #:use-module (sxml ssax) + #:use-module (sxml transform) + #:use-module (ice-9 optargs) + #:use-module (srfi srfi-13) + #:export (xml->sxml sxml->xml sxml->string universal-sxslt-rules)) + +(define* (xml->sxml #:optional (port (current-input-port))) + "Use SSAX to parse an XML document into SXML. Takes one optional +argument, @var{port}, which defaults to the current input port." + (ssax:xml->sxml port '())) + +;; Universal transformation rules. Works for all XML. +(define universal-sxslt-rules + #; + "A set of @code{pre-post-order} rules that transform any SXML tree +into a form suitable for XML serialization by @code{(sxml transform)}'s +@code{SRV:send-reply}. Used internally by @code{sxml->xml}." + `((@ + ((*default* . ,(lambda (attr-key . value) ((enattr attr-key) value)))) + . ,(lambda (trigger . value) (list '@ value))) + (*ENTITY* . ,(lambda (tag name) (list "&" name ";"))) + (*PI* . ,(lambda (pi tag str) (list "<?" tag " " str "?>"))) + ;; Is this right for entities? I don't have a reference for + ;; public-id/system-id at the moment... + (*default* . ,(lambda (tag . elems) (apply (entag tag) elems))) + (*text* . ,(lambda (trigger str) + (if (string? str) (string->escaped-xml str) str))))) + +(define* (sxml->xml tree #:optional (port (current-output-port))) + "Serialize the sxml tree @var{tree} as XML. The output will be written +to the current output port, unless the optional argument @var{port} is +present." + (with-output-to-port port + (lambda () + (SRV:send-reply + (post-order + tree + universal-sxslt-rules))))) + +(define (sxml->string sxml) + "Detag an sxml tree @var{sxml} into a string. Does not perform any +formatting." + (string-concatenate-reverse + (foldts + (lambda (seed tree) ; fdown + '()) + (lambda (seed kid-seed tree) ; fup + (append! kid-seed seed)) + (lambda (seed tree) ; fhere + (if (string? tree) (cons tree seed) seed)) + '() + sxml))) + +;; The following two functions serialize tags and attributes. They are +;; being used in the node handlers for the post-order function, see +;; above. + +(define (check-name name) + (let* ((str (symbol->string name)) + (i (string-index str #\:)) + (head (or (and i (substring str 0 i)) str)) + (tail (and i (substring str (1+ i))))) + (and i (string-index (substring str (1+ i)) #\:) + (error "Invalid QName: more than one colon" name)) + (for-each + (lambda (s) + (and s + (or (char-alphabetic? (string-ref s 0)) + (eq? (string-ref s 0) #\_) + (error "Invalid name starting character" s name)) + (string-for-each + (lambda (c) + (or (char-alphabetic? c) (string-index "0123456789.-_" c) + (error "Invalid name character" c s name))) + s))) + (list head tail)))) + +(define (entag tag) + (check-name tag) + (lambda elems + (if (and (pair? elems) (pair? (car elems)) (eq? '@ (caar elems))) + (list #\< tag (cdar elems) + (if (pair? (cdr elems)) + (list #\> (cdr elems) "</" tag #\>) + " />")) + (list #\< tag + (if (pair? elems) + (list #\> elems "</" tag #\>) + " />"))))) + +(define (enattr attr-key) + (check-name attr-key) + (let ((attr-str (symbol->string attr-key))) + (lambda (value) + (list #\space attr-str + "=\"" (and (not (null? value)) value) #\")))) + +(define (make-char-quotator char-encoding) + (let ((bad-chars (map car char-encoding))) + + ;; Check to see if str contains one of the characters in charset, + ;; from the position i onward. If so, return that character's index. + ;; otherwise, return #f + (define (index-cset str i charset) + (let loop ((i i)) + (and (< i (string-length str)) + (if (memv (string-ref str i) charset) i + (loop (+ 1 i)))))) + + ;; The body of the function + (lambda (str) + (let ((bad-pos (index-cset str 0 bad-chars))) + (if (not bad-pos) str ; str had all good chars + (string-concatenate-reverse + (let loop ((from 0) (to bad-pos) (out '())) + (cond + ((>= from (string-length str)) out) + ((not to) + (cons (substring str from (string-length str)) out)) + (else + (let ((quoted-char + (cdr (assv (string-ref str to) char-encoding))) + (new-to + (index-cset str (+ 1 to) bad-chars))) + (loop (1+ to) new-to + (if (< from to) + (cons* quoted-char (substring str from to) out) + (cons quoted-char out))))))))))))) + +;; Given a string, check to make sure it does not contain characters +;; such as '<' or '&' that require encoding. Return either the original +;; string, or a list of string fragments with special characters +;; replaced by appropriate character entities. + +(define string->escaped-xml + (make-char-quotator + '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """)))) + +;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac +;;; simple.scm ends here + diff --git a/module/sxml/ssax.scm b/module/sxml/ssax.scm new file mode 100644 index 000000000..8794927e9 --- /dev/null +++ b/module/sxml/ssax.scm @@ -0,0 +1,246 @@ +;;;; (sxml ssax) -- the SSAX parser +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. +;;;; Written 2001,2002,2003,2004 by Oleg Kiselyov <oleg at pobox dot com> as SSAX.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;@subheading Functional XML parsing framework +;@subsubheading SAX/DOM and SXML parsers with support for XML Namespaces and validation +; +; This is a package of low-to-high level lexing and parsing procedures +; that can be combined to yield a SAX, a DOM, a validating parser, or +; a parser intended for a particular document type. The procedures in +; the package can be used separately to tokenize or parse various +; pieces of XML documents. The package supports XML Namespaces, +; internal and external parsed entities, user-controlled handling of +; whitespace, and validation. This module therefore is intended to be +; a framework, a set of "Lego blocks" you can use to build a parser +; following any discipline and performing validation to any degree. As +; an example of the parser construction, this file includes a +; semi-validating SXML parser. + +; The present XML framework has a "sequential" feel of SAX yet a +; "functional style" of DOM. Like a SAX parser, the framework scans the +; document only once and permits incremental processing. An application +; that handles document elements in order can run as efficiently as +; possible. @emph{Unlike} a SAX parser, the framework does not require +; an application register stateful callbacks and surrender control to +; the parser. Rather, it is the application that can drive the framework +; -- calling its functions to get the current lexical or syntax element. +; These functions do not maintain or mutate any state save the input +; port. Therefore, the framework permits parsing of XML in a pure +; functional style, with the input port being a monad (or a linear, +; read-once parameter). + +; Besides the @var{port}, there is another monad -- @var{seed}. Most of +; the middle- and high-level parsers are single-threaded through the +; @var{seed}. The functions of this framework do not process or affect +; the @var{seed} in any way: they simply pass it around as an instance +; of an opaque datatype. User functions, on the other hand, can use the +; seed to maintain user's state, to accumulate parsing results, etc. A +; user can freely mix his own functions with those of the framework. On +; the other hand, the user may wish to instantiate a high-level parser: +; @code{SSAX:make-elem-parser} or @code{SSAX:make-parser}. In the latter +; case, the user must provide functions of specific signatures, which +; are called at predictable moments during the parsing: to handle +; character data, element data, or processing instructions (PI). The +; functions are always given the @var{seed}, among other parameters, and +; must return the new @var{seed}. + +; From a functional point of view, XML parsing is a combined +; pre-post-order traversal of a "tree" that is the XML document +; itself. This down-and-up traversal tells the user about an element +; when its start tag is encountered. The user is notified about the +; element once more, after all element's children have been +; handled. The process of XML parsing therefore is a fold over the +; raw XML document. Unlike a fold over trees defined in [1], the +; parser is necessarily single-threaded -- obviously as elements +; in a text XML document are laid down sequentially. The parser +; therefore is a tree fold that has been transformed to accept an +; accumulating parameter [1,2]. + +; Formally, the denotational semantics of the parser can be expressed +; as +;@smallexample +; parser:: (Start-tag -> Seed -> Seed) -> +; (Start-tag -> Seed -> Seed -> Seed) -> +; (Char-Data -> Seed -> Seed) -> +; XML-text-fragment -> Seed -> Seed +; parser fdown fup fchar "<elem attrs> content </elem>" seed +; = fup "<elem attrs>" seed +; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed)) +; +; parser fdown fup fchar "char-data content" seed +; = parser fdown fup fchar "content" (fchar "char-data" seed) +; +; parser fdown fup fchar "elem-content content" seed +; = parser fdown fup fchar "content" ( +; parser fdown fup fchar "elem-content" seed) +;@end smallexample + +; Compare the last two equations with the left fold +;@smallexample +; fold-left kons elem:list seed = fold-left kons list (kons elem seed) +;@end smallexample + +; The real parser created by @code{SSAX:make-parser} is slightly more +; complicated, to account for processing instructions, entity +; references, namespaces, processing of document type declaration, etc. + + +; The XML standard document referred to in this module is +; @uref{http://www.w3.org/TR/1998/REC-xml-19980210.html} +; +; The present file also defines a procedure that parses the text of an +; XML document or of a separate element into SXML, an S-expression-based +; model of an XML Information Set. SXML is also an Abstract Syntax Tree +; of an XML document. SXML is similar but not identical to DOM; SXML is +; particularly suitable for Scheme-based XML/HTML authoring, SXPath +; queries, and tree transformations. See SXML.html for more details. +; SXML is a term implementation of evaluation of the XML document [3]. +; The other implementation is context-passing. + +; The present frameworks fully supports the XML Namespaces Recommendation: +; @uref{http://www.w3.org/TR/REC-xml-names/} +; Other links: +;@table @asis +;@item [1] +; Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold," +; Proc. ICFP'98, 1998, pp. 273-279. +;@item [2] +; Richard S. Bird, The promotion and accumulation strategies in +; transformational programming, ACM Trans. Progr. Lang. Systems, +; 6(4):487-504, October 1984. +;@item [3] +; Ralf Hinze, "Deriving Backtracking Monad Transformers," +; Functional Pearl. Proc ICFP'00, pp. 186-197. +;@end table +;; +;;; Code: + +(define-module (sxml ssax) + #:use-module (sxml ssax input-parse) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-13) + + #:export (current-ssax-error-port + with-ssax-error-to-port + xml-token? xml-token-kind xml-token-head + make-empty-attlist attlist-add + attlist-null? + attlist-remove-top + attlist->alist attlist-fold + ssax:uri-string->symbol + ssax:skip-internal-dtd + ssax:read-pi-body-as-string + ssax:reverse-collect-str-drop-ws + ssax:read-markup-token + ssax:read-cdata-body + ssax:read-char-ref + ssax:read-attributes + ssax:complete-start-tag + ssax:read-external-id + ssax:read-char-data + ssax:xml->sxml + ssax:make-parser + ssax:make-pi-parser + ssax:make-elem-parser)) + +(define (parser-error port message . rest) + (apply throw 'parser-error port message rest)) +(define ascii->char integer->char) +(define char->ascii char->integer) + +(define *current-ssax-error-port* (make-fluid)) +(define (current-ssax-error-port) + (fluid-ref *current-ssax-error-port*)) + +(define (with-ssax-error-to-port port thunk) + (with-fluids ((*current-ssax-error-port* port)) + (thunk))) + +(define (ssax:warn port msg . args) + (format (current-ssax-error-port) + ";;; SSAX warning: ~a ~a\n" msg args)) + +(define (ucscode->string codepoint) + (string (integer->char codepoint))) + +(define char-newline #\newline) +(define char-return #\return) +(define char-tab #\tab) +(define nl "\n") + +;; if condition is true, execute stmts in turn and return the result of +;; the last statement otherwise, return #f +(define-syntax when + (syntax-rules () + ((when condition . stmts) + (and condition (begin . stmts))))) + +;; Execute a sequence of forms and return the result of the _first_ one. +;; Like PROG1 in Lisp. Typically used to evaluate one or more forms with +;; side effects and return a value that must be computed before some or +;; all of the side effects happen. +(define-syntax begin0 + (syntax-rules () + ((begin0 form form1 ... ) + (let ((val form)) form1 ... val)))) + +; Like let* but allowing for multiple-value bindings +(define-syntax let*-values + (syntax-rules () + ((let*-values () . bodies) (begin . bodies)) + ((let*-values (((var) initializer) . rest) . bodies) + (let ((var initializer)) ; a single var optimization + (let*-values rest . bodies))) + ((let*-values ((vars initializer) . rest) . bodies) + (call-with-values (lambda () initializer) ; the most generic case + (lambda vars (let*-values rest . bodies)))))) + +;; needed for some dumb reason +(define inc 1+) +(define dec 1-) + +(define-syntax include-from-path/filtered + (lambda (x) + (define (read-filtered accept-list file) + (with-input-from-file (%search-load-path file) + (lambda () + (let loop ((sexp (read)) (out '())) + (cond + ((eof-object? sexp) (reverse out)) + ((and (pair? sexp) (memq (car sexp) accept-list)) + (loop (read) (cons sexp out))) + (else + (loop (read) out))))))) + (syntax-case x () + ((_ accept-list file) + (with-syntax (((exp ...) (datum->syntax + x + (read-filtered + (syntax->datum #'accept-list) + (syntax->datum #'file))))) + #'(begin exp ...)))))) + +(include-from-path "sxml/upstream/assert.scm") +(include-from-path/filtered + (define define-syntax ssax:define-labeled-arg-macro) + "sxml/upstream/SSAX.scm") diff --git a/module/sxml/ssax/input-parse.scm b/module/sxml/ssax/input-parse.scm new file mode 100644 index 000000000..6e845eef8 --- /dev/null +++ b/module/sxml/ssax/input-parse.scm @@ -0,0 +1,180 @@ +;;;; (sxml ssax input-parse) -- a simple lexer +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. +;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as input-parse.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;; A simple lexer. +;; +;; The procedures in this module surprisingly often suffice to parse an +;; input stream. They either skip, or build and return tokens, according +;; to inclusion or delimiting semantics. The list of characters to +;; expect, include, or to break at may vary from one invocation of a +;; function to another. This allows the functions to easily parse even +;; context-sensitive languages. +;; +;; EOF is generally frowned on, and thrown up upon if encountered. +;; Exceptions are mentioned specifically. The list of expected +;; characters (characters to skip until, or break-characters) may +;; include an EOF "character", which is to be coded as the symbol, +;; @code{*eof*}. +;; +;; The input stream to parse is specified as a @dfn{port}, which is +;; usually the last (and optional) argument. It defaults to the current +;; input port if omitted. +;; +;; If the parser encounters an error, it will throw an exception to the +;; key @code{parser-error}. The arguments will be of the form +;; @code{(@var{port} @var{message} @var{specialising-msg}*)}. +;; +;; The first argument is a port, which typically points to the offending +;; character or its neighborhood. You can then use @code{port-column} +;; and @code{port-line} to query the current position. @var{message} is +;; the description of the error. Other arguments supply more details +;; about the problem. +;; +;;; Code: + +(define-module (sxml ssax input-parse) + #:use-module (ice-9 rdelim) + #:export (peek-next-char + assert-curr-char + skip-until + skip-while + next-token + next-token-of + read-text-line + read-string + find-string-from-port?)) + +(define ascii->char integer->char) +(define char->ascii char->integer) +(define char-newline #\newline) +(define char-return #\return) +(define inc 1+) +(define dec 1-) + +;; rewrite oleg's define-opt into define* style +(define-macro (define-opt bindings body . body-rest) + (let* ((rev-bindings (reverse bindings)) + (opt-bindings + (and (pair? rev-bindings) (pair? (car rev-bindings)) + (eq? 'optional (caar rev-bindings)) + (cdar rev-bindings)))) + (if opt-bindings + `(define* ,(append (reverse (cons #:optional (cdr rev-bindings))) + opt-bindings) + ,body ,@body-rest) + `(define* ,bindings ,body ,@body-rest)))) + +(define (parser-error port message . rest) + (apply throw 'parser-error port message rest)) + +(include-from-path "sxml/upstream/input-parse.scm") + +;; This version for guile is quite speedy, due to read-delimited (which +;; is implemented in C). +(define-opt (next-token prefix-skipped-chars break-chars + (optional (comment "") (port (current-input-port))) ) + (let ((delims (list->string (delete '*eof* break-chars)))) + (if (eof-object? (if (null? prefix-skipped-chars) + (peek-char port) + (skip-while prefix-skipped-chars port))) + (if (memq '*eof* break-chars) + "" + (parser-error port "EOF while reading a token " comment)) + (let ((token (read-delimited delims port 'peek))) + (if (and (eof-object? (peek-char port)) + (not (memq '*eof* break-chars))) + (parser-error port "EOF while reading a token " comment) + token))))) + +(define-opt (read-text-line (optional (port (current-input-port))) ) + (read-line port)) + +;; Written 1995, 1996 by Oleg Kiselyov (oleg@acm.org) +;; Modified 1996, 1997, 1998, 2001 by A. Jaffer (agj@alum.mit.edu) +;; Modified 2003 by Steve VanDevender (stevev@hexadecimal.uoregon.edu) +;; Modified 2004 Andy Wingo <wingo at pobox dot com> +;; This function is from SLIB's strsrch.scm, and is in the public domain. +(define (find-string-from-port? str <input-port> . max-no-char) + "Looks for @var{str} in @var{<input-port>}, optionally within the +first @var{max-no-char} characters." + (set! max-no-char (if (null? max-no-char) #f (car max-no-char))) + (letrec + ((no-chars-read 0) + (peeked? #f) + (my-peek-char ; Return a peeked char or #f + (lambda () (and (or (not (number? max-no-char)) + (< no-chars-read max-no-char)) + (let ((c (peek-char <input-port>))) + (cond (peeked? c) + ((eof-object? c) #f) + ((procedure? max-no-char) + (set! peeked? #t) + (if (max-no-char c) #f c)) + ((eqv? max-no-char c) #f) + (else c)))))) + (next-char (lambda () (set! peeked? #f) (read-char <input-port>) + (set! no-chars-read (+ 1 no-chars-read)))) + (match-1st-char ; of the string str + (lambda () + (let ((c (my-peek-char))) + (and c + (begin (next-char) + (if (char=? c (string-ref str 0)) + (match-other-chars 1) + (match-1st-char))))))) + ;; There has been a partial match, up to the point pos-to-match + ;; (for example, str[0] has been found in the stream) + ;; Now look to see if str[pos-to-match] for would be found, too + (match-other-chars + (lambda (pos-to-match) + (if (>= pos-to-match (string-length str)) + no-chars-read ; the entire string has matched + (let ((c (my-peek-char))) + (and c + (if (not (char=? c (string-ref str pos-to-match))) + (backtrack 1 pos-to-match) + (begin (next-char) + (match-other-chars (+ 1 pos-to-match))))))))) + + ;; There had been a partial match, but then a wrong char showed up. + ;; Before discarding previously read (and matched) characters, we check + ;; to see if there was some smaller partial match. Note, characters read + ;; so far (which matter) are those of str[0..matched-substr-len - 1] + ;; In other words, we will check to see if there is such i>0 that + ;; substr(str,0,j) = substr(str,i,matched-substr-len) + ;; where j=matched-substr-len - i + (backtrack + (lambda (i matched-substr-len) + (let ((j (- matched-substr-len i))) + (if (<= j 0) + ;; backed off completely to the begining of str + (match-1st-char) + (let loop ((k 0)) + (if (>= k j) + (match-other-chars j) ; there was indeed a shorter match + (if (char=? (string-ref str k) + (string-ref str (+ i k))) + (loop (+ 1 k)) + (backtrack (+ 1 i) matched-substr-len)))))))) + ) + (match-1st-char))) diff --git a/module/sxml/transform.scm b/module/sxml/transform.scm new file mode 100644 index 000000000..c905456ba --- /dev/null +++ b/module/sxml/transform.scm @@ -0,0 +1,298 @@ +;;;; (sxml transform) -- pre- and post-order sxml transformation +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. +;;;; Written 2003 by Oleg Kiselyov <oleg at pobox dot com> as SXML-tree-trans.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;@heading SXML expression tree transformers +; +;@subheading Pre-Post-order traversal of a tree and creation of a new tree +;@smallexample +;pre-post-order:: <tree> x <bindings> -> <new-tree> +;@end smallexample +; where +;@smallexample +; <bindings> ::= (<binding> ...) +; <binding> ::= (<trigger-symbol> *preorder* . <handler>) | +; (<trigger-symbol> *macro* . <handler>) | +; (<trigger-symbol> <new-bindings> . <handler>) | +; (<trigger-symbol> . <handler>) +; <trigger-symbol> ::= XMLname | *text* | *default* +; <handler> :: <trigger-symbol> x [<tree>] -> <new-tree> +;@end smallexample +; +; The pre-post-order function visits the nodes and nodelists +; pre-post-order (depth-first). For each @code{<Node>} of the form +; @code{(@var{name} <Node> ...)}, it looks up an association with the +; given @var{name} among its @var{<bindings>}. If failed, +; @code{pre-post-order} tries to locate a @code{*default*} binding. It's +; an error if the latter attempt fails as well. Having found a binding, +; the @code{pre-post-order} function first checks to see if the binding +; is of the form +;@smallexample +; (<trigger-symbol> *preorder* . <handler>) +;@end smallexample +; +; If it is, the handler is 'applied' to the current node. Otherwise, the +; pre-post-order function first calls itself recursively for each child +; of the current node, with @var{<new-bindings>} prepended to the +; @var{<bindings>} in effect. The result of these calls is passed to the +; @var{<handler>} (along with the head of the current @var{<Node>}). To +; be more precise, the handler is _applied_ to the head of the current +; node and its processed children. The result of the handler, which +; should also be a @code{<tree>}, replaces the current @var{<Node>}. If +; the current @var{<Node>} is a text string or other atom, a special +; binding with a symbol @code{*text*} is looked up. +; +; A binding can also be of a form +;@smallexample +; (<trigger-symbol> *macro* . <handler>) +;@end smallexample +; This is equivalent to @code{*preorder*} described above. However, the +; result is re-processed again, with the current stylesheet. +;; +;;; Code: + +(define-module (sxml transform) + #:export (SRV:send-reply + foldts + post-order + pre-post-order + replace-range)) + +;; Upstream version: +; $Id: SXML-tree-trans.scm,v 1.8 2003/04/24 19:39:53 oleg Exp oleg $ + +; Like let* but allowing for multiple-value bindings +(define-macro (let*-values bindings . body) + (if (null? bindings) (cons 'begin body) + (apply + (lambda (vars initializer) + (let ((cont + (cons 'let*-values + (cons (cdr bindings) body)))) + (cond + ((not (pair? vars)) ; regular let case, a single var + `(let ((,vars ,initializer)) ,cont)) + ((null? (cdr vars)) ; single var, see the prev case + `(let ((,(car vars) ,initializer)) ,cont)) + (else ; the most generic case + `(call-with-values (lambda () ,initializer) + (lambda ,vars ,cont)))))) + (car bindings)))) + +(define (SRV:send-reply . fragments) + "Output the @var{fragments} to the current output port. + +The fragments are a list of strings, characters, numbers, thunks, +@code{#f}, @code{#t} -- and other fragments. The function traverses the +tree depth-first, writes out strings and characters, executes thunks, +and ignores @code{#f} and @code{'()}. The function returns @code{#t} if +anything was written at all; otherwise the result is @code{#f} If +@code{#t} occurs among the fragments, it is not written out but causes +the result of @code{SRV:send-reply} to be @code{#t}." + (let loop ((fragments fragments) (result #f)) + (cond + ((null? fragments) result) + ((not (car fragments)) (loop (cdr fragments) result)) + ((null? (car fragments)) (loop (cdr fragments) result)) + ((eq? #t (car fragments)) (loop (cdr fragments) #t)) + ((pair? (car fragments)) + (loop (cdr fragments) (loop (car fragments) result))) + ((procedure? (car fragments)) + ((car fragments)) + (loop (cdr fragments) #t)) + (else + (display (car fragments)) + (loop (cdr fragments) #t))))) + + + +;------------------------------------------------------------------------ +; Traversal of an SXML tree or a grove: +; a <Node> or a <Nodelist> +; +; A <Node> and a <Nodelist> are mutually-recursive datatypes that +; underlie the SXML tree: +; <Node> ::= (name . <Nodelist>) | "text string" +; An (ordered) set of nodes is just a list of the constituent nodes: +; <Nodelist> ::= (<Node> ...) +; Nodelists, and Nodes other than text strings are both lists. A +; <Nodelist> however is either an empty list, or a list whose head is +; not a symbol (an atom in general). A symbol at the head of a node is +; either an XML name (in which case it's a tag of an XML element), or +; an administrative name such as '@'. +; See SXPath.scm and SSAX.scm for more information on SXML. + + +;; see the commentary for docs +(define (pre-post-order tree bindings) + (let* ((default-binding (assq '*default* bindings)) + (text-binding (or (assq '*text* bindings) default-binding)) + (text-handler ; Cache default and text bindings + (and text-binding + (if (procedure? (cdr text-binding)) + (cdr text-binding) (cddr text-binding))))) + (let loop ((tree tree)) + (cond + ((null? tree) '()) + ((not (pair? tree)) + (let ((trigger '*text*)) + (if text-handler (text-handler trigger tree) + (error "Unknown binding for " trigger " and no default")))) + ((not (symbol? (car tree))) (map loop tree)) ; tree is a nodelist + (else ; tree is an SXML node + (let* ((trigger (car tree)) + (binding (or (assq trigger bindings) default-binding))) + (cond + ((not binding) + (error "Unknown binding for " trigger " and no default")) + ((not (pair? (cdr binding))) ; must be a procedure: handler + (apply (cdr binding) trigger (map loop (cdr tree)))) + ((eq? '*preorder* (cadr binding)) + (apply (cddr binding) tree)) + ((eq? '*macro* (cadr binding)) + (loop (apply (cddr binding) tree))) + (else ; (cadr binding) is a local binding + (apply (cddr binding) trigger + (pre-post-order (cdr tree) (append (cadr binding) bindings))) + )))))))) + +; post-order is a strict subset of pre-post-order without *preorder* +; (let alone *macro*) traversals. +; Now pre-post-order is actually faster than the old post-order. +; The function post-order is deprecated and is aliased below for +; backward compatibility. +(define post-order pre-post-order) + +;------------------------------------------------------------------------ +; Extended tree fold +; tree = atom | (node-name tree ...) +; +; foldts fdown fup fhere seed (Leaf str) = fhere seed str +; foldts fdown fup fhere seed (Nd kids) = +; fup seed $ foldl (foldts fdown fup fhere) (fdown seed) kids + +; procedure fhere: seed -> atom -> seed +; procedure fdown: seed -> node -> seed +; procedure fup: parent-seed -> last-kid-seed -> node -> seed +; foldts returns the final seed + +(define (foldts fdown fup fhere seed tree) + (cond + ((null? tree) seed) + ((not (pair? tree)) ; An atom + (fhere seed tree)) + (else + (let loop ((kid-seed (fdown seed tree)) (kids (cdr tree))) + (if (null? kids) + (fup seed kid-seed tree) + (loop (foldts fdown fup fhere kid-seed (car kids)) + (cdr kids))))))) + +;------------------------------------------------------------------------ +; Traverse a forest depth-first and cut/replace ranges of nodes. +; +; The nodes that define a range don't have to have the same immediate +; parent, don't have to be on the same level, and the end node of a +; range doesn't even have to exist. A replace-range procedure removes +; nodes from the beginning node of the range up to (but not including) +; the end node of the range. In addition, the beginning node of the +; range can be replaced by a node or a list of nodes. The range of +; nodes is cut while depth-first traversing the forest. If all +; branches of the node are cut a node is cut as well. The procedure +; can cut several non-overlapping ranges from a forest. + +; replace-range:: BEG-PRED x END-PRED x FOREST -> FOREST +; where +; type FOREST = (NODE ...) +; type NODE = Atom | (Name . FOREST) | FOREST +; +; The range of nodes is specified by two predicates, beg-pred and end-pred. +; beg-pred:: NODE -> #f | FOREST +; end-pred:: NODE -> #f | FOREST +; The beg-pred predicate decides on the beginning of the range. The node +; for which the predicate yields non-#f marks the beginning of the range +; The non-#f value of the predicate replaces the node. The value can be a +; list of nodes. The replace-range procedure then traverses the tree and skips +; all the nodes, until the end-pred yields non-#f. The value of the end-pred +; replaces the end-range node. The new end node and its brothers will be +; re-scanned. +; The predicates are evaluated pre-order. We do not descend into a node that +; is marked as the beginning of the range. + +(define (replace-range beg-pred end-pred forest) + + ; loop forest keep? new-forest + ; forest is the forest to traverse + ; new-forest accumulates the nodes we will keep, in the reverse + ; order + ; If keep? is #t, keep the curr node if atomic. If the node is not atomic, + ; traverse its children and keep those that are not in the skip range. + ; If keep? is #f, skip the current node if atomic. Otherwise, + ; traverse its children. If all children are skipped, skip the node + ; as well. + + (define (loop forest keep? new-forest) + (if (null? forest) (values (reverse new-forest) keep?) + (let ((node (car forest))) + (if keep? + (cond ; accumulate mode + ((beg-pred node) => ; see if the node starts the skip range + (lambda (repl-branches) ; if so, skip/replace the node + (loop (cdr forest) #f + (append (reverse repl-branches) new-forest)))) + ((not (pair? node)) ; it's an atom, keep it + (loop (cdr forest) keep? (cons node new-forest))) + (else + (let*-values + (((node?) (symbol? (car node))) ; or is it a nodelist? + ((new-kids keep?) ; traverse its children + (loop (if node? (cdr node) node) #t '()))) + (loop (cdr forest) keep? + (cons + (if node? (cons (car node) new-kids) new-kids) + new-forest))))) + ; skip mode + (cond + ((end-pred node) => ; end the skip range + (lambda (repl-branches) ; repl-branches will be re-scanned + (loop (append repl-branches (cdr forest)) #t + new-forest))) + ((not (pair? node)) ; it's an atom, skip it + (loop (cdr forest) keep? new-forest)) + (else + (let*-values + (((node?) (symbol? (car node))) ; or is it a nodelist? + ((new-kids keep?) ; traverse its children + (loop (if node? (cdr node) node) #f '()))) + (loop (cdr forest) keep? + (if (or keep? (pair? new-kids)) + (cons + (if node? (cons (car node) new-kids) new-kids) + new-forest) + new-forest) ; if all kids are skipped + )))))))) ; skip the node too + + (let*-values (((new-forest keep?) (loop forest #t '()))) + new-forest)) + +;;; arch-tag: 6c814f4b-38f7-42c1-b8ef-ce3447edefc7 +;;; transform.scm ends here diff --git a/module/sxml/upstream/COPYING.SSAX b/module/sxml/upstream/COPYING.SSAX new file mode 100644 index 000000000..9dc72b8ea --- /dev/null +++ b/module/sxml/upstream/COPYING.SSAX @@ -0,0 +1,2 @@ +The files in this directory are imported directly from upstream SSAX, +and are in the public domain. diff --git a/module/sxml/upstream/SSAX.scm b/module/sxml/upstream/SSAX.scm new file mode 100644 index 000000000..776e3119e --- /dev/null +++ b/module/sxml/upstream/SSAX.scm @@ -0,0 +1,3212 @@ +; Functional XML parsing framework: SAX/DOM and SXML parsers +; with support for XML Namespaces and validation +; +; This is a package of low-to-high level lexing and parsing procedures +; that can be combined to yield a SAX, a DOM, a validating parsers, or +; a parser intended for a particular document type. The procedures in +; the package can be used separately to tokenize or parse various +; pieces of XML documents. The package supports XML Namespaces, +; internal and external parsed entities, user-controlled handling of +; whitespace, and validation. This module therefore is intended to be +; a framework, a set of "Lego blocks" you can use to build a parser +; following any discipline and performing validation to any degree. As +; an example of the parser construction, this file includes a +; semi-validating SXML parser. + +; The present XML framework has a "sequential" feel of SAX yet a +; "functional style" of DOM. Like a SAX parser, the framework scans +; the document only once and permits incremental processing. An +; application that handles document elements in order can run as +; efficiently as possible. _Unlike_ a SAX parser, the framework does +; not require an application register stateful callbacks and surrender +; control to the parser. Rather, it is the application that can drive +; the framework -- calling its functions to get the current lexical or +; syntax element. These functions do not maintain or mutate any state +; save the input port. Therefore, the framework permits parsing of XML +; in a pure functional style, with the input port being a monad (or a +; linear, read-once parameter). + +; Besides the PORT, there is another monad -- SEED. Most of the +; middle- and high-level parsers are single-threaded through the +; seed. The functions of this framework do not process or affect the +; SEED in any way: they simply pass it around as an instance of an +; opaque datatype. User functions, on the other hand, can use the +; seed to maintain user's state, to accumulate parsing results, etc. A +; user can freely mix his own functions with those of the +; framework. On the other hand, the user may wish to instantiate a +; high-level parser: ssax:make-elem-parser or ssax:make-parser. In +; the latter case, the user must provide functions of specific +; signatures, which are called at predictable moments during the +; parsing: to handle character data, element data, or processing +; instructions (PI). The functions are always given the SEED, among +; other parameters, and must return the new SEED. + +; From a functional point of view, XML parsing is a combined +; pre-post-order traversal of a "tree" that is the XML document +; itself. This down-and-up traversal tells the user about an element +; when its start tag is encountered. The user is notified about the +; element once more, after all element's children have been +; handled. The process of XML parsing therefore is a fold over the +; raw XML document. Unlike a fold over trees defined in [1], the +; parser is necessarily single-threaded -- obviously as elements +; in a text XML document are laid down sequentially. The parser +; therefore is a tree fold that has been transformed to accept an +; accumulating parameter [1,2]. + +; Formally, the denotational semantics of the parser can be expressed +; as +; parser:: (Start-tag -> Seed -> Seed) -> +; (Start-tag -> Seed -> Seed -> Seed) -> +; (Char-Data -> Seed -> Seed) -> +; XML-text-fragment -> Seed -> Seed +; parser fdown fup fchar "<elem attrs> content </elem>" seed +; = fup "<elem attrs>" seed +; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed)) +; +; parser fdown fup fchar "char-data content" seed +; = parser fdown fup fchar "content" (fchar "char-data" seed) +; +; parser fdown fup fchar "elem-content content" seed +; = parser fdown fup fchar "content" ( +; parser fdown fup fchar "elem-content" seed) + +; Compare the last two equations with the left fold +; fold-left kons elem:list seed = fold-left kons list (kons elem seed) + +; The real parser created my ssax:make-parser is slightly more complicated, +; to account for processing instructions, entity references, namespaces, +; processing of document type declaration, etc. + + +; The XML standard document referred to in this module is +; http://www.w3.org/TR/1998/REC-xml-19980210.html +; +; The present file also defines a procedure that parses the text of an +; XML document or of a separate element into SXML, an +; S-expression-based model of an XML Information Set. SXML is also an +; Abstract Syntax Tree of an XML document. SXML is similar +; but not identical to DOM; SXML is particularly suitable for +; Scheme-based XML/HTML authoring, SXPath queries, and tree +; transformations. See SXML.html for more details. +; SXML is a term implementation of evaluation of the XML document [3]. +; The other implementation is context-passing. + +; The present frameworks fully supports the XML Namespaces Recommendation: +; http://www.w3.org/TR/REC-xml-names/ +; Other links: +; [1] Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold," +; Proc. ICFP'98, 1998, pp. 273-279. +; [2] Richard S. Bird, The promotion and accumulation strategies in +; transformational programming, ACM Trans. Progr. Lang. Systems, +; 6(4):487-504, October 1984. +; [3] Ralf Hinze, "Deriving Backtracking Monad Transformers," +; Functional Pearl. Proc ICFP'00, pp. 186-197. + +; IMPORT +; parser-error ssax:warn, see Handling of errors, below +; functions declared in files util.scm, input-parse.scm and look-for-str.scm +; char-encoding.scm for various platform-specific character-encoding functions. +; From SRFI-13: string-concatenate/shared and string-concatenate-reverse/shared +; If a particular implementation lacks SRFI-13 support, please +; include the file srfi-13-local.scm + +; Handling of errors +; This package relies on a function parser-error, which must be defined +; by a user of the package. The function has the following signature: +; parser-error PORT MESSAGE SPECIALISING-MSG* +; Many procedures of this package call 'parser-error' whenever a +; parsing, well-formedness or validation error is encountered. The +; first argument is a port, which typically points to the offending +; character or its neighborhood. Most of the Scheme systems let the +; user query a PORT for the current position. The MESSAGE argument +; indicates a failed XML production or a failed XML constraint. The +; latter is referred to by its anchor name in the XML Recommendation +; or XML Namespaces Recommendation. The parsing library (e.g., +; next-token, assert-curr-char) invoke 'parser-error' as well, in +; exactly the same way. See input-parse.scm for more details. +; See +; http://pair.com/lisovsky/download/parse-error.scm +; for an excellent example of such a redefined parser-error function. +; +; In addition, the present code invokes a function ssax:warn +; ssax:warn PORT MESSAGE SPECIALISING-MSG* +; to notify the user about warnings that are NOT errors but still +; may alert the user. +; +; Again, parser-error and ssax:warn are supposed to be defined by the +; user. However, if a run-test macro below is set to include +; self-tests, this present code does provide the definitions for these +; functions to allow tests to run. + +; Misc notes +; It seems it is highly desirable to separate tests out in a dedicated +; file. +; +; Jim Bender wrote on Mon, 9 Sep 2002 20:03:42 EDT on the SSAX-SXML +; mailing list (message A fine-grained "lego") +; The task was to record precise source location information, as PLT +; does with its current XML parser. That parser records the start and +; end location (filepos, line#, column#) for pi, elements, attributes, +; chuncks of "pcdata". +; As suggested above, though, in some cases I needed to be able force +; open an interface that did not yet exist. For instance, I added an +; "end-char-data-hook", which would be called at the end of char-data +; fragment. This returns a function of type (seed -> seed) which is +; invoked on the current seed only if read-char-data has indeed reached +; the end of a block of char data (after reading a new token. +; But the deepest interface that I needed to expose was that of reading +; attributes. In the official distribution, this is not even a separate +; function. Instead, it is embedded within SSAX:read-attributes. This +; required some small re-structuring as well. +; This definitely will not be to everyone's taste (nor needed by most). +; Certainly, the existing make-parser interface addresses most custom +; needs. And likely 80-90 lines of a "link specification" to create a +; parser from many tiny little lego blocks may please only a few, while +; appalling others. +; The code is available at http://celtic.benderweb.net/ssax-lego.plt or +; http://celtic.benderweb.net/ssax-lego.tar.gz +; In the examples directory, I provide: +; - a unit version of the make-parser interface, +; - a simple SXML parser using that interface, +; - an SXML parser which directly uses the "new lego", +; - a pseudo-SXML parser, which records source location information +; - and lastly a parser which returns the structures used in PLT's xml +; collection, with source location information + +; $Id: SSAX.scm,v 5.1 2004/07/07 16:02:30 sperber Exp $ +;^^^^^^^^^ + + + ; See the Makefile in the ../tests directory + ; (in particular, the rule vSSAX) for an example of how + ; to run this code on various Scheme systems. + ; See SSAX examples for many samples of using this code, + ; again, on a variety of Scheme systems. + ; See http://ssax.sf.net/ + + +; The following macro runs built-in test cases -- or does not run, +; depending on which of the two cases below you commented out +; Case 1: no tests: +;(define-macro run-test (lambda body '(begin #f))) +;(define-syntax run-test (syntax-rules () ((run-test . args) (begin #f)))) + +; Case 2: with tests. +; The following macro could've been defined just as +; (define-macro run-test (lambda body `(begin (display "\n-->Test\n") ,@body))) +; +; Instead, it's more involved, to make up for case-insensitivity of +; symbols on some Scheme systems. In Gambit, symbols are case +; sensitive: (eq? 'A 'a) is #f and (eq? 'Aa (string->symbol "Aa")) is +; #t. On some systems, symbols are case-insensitive and just the +; opposite is true. Therefore, we introduce a notation '"ASymbol" (a +; quoted string) that stands for a case-_sensitive_ ASymbol -- on any +; R5RS Scheme system. This notation is valid only within the body of +; run-test. +; The notation is implemented by scanning the run-test's +; body and replacing every occurrence of (quote "str") with the result +; of (string->symbol "str"). We can do such a replacement at macro-expand +; time (rather than at run time). + +; Here's the previous version of run-test, implemented as a low-level +; macro. +; (define-macro run-test +; (lambda body +; (define (re-write body) +; (cond +; ((vector? body) +; (list->vector (re-write (vector->list body)))) +; ((not (pair? body)) body) +; ((and (eq? 'quote (car body)) (pair? (cdr body)) +; (string? (cadr body))) +; (string->symbol (cadr body))) +; (else (cons (re-write (car body)) (re-write (cdr body)))))) +; (cons 'begin (re-write body)))) +; +; For portability, it is re-written as syntax-rules. The syntax-rules +; version is less powerful: for example, it can't handle +; (case x (('"Foo") (do-on-Foo))) whereas the low-level macro +; could correctly place a case-sensitive symbol at the right place. +; We also do not scan vectors (because we don't use them here). +; Twice-deep quasiquotes aren't handled either. +; Still, the syntax-rules version satisfies our immediate needs. +; Incidentally, I originally didn't believe that the macro below +; was at all possible. +; +; The macro is written in a continuation-passing style. A continuation +; typically has the following structure: (k-head ! . args) +; When the continuation is invoked, we expand into +; (k-head <computed-result> . arg). That is, the dedicated symbol ! +; is the placeholder for the result. +; +; It seems that the most modular way to write the run-test macro would +; be the following +; +; (define-syntax run-test +; (syntax-rules () +; ((run-test . ?body) +; (letrec-syntax +; ((scan-exp ; (scan-exp body k) +; (syntax-rules (quote quasiquote !) +; ((scan-exp (quote (hd . tl)) k) +; (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k))) +; ((scan-exp (quote x) (k-head ! . args)) +; (k-head +; (if (string? (quote x)) (string->symbol (quote x)) (quote x)) +; . args)) +; ((scan-exp (hd . tl) k) +; (scan-exp hd (do-tl ! scan-exp tl k))) +; ((scan-exp x (k-head ! . args)) +; (k-head x . args)))) +; (do-tl +; (syntax-rules (!) +; ((do-tl processed-hd fn () (k-head ! . args)) +; (k-head (processed-hd) . args)) +; ((do-tl processed-hd fn old-tl k) +; (fn old-tl (do-cons ! processed-hd k))))) +; ... +; (do-finish +; (syntax-rules () +; ((do-finish (new-body)) new-body) +; ((do-finish new-body) (begin . new-body)))) +; ... +; (scan-exp ?body (do-finish !)) +; )))) +; +; Alas, that doesn't work on all systems. We hit yet another dark +; corner of the R5RS macros. The reason is that run-test is used in +; the code below to introduce definitions. For example: +; (run-test +; (define (ssax:warn port msg . other-msg) +; (apply cerr (cons* nl "Warning: " msg other-msg))) +; ) +; This code expands to +; (begin +; (define (ssax:warn port msg . other-msg) ...)) +; so the definition gets spliced in into the top level. Right? +; Well, On Petite Chez Scheme it is so. However, many other systems +; don't like this approach. The reason is that the invocation of +; (run-test (define (ssax:warn port msg . other-msg) ...)) +; first expands into +; (letrec-syntax (...) +; (scan-exp ((define (ssax:warn port msg . other-msg) ...)) ...)) +; because of the presence of (letrec-syntax ...), the begin form that +; is generated eventually is no longer at the top level! The begin +; form in Scheme is an overloading of two distinct forms: top-level +; begin and the other begin. The forms have different rules: for example, +; (begin (define x 1)) is OK for a top-level begin but not OK for +; the other begin. Some Scheme systems see the that the macro +; (run-test ...) expands into (letrec-syntax ...) and decide right there +; that any further (begin ...) forms are NOT top-level begin forms. +; The only way out is to make sure all our macros are top-level. +; The best approach <sigh> seems to be to make run-test one huge +; top-level macro. + + +(define-syntax run-test + (syntax-rules (define) + ((run-test "scan-exp" (define vars body)) + (define vars (run-test "scan-exp" body))) + ((run-test "scan-exp" ?body) + (letrec-syntax + ((scan-exp ; (scan-exp body k) + (syntax-rules (quote quasiquote !) + ((scan-exp '() (k-head ! . args)) + (k-head '() . args)) + ((scan-exp (quote (hd . tl)) k) + (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k))) + ((scan-exp (quasiquote (hd . tl)) k) + (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k))) + ((scan-exp (quote x) (k-head ! . args)) + (k-head + (if (string? (quote x)) (string->symbol (quote x)) (quote x)) + . args)) + ((scan-exp (hd . tl) k) + (scan-exp hd (do-tl ! scan-exp tl k))) + ((scan-exp x (k-head ! . args)) + (k-head x . args)))) + (do-tl + (syntax-rules (!) + ((do-tl processed-hd fn () (k-head ! . args)) + (k-head (processed-hd) . args)) + ((do-tl processed-hd fn old-tl k) + (fn old-tl (do-cons ! processed-hd k))))) + (do-cons + (syntax-rules (!) + ((do-cons processed-tl processed-hd (k-head ! . args)) + (k-head (processed-hd . processed-tl) . args)))) + (do-wrap + (syntax-rules (!) + ((do-wrap val fn (k-head ! . args)) + (k-head (fn val) . args)))) + (do-finish + (syntax-rules () + ((do-finish new-body) new-body))) + + (scan-lit-lst ; scan literal list + (syntax-rules (quote unquote unquote-splicing !) + ((scan-lit-lst '() (k-head ! . args)) + (k-head '() . args)) + ((scan-lit-lst (quote (hd . tl)) k) + (do-tl quote scan-lit-lst ((hd . tl)) k)) + ((scan-lit-lst (unquote x) k) + (scan-exp x (do-wrap ! unquote k))) + ((scan-lit-lst (unquote-splicing x) k) + (scan-exp x (do-wrap ! unquote-splicing k))) + ((scan-lit-lst (quote x) (k-head ! . args)) + (k-head + ,(if (string? (quote x)) (string->symbol (quote x)) (quote x)) + . args)) + ((scan-lit-lst (hd . tl) k) + (scan-lit-lst hd (do-tl ! scan-lit-lst tl k))) + ((scan-lit-lst x (k-head ! . args)) + (k-head x . args)))) + ) + (scan-exp ?body (do-finish !)))) + ((run-test body ...) + (begin + (run-test "scan-exp" body) ...)) +)) + +;======================================================================== +; Data Types + +; TAG-KIND +; a symbol 'START, 'END, 'PI, 'DECL, 'COMMENT, 'CDSECT +; or 'ENTITY-REF that identifies a markup token + +; UNRES-NAME +; a name (called GI in the XML Recommendation) as given in an xml +; document for a markup token: start-tag, PI target, attribute name. +; If a GI is an NCName, UNRES-NAME is this NCName converted into +; a Scheme symbol. If a GI is a QName, UNRES-NAME is a pair of +; symbols: (PREFIX . LOCALPART) + +; RES-NAME +; An expanded name, a resolved version of an UNRES-NAME. +; For an element or an attribute name with a non-empty namespace URI, +; RES-NAME is a pair of symbols, (URI-SYMB . LOCALPART). +; Otherwise, it's a single symbol. + +; ELEM-CONTENT-MODEL +; A symbol: +; ANY - anything goes, expect an END tag. +; EMPTY-TAG - no content, and no END-tag is coming +; EMPTY - no content, expect the END-tag as the next token +; PCDATA - expect character data only, and no children elements +; MIXED +; ELEM-CONTENT + +; URI-SYMB +; A symbol representing a namespace URI -- or other symbol chosen +; by the user to represent URI. In the former case, +; URI-SYMB is created by %-quoting of bad URI characters and +; converting the resulting string into a symbol. + +; NAMESPACES +; A list representing namespaces in effect. An element of the list +; has one of the following forms: +; (PREFIX URI-SYMB . URI-SYMB) or +; (PREFIX USER-PREFIX . URI-SYMB) +; USER-PREFIX is a symbol chosen by the user +; to represent the URI. +; (#f USER-PREFIX . URI-SYMB) +; Specification of the user-chosen prefix and a URI-SYMBOL. +; (*DEFAULT* USER-PREFIX . URI-SYMB) +; Declaration of the default namespace +; (*DEFAULT* #f . #f) +; Un-declaration of the default namespace. This notation +; represents overriding of the previous declaration +; A NAMESPACES list may contain several elements for the same PREFIX. +; The one closest to the beginning of the list takes effect. + +; ATTLIST +; An ordered collection of (NAME . VALUE) pairs, where NAME is +; a RES-NAME or an UNRES-NAME. The collection is an ADT + +; STR-HANDLER +; A procedure of three arguments: STRING1 STRING2 SEED +; returning a new SEED +; The procedure is supposed to handle a chunk of character data +; STRING1 followed by a chunk of character data STRING2. +; STRING2 is a short string, often "\n" and even "" + +; ENTITIES +; An assoc list of pairs: +; (named-entity-name . named-entity-body) +; where named-entity-name is a symbol under which the entity was +; declared, named-entity-body is either a string, or +; (for an external entity) a thunk that will return an +; input port (from which the entity can be read). +; named-entity-body may also be #f. This is an indication that a +; named-entity-name is currently being expanded. A reference to +; this named-entity-name will be an error: violation of the +; WFC nonrecursion. + +; XML-TOKEN -- a record + +; In Gambit, you can use the following declaration: +; (define-structure xml-token kind head) +; The following declaration is "standard" as it follows SRFI-9: +;;(define-record-type xml-token (make-xml-token kind head) xml-token? +;; (kind xml-token-kind) +;; (head xml-token-head) ) +; No field mutators are declared as SSAX is a pure functional parser +; +; But to make the code more portable, we define xml-token simply as +; a pair. It suffices for us. Furthermore, xml-token-kind and xml-token-head +; can be defined as simple procedures. However, they are declared as +; macros below for efficiency. + +(define (make-xml-token kind head) (cons kind head)) +(define xml-token? pair?) +(define-syntax xml-token-kind + (syntax-rules () ((xml-token-kind token) (car token)))) +(define-syntax xml-token-head + (syntax-rules () ((xml-token-head token) (cdr token)))) + +; (define-macro xml-token-kind (lambda (token) `(car ,token))) +; (define-macro xml-token-head (lambda (token) `(cdr ,token))) + +; This record represents a markup, which is, according to the XML +; Recommendation, "takes the form of start-tags, end-tags, empty-element tags, +; entity references, character references, comments, CDATA section delimiters, +; document type declarations, and processing instructions." +; +; kind -- a TAG-KIND +; head -- an UNRES-NAME. For xml-tokens of kinds 'COMMENT and +; 'CDSECT, the head is #f +; +; For example, +; <P> => kind='START, head='P +; </P> => kind='END, head='P +; <BR/> => kind='EMPTY-EL, head='BR +; <!DOCTYPE OMF ...> => kind='DECL, head='DOCTYPE +; <?xml version="1.0"?> => kind='PI, head='xml +; &my-ent; => kind = 'ENTITY-REF, head='my-ent +; +; Character references are not represented by xml-tokens as these references +; are transparently resolved into the corresponding characters. +; + + + +; XML-DECL -- a record + +; The following is Gambit-specific, see below for a portable declaration +;(define-structure xml-decl elems entities notations) + +; The record represents a datatype of an XML document: the list of +; declared elements and their attributes, declared notations, list of +; replacement strings or loading procedures for parsed general +; entities, etc. Normally an xml-decl record is created from a DTD or +; an XML Schema, although it can be created and filled in in many other +; ways (e.g., loaded from a file). +; +; elems: an (assoc) list of decl-elem or #f. The latter instructs +; the parser to do no validation of elements and attributes. +; +; decl-elem: declaration of one element: +; (elem-name elem-content decl-attrs) +; elem-name is an UNRES-NAME for the element. +; elem-content is an ELEM-CONTENT-MODEL. +; decl-attrs is an ATTLIST, of (ATTR-NAME . VALUE) associations +; !!!This element can declare a user procedure to handle parsing of an +; element (e.g., to do a custom validation, or to build a hash of +; IDs as they're encountered). +; +; decl-attr: an element of an ATTLIST, declaration of one attribute +; (attr-name content-type use-type default-value) +; attr-name is an UNRES-NAME for the declared attribute +; content-type is a symbol: CDATA, NMTOKEN, NMTOKENS, ... +; or a list of strings for the enumerated type. +; use-type is a symbol: REQUIRED, IMPLIED, FIXED +; default-value is a string for the default value, or #f if not given. +; +; + +; see a function make-empty-xml-decl to make a XML declaration entry +; suitable for a non-validating parsing. + + +;------------------------- +; Utilities + +; ssax:warn PORT MESSAGE SPECIALISING-MSG* +; to notify the user about warnings that are NOT errors but still +; may alert the user. +; Result is unspecified. +; We need to define the function to allow the self-tests to run. +; Normally the definition of ssax:warn is to be provided by the user. +(run-test + (define (ssax:warn port msg . other-msg) + (apply cerr (cons* nl "Warning: " msg other-msg))) +) + + +; parser-error PORT MESSAGE SPECIALISING-MSG* +; to let the user know of a syntax error or a violation of a +; well-formedness or validation constraint. +; Result is unspecified. +; We need to define the function to allow the self-tests to run. +; Normally the definition of parser-error is to be provided by the user. +(run-test + (define (parser-error port msg . specializing-msgs) + (apply error (cons msg specializing-msgs))) +) + +; The following is a function that is often used in validation tests, +; to make sure that the computed result matches the expected one. +; This function is a standard equal? predicate with one exception. +; On Scheme systems where (string->symbol "A") and a symbol A +; are the same, equal_? is precisely equal? +; On other Scheme systems, we compare symbols disregarding their case. +; Since this function is used only in tests, we don't have to +; strive to make it efficient. +(run-test + (define (equal_? e1 e2) + (if (eq? 'A (string->symbol "A")) (equal? e1 e2) + (cond + ((symbol? e1) + (and (symbol? e2) + (string-ci=? (symbol->string e1) (symbol->string e2)))) + ((pair? e1) + (and (pair? e2) + (equal_? (car e1) (car e2)) (equal_? (cdr e1) (cdr e2)))) + ((vector? e1) + (and (vector? e2) (equal_? (vector->list e1) (vector->list e2)))) + (else + (equal? e1 e2))))) +) + +; The following function, which is often used in validation tests, +; lets us conveniently enter newline, CR and tab characters in a character +; string. +; unesc-string: ESC-STRING -> STRING +; where ESC-STRING is a character string that may contain +; %n -- for #\newline +; %r -- for #\return +; %t -- for #\tab +; %% -- for #\% +; +; The result of unesc-string is a character string with all %-combinations +; above replaced with their character equivalents + +(run-test + (define (unesc-string str) + (call-with-input-string str + (lambda (port) + (let loop ((frags '())) + (let* ((token (next-token '() '(#\% *eof*) "unesc-string" port)) + (cterm (read-char port)) + (frags (cons token frags))) + (if (eof-object? cterm) (string-concatenate-reverse/shared frags) + (let ((cchar (read-char port))) ; char after #\% + (if (eof-object? cchar) + (error "unexpected EOF after reading % in unesc-string:" str) + (loop + (cons + (case cchar + ((#\n) (string #\newline)) + ((#\r) (string char-return)) + ((#\t) (string char-tab)) + ((#\%) "%") + (else (error "bad %-char in unesc-string:" cchar))) + frags)))))))))) +) + + +; Test if a string is made of only whitespace +; An empty string is considered made of whitespace as well +(define (string-whitespace? str) + (let ((len (string-length str))) + (cond + ((zero? len) #t) + ((= 1 len) (char-whitespace? (string-ref str 0))) + ((= 2 len) (and (char-whitespace? (string-ref str 0)) + (char-whitespace? (string-ref str 1)))) + (else + (let loop ((i 0)) + (or (>= i len) + (and (char-whitespace? (string-ref str i)) + (loop (inc i))))))))) + +; Find val in alist +; Return (values found-el remaining-alist) or +; (values #f alist) + +(define (assq-values val alist) + (let loop ((alist alist) (scanned '())) + (cond + ((null? alist) (values #f scanned)) + ((equal? val (caar alist)) + (values (car alist) (append scanned (cdr alist)))) + (else + (loop (cdr alist) (cons (car alist) scanned)))))) + +; From SRFI-1 +(define (fold-right kons knil lis1) + (let recur ((lis lis1)) + (if (null? lis) knil + (let ((head (car lis))) + (kons head (recur (cdr lis))))))) + +; Left fold combinator for a single list +(define (fold kons knil lis1) + (let lp ((lis lis1) (ans knil)) + (if (null? lis) ans + (lp (cdr lis) (kons (car lis) ans))))) + + + +;======================================================================== +; Lower-level parsers and scanners +; +; They deal with primitive lexical units (Names, whitespaces, tags) +; and with pieces of more generic productions. Most of these parsers +; must be called in appropriate context. For example, ssax:complete-start-tag +; must be called only when the start-tag has been detected and its GI +; has been read. + +;------------------------------------------------------------------------ +; Low-level parsing code + +; Skip the S (whitespace) production as defined by +; [3] S ::= (#x20 | #x9 | #xD | #xA) +; The procedure returns the first not-whitespace character it +; encounters while scanning the PORT. This character is left +; on the input stream. + +(define ssax:S-chars (map ascii->char '(32 10 9 13))) + +(define (ssax:skip-S port) + (skip-while ssax:S-chars port)) + + +; Read a Name lexem and return it as string +; [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' +; | CombiningChar | Extender +; [5] Name ::= (Letter | '_' | ':') (NameChar)* +; +; This code supports the XML Namespace Recommendation REC-xml-names, +; which modifies the above productions as follows: +; +; [4] NCNameChar ::= Letter | Digit | '.' | '-' | '_' +; | CombiningChar | Extender +; [5] NCName ::= (Letter | '_') (NCNameChar)* +; As the Rec-xml-names says, +; "An XML document conforms to this specification if all other tokens +; [other than element types and attribute names] in the document which +; are required, for XML conformance, to match the XML production for +; Name, match this specification's production for NCName." +; Element types and attribute names must match the production QName, +; defined below. + +; Check to see if a-char may start a NCName +(define (ssax:ncname-starting-char? a-char) + (and (char? a-char) + (or + (char-alphabetic? a-char) + (char=? #\_ a-char)))) + + +; Read a NCName starting from the current position in the PORT and +; return it as a symbol. +(define (ssax:read-NCName port) + (let ((first-char (peek-char port))) + (or (ssax:ncname-starting-char? first-char) + (parser-error port "XMLNS [4] for '" first-char "'"))) + (string->symbol + (next-token-of + (lambda (c) + (cond + ((eof-object? c) #f) + ((char-alphabetic? c) c) + ((string-index "0123456789.-_" c) c) + (else #f))) + port))) + +; Read a (namespace-) Qualified Name, QName, from the current +; position in the PORT. +; From REC-xml-names: +; [6] QName ::= (Prefix ':')? LocalPart +; [7] Prefix ::= NCName +; [8] LocalPart ::= NCName +; Return: an UNRES-NAME +(define (ssax:read-QName port) + (let ((prefix-or-localpart (ssax:read-NCName port))) + (case (peek-char port) + ((#\:) ; prefix was given after all + (read-char port) ; consume the colon + (cons prefix-or-localpart (ssax:read-NCName port))) + (else prefix-or-localpart) ; Prefix was omitted + ))) + +; The prefix of the pre-defined XML namespace +(define ssax:Prefix-XML (string->symbol "xml")) + +(run-test + (assert (eq? '_ + (call-with-input-string "_" ssax:read-NCName))) + (assert (eq? '_ + (call-with-input-string "_" ssax:read-QName))) + (assert (eq? (string->symbol "_abc_") + (call-with-input-string "_abc_;" ssax:read-NCName))) + (assert (eq? (string->symbol "_abc_") + (call-with-input-string "_abc_;" ssax:read-QName))) + (assert (eq? (string->symbol "_a.b") + (call-with-input-string "_a.b " ssax:read-QName))) + (assert (equal? (cons (string->symbol "_a.b") (string->symbol "d.1-ef-")) + (call-with-input-string "_a.b:d.1-ef-;" ssax:read-QName))) + (assert (equal? (cons (string->symbol "a") (string->symbol "b")) + (call-with-input-string "a:b:c" ssax:read-QName))) + + (assert (failed? (call-with-input-string ":abc" ssax:read-NCName))) + (assert (failed? (call-with-input-string "1:bc" ssax:read-NCName))) +) + +; Compare one RES-NAME or an UNRES-NAME with the other. +; Return a symbol '<, '>, or '= depending on the result of +; the comparison. +; Names without PREFIX are always smaller than those with the PREFIX. +(define name-compare + (letrec ((symbol-compare + (lambda (symb1 symb2) + (cond + ((eq? symb1 symb2) '=) + ((string<? (symbol->string symb1) (symbol->string symb2)) + '<) + (else '>))))) + (lambda (name1 name2) + (cond + ((symbol? name1) (if (symbol? name2) (symbol-compare name1 name2) + '<)) + ((symbol? name2) '>) + ((eq? name2 ssax:largest-unres-name) '<) + ((eq? name1 ssax:largest-unres-name) '>) + ((eq? (car name1) (car name2)) ; prefixes the same + (symbol-compare (cdr name1) (cdr name2))) + (else (symbol-compare (car name1) (car name2))))))) + +; An UNRES-NAME that is postulated to be larger than anything that can occur in +; a well-formed XML document. +; name-compare enforces this postulate. +(define ssax:largest-unres-name (cons + (string->symbol "#LARGEST-SYMBOL") + (string->symbol "#LARGEST-SYMBOL"))) + +(run-test + (assert (eq? '= (name-compare 'ABC 'ABC))) + (assert (eq? '< (name-compare 'ABC 'ABCD))) + (assert (eq? '> (name-compare 'XB 'ABCD))) + (assert (eq? '> (name-compare '(HTML . PRE) 'PRE))) + (assert (eq? '< (name-compare 'HTML '(HTML . PRE)))) + (assert (eq? '= (name-compare '(HTML . PRE) '(HTML . PRE)))) + (assert (eq? '< (name-compare '(HTML . PRE) '(XML . PRE)))) + (assert (eq? '> (name-compare '(HTML . PRE) '(HTML . P)))) + (assert (eq? '< (name-compare '(HTML . PRE) ssax:largest-unres-name))) + (assert (eq? '< (name-compare '(ZZZZ . ZZZ) ssax:largest-unres-name))) + (assert (eq? '> (name-compare ssax:largest-unres-name '(ZZZZ . ZZZ) ))) +) + + + +; procedure: ssax:read-markup-token PORT +; This procedure starts parsing of a markup token. The current position +; in the stream must be #\<. This procedure scans enough of the input stream +; to figure out what kind of a markup token it is seeing. The procedure returns +; an xml-token structure describing the token. Note, generally reading +; of the current markup is not finished! In particular, no attributes of +; the start-tag token are scanned. +; +; Here's a detailed break out of the return values and the position in the PORT +; when that particular value is returned: +; PI-token: only PI-target is read. +; To finish the Processing Instruction and disregard it, +; call ssax:skip-pi. ssax:read-attributes may be useful +; as well (for PIs whose content is attribute-value +; pairs) +; END-token: The end tag is read completely; the current position +; is right after the terminating #\> character. +; COMMENT is read and skipped completely. The current position +; is right after "-->" that terminates the comment. +; CDSECT The current position is right after "<!CDATA[" +; Use ssax:read-cdata-body to read the rest. +; DECL We have read the keyword (the one that follows "<!") +; identifying this declaration markup. The current +; position is after the keyword (usually a +; whitespace character) +; +; START-token We have read the keyword (GI) of this start tag. +; No attributes are scanned yet. We don't know if this +; tag has an empty content either. +; Use ssax:complete-start-tag to finish parsing of +; the token. + +(define ssax:read-markup-token ; procedure ssax:read-markup-token port + (let () + ; we have read "<!-". Skip through the rest of the comment + ; Return the 'COMMENT token as an indication we saw a comment + ; and skipped it. + (define (skip-comment port) + (assert-curr-char '(#\-) "XML [15], second dash" port) + (if (not (find-string-from-port? "-->" port)) + (parser-error port "XML [15], no -->")) + (make-xml-token 'COMMENT #f)) + + ; we have read "<![" that must begin a CDATA section + (define (read-cdata port) + (assert (string=? "CDATA[" (read-string 6 port))) + (make-xml-token 'CDSECT #f)) + + (lambda (port) + (assert-curr-char '(#\<) "start of the token" port) + (case (peek-char port) + ((#\/) (read-char port) + (begin0 (make-xml-token 'END (ssax:read-QName port)) + (ssax:skip-S port) + (assert-curr-char '(#\>) "XML [42]" port))) + ((#\?) (read-char port) (make-xml-token 'PI (ssax:read-NCName port))) + ((#\!) + (case (peek-next-char port) + ((#\-) (read-char port) (skip-comment port)) + ((#\[) (read-char port) (read-cdata port)) + (else (make-xml-token 'DECL (ssax:read-NCName port))))) + (else (make-xml-token 'START (ssax:read-QName port))))) +)) + + +; The current position is inside a PI. Skip till the rest of the PI +(define (ssax:skip-pi port) + (if (not (find-string-from-port? "?>" port)) + (parser-error port "Failed to find ?> terminating the PI"))) + + +; The current position is right after reading the PITarget. We read the +; body of PI and return is as a string. The port will point to the +; character right after '?>' combination that terminates PI. +; [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' + +(define (ssax:read-pi-body-as-string port) + (ssax:skip-S port) ; skip WS after the PI target name + (string-concatenate/shared + (let loop () + (let ((pi-fragment + (next-token '() '(#\?) "reading PI content" port))) + (if (eqv? #\> (peek-next-char port)) + (begin + (read-char port) + (cons pi-fragment '())) + (cons* pi-fragment "?" (loop))))))) + +(run-test + (assert (equal? "p1 content " + (call-with-input-string "<?pi1 p1 content ?>" + (lambda (port) + (ssax:read-markup-token port) + (ssax:read-pi-body-as-string port))))) + (assert (equal? "pi2? content? ?" + (call-with-input-string "<?pi2 pi2? content? ??>" + (lambda (port) + (ssax:read-markup-token port) + (ssax:read-pi-body-as-string port))))) +) + +;(define (ssax:read-pi-body-as-name-values port) + +; The current pos in the port is inside an internal DTD subset +; (e.g., after reading #\[ that begins an internal DTD subset) +; Skip until the "]>" combination that terminates this DTD +(define (ssax:skip-internal-dtd port) + (if (not (find-string-from-port? "]>" port)) + (parser-error port + "Failed to find ]> terminating the internal DTD subset"))) + + +; procedure+: ssax:read-cdata-body PORT STR-HANDLER SEED +; +; This procedure must be called after we have read a string "<![CDATA[" +; that begins a CDATA section. The current position must be the first +; position of the CDATA body. This function reads _lines_ of the CDATA +; body and passes them to a STR-HANDLER, a character data consumer. +; +; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. +; The first STRING1 argument to STR-HANDLER never contains a newline. +; The second STRING2 argument often will. On the first invocation of +; the STR-HANDLER, the seed is the one passed to ssax:read-cdata-body +; as the third argument. The result of this first invocation will be +; passed as the seed argument to the second invocation of the line +; consumer, and so on. The result of the last invocation of the +; STR-HANDLER is returned by the ssax:read-cdata-body. Note a +; similarity to the fundamental 'fold' iterator. +; +; Within a CDATA section all characters are taken at their face value, +; with only three exceptions: +; CR, LF, and CRLF are treated as line delimiters, and passed +; as a single #\newline to the STR-HANDLER +; "]]>" combination is the end of the CDATA section. +; > is treated as an embedded #\> character +; Note, < and & are not specially recognized (and are not expanded)! + +(define ssax:read-cdata-body + (let ((cdata-delimiters (list char-return #\newline #\] #\&))) + + (lambda (port str-handler seed) + (let loop ((seed seed)) + (let ((fragment (next-token '() cdata-delimiters + "reading CDATA" port))) + ; that is, we're reading the char after the 'fragment' + (case (read-char port) + ((#\newline) (loop (str-handler fragment nl seed))) + ((#\]) + (if (not (eqv? (peek-char port) #\])) + (loop (str-handler fragment "]" seed)) + (let check-after-second-braket + ((seed (if (string-null? fragment) seed + (str-handler fragment "" seed)))) + (case (peek-next-char port) ; after the second bracket + ((#\>) (read-char port) seed) ; we have read "]]>" + ((#\]) (check-after-second-braket + (str-handler "]" "" seed))) + (else (loop (str-handler "]]" "" seed))))))) + ((#\&) ; Note that #\& within CDATA may stand for itself + (let ((ent-ref ; it does not have to start an entity ref + (next-token-of (lambda (c) + (and (not (eof-object? c)) (char-alphabetic? c) c)) port))) + (cond ; ">" is to be replaced with #\> + ((and (string=? "gt" ent-ref) (eqv? (peek-char port) #\;)) + (read-char port) + (loop (str-handler fragment ">" seed))) + (else + (loop + (str-handler ent-ref "" + (str-handler fragment "&" seed))))))) + (else ; Must be CR: if the next char is #\newline, skip it + (if (eqv? (peek-char port) #\newline) (read-char port)) + (loop (str-handler fragment nl seed))) + )))))) + +; a few lines of validation code +(run-test (letrec + ((consumer (lambda (fragment foll-fragment seed) + (cons* (if (equal? foll-fragment (string #\newline)) + " NL" foll-fragment) fragment seed))) + (test (lambda (str expected-result) + (newline) (display "body: ") (write str) + (newline) (display "Result: ") + (let ((result + (reverse + (call-with-input-string (unesc-string str) + (lambda (port) (ssax:read-cdata-body port consumer '())) + )))) + (write result) + (assert (equal? result expected-result))))) + ) + (test "]]>" '()) + (test "abcd]]>" '("abcd" "")) + (test "abcd]]]>" '("abcd" "" "]" "")) + (test "abcd]]]]>" '("abcd" "" "]" "" "]" "")) + (test "abcd]]]]]>" '("abcd" "" "]" "" "]" "" "]" "")) + (test "abcd]]]a]]>" '("abcd" "" "]" "" "]]" "" "a" "")) + (test "abc%r%ndef%n]]>" '("abc" " NL" "def" " NL")) + (test "%r%n%r%n]]>" '("" " NL" "" " NL")) + (test "%r%n%r%na]]>" '("" " NL" "" " NL" "a" "")) + (test "%r%r%r%na]]>" '("" " NL" "" " NL" "" " NL" "a" "")) + (test "abc&!!!]]>" '("abc" "&" "" "" "!!!" "")) + (test "abc]]>>&]]]>and]]>" + '("abc" "" "]]" "" "" ">" "" "&" "gt" "" "" "&" "amp" "" ";" "" "]" "" + "]]" "" "" ">" "and" "")) +)) + + +; procedure+: ssax:read-char-ref PORT +; +; [66] CharRef ::= '&#' [0-9]+ ';' +; | '&#x' [0-9a-fA-F]+ ';' +; +; This procedure must be called after we we have read "&#" +; that introduces a char reference. +; The procedure reads this reference and returns the corresponding char +; The current position in PORT will be after ";" that terminates +; the char reference +; Faults detected: +; WFC: XML-Spec.html#wf-Legalchar +; +; According to Section "4.1 Character and Entity References" +; of the XML Recommendation: +; "[Definition: A character reference refers to a specific character +; in the ISO/IEC 10646 character set, for example one not directly +; accessible from available input devices.]" +; Therefore, we use a ucscode->string function to convert a character +; code into the character -- *regardless* of the current character +; encoding of the input stream. + +(define (ssax:read-char-ref port) + (let* ((base + (cond ((eqv? (peek-char port) #\x) (read-char port) 16) + (else 10))) + (name (next-token '() '(#\;) "XML [66]" port)) + (char-code (string->number name base))) + (read-char port) ; read the terminating #\; char + (if (integer? char-code) (ucscode->string char-code) + (parser-error port "[wf-Legalchar] broken for '" name "'")))) + + +; procedure+: ssax:handle-parsed-entity PORT NAME ENTITIES +; CONTENT-HANDLER STR-HANDLER SEED +; +; Expand and handle a parsed-entity reference +; port - a PORT +; name - the name of the parsed entity to expand, a symbol +; entities - see ENTITIES +; content-handler -- procedure PORT ENTITIES SEED +; that is supposed to return a SEED +; str-handler - a STR-HANDLER. It is called if the entity in question +; turns out to be a pre-declared entity +; +; The result is the one returned by CONTENT-HANDLER or STR-HANDLER +; Faults detected: +; WFC: XML-Spec.html#wf-entdeclared +; WFC: XML-Spec.html#norecursion + +(define ssax:predefined-parsed-entities + `( + (,(string->symbol "amp") . "&") + (,(string->symbol "lt") . "<") + (,(string->symbol "gt") . ">") + (,(string->symbol "apos") . "'") + (,(string->symbol "quot") . "\""))) + +(define (ssax:handle-parsed-entity port name entities + content-handler str-handler seed) + (cond ; First we check the list of the declared entities + ((assq name entities) => + (lambda (decl-entity) + (let ((ent-body (cdr decl-entity)) ; mark the list to prevent recursion + (new-entities (cons (cons name #f) entities))) + (cond + ((string? ent-body) + (call-with-input-string ent-body + (lambda (port) (content-handler port new-entities seed)))) + ((procedure? ent-body) + (let ((port (ent-body))) + (begin0 + (content-handler port new-entities seed) + (close-input-port port)))) + (else + (parser-error port "[norecursion] broken for " name)))))) + ((assq name ssax:predefined-parsed-entities) + => (lambda (decl-entity) + (str-handler (cdr decl-entity) "" seed))) + (else (parser-error port "[wf-entdeclared] broken for " name)))) + + + +; The ATTLIST Abstract Data Type +; Currently is implemented as an assoc list sorted in the ascending +; order of NAMES. + +(define (make-empty-attlist) '()) + +; Add a name-value pair to the existing attlist preserving the order +; Return the new list, in the sorted ascending order. +; Return #f if a pair with the same name already exists in the attlist + +(define (attlist-add attlist name-value) + (if (null? attlist) (cons name-value attlist) + (case (name-compare (car name-value) (caar attlist)) + ((=) #f) + ((<) (cons name-value attlist)) + (else (cons (car attlist) (attlist-add (cdr attlist) name-value))) + ))) + +(define attlist-null? null?) + +; Given an non-null attlist, return a pair of values: the top and the rest +(define (attlist-remove-top attlist) + (values (car attlist) (cdr attlist))) + +(define (attlist->alist attlist) attlist) +(define attlist-fold fold) + +; procedure+: ssax:read-attributes PORT ENTITIES +; +; This procedure reads and parses a production Attribute* +; [41] Attribute ::= Name Eq AttValue +; [10] AttValue ::= '"' ([^<&"] | Reference)* '"' +; | "'" ([^<&'] | Reference)* "'" +; [25] Eq ::= S? '=' S? +; +; +; The procedure returns an ATTLIST, of Name (as UNRES-NAME), Value (as string) +; pairs. The current character on the PORT is a non-whitespace character +; that is not an ncname-starting character. +; +; Note the following rules to keep in mind when reading an 'AttValue' +; "Before the value of an attribute is passed to the application +; or checked for validity, the XML processor must normalize it as follows: +; - a character reference is processed by appending the referenced +; character to the attribute value +; - an entity reference is processed by recursively processing the +; replacement text of the entity [see ENTITIES] +; [named entities amp lt gt quot apos are assumed pre-declared] +; - a whitespace character (#x20, #xD, #xA, #x9) is processed by appending #x20 +; to the normalized value, except that only a single #x20 is appended for a +; "#xD#xA" sequence that is part of an external parsed entity or the +; literal entity value of an internal parsed entity +; - other characters are processed by appending them to the normalized value " +; +; +; Faults detected: +; WFC: XML-Spec.html#CleanAttrVals +; WFC: XML-Spec.html#uniqattspec + +(define ssax:read-attributes ; ssax:read-attributes port entities + (let ((value-delimeters (append ssax:S-chars '(#\< #\&)))) + ; Read the AttValue from the PORT up to the delimiter + ; (which can be a single or double-quote character, + ; or even a symbol *eof*) + ; 'prev-fragments' is the list of string fragments, accumulated + ; so far, in reverse order. + ; Return the list of fragments with newly read fragments + ; prepended. + (define (read-attrib-value delimiter port entities prev-fragments) + (let* ((new-fragments + (cons (next-token '() (cons delimiter value-delimeters) + "XML [10]" port) + prev-fragments)) + (cterm (read-char port))) + (cond + ((or (eof-object? cterm) (eqv? cterm delimiter)) + new-fragments) + ((eqv? cterm char-return) ; treat a CR and CRLF as a LF + (if (eqv? (peek-char port) #\newline) (read-char port)) + (read-attrib-value delimiter port entities + (cons " " new-fragments))) + ((memv cterm ssax:S-chars) + (read-attrib-value delimiter port entities + (cons " " new-fragments))) + ((eqv? cterm #\&) + (cond + ((eqv? (peek-char port) #\#) + (read-char port) + (read-attrib-value delimiter port entities + (cons (ssax:read-char-ref port) new-fragments))) + (else + (read-attrib-value delimiter port entities + (read-named-entity port entities new-fragments))))) + (else (parser-error port "[CleanAttrVals] broken"))))) + + ; we have read "&" that introduces a named entity reference. + ; read this reference and return the result of + ; normalizing of the corresponding string + ; (that is, read-attrib-value is applied to the replacement + ; text of the entity) + ; The current position will be after ";" that terminates + ; the entity reference + (define (read-named-entity port entities fragments) + (let ((name (ssax:read-NCName port))) + (assert-curr-char '(#\;) "XML [68]" port) + (ssax:handle-parsed-entity port name entities + (lambda (port entities fragments) + (read-attrib-value '*eof* port entities fragments)) + (lambda (str1 str2 fragments) + (if (equal? "" str2) (cons str1 fragments) + (cons* str2 str1 fragments))) + fragments))) + + (lambda (port entities) + (let loop ((attr-list (make-empty-attlist))) + (if (not (ssax:ncname-starting-char? (ssax:skip-S port))) attr-list + (let ((name (ssax:read-QName port))) + (ssax:skip-S port) + (assert-curr-char '(#\=) "XML [25]" port) + (ssax:skip-S port) + (let ((delimiter + (assert-curr-char '(#\' #\" ) "XML [10]" port))) + (loop + (or (attlist-add attr-list + (cons name + (string-concatenate-reverse/shared + (read-attrib-value delimiter port entities + '())))) + (parser-error port "[uniqattspec] broken for " name)))))))) +)) + +; a few lines of validation code +(run-test (letrec + ((test (lambda (str decl-entities expected-res) + (newline) (display "input: ") (write str) + (newline) (display "Result: ") + (let ((result + (call-with-input-string (unesc-string str) + (lambda (port) + (ssax:read-attributes port decl-entities))))) + (write result) (newline) + (assert (equal? result expected-res)))))) + (test "" '() '()) + (test "href='http://a%tb%r%n%r%n%nc'" '() + `((,(string->symbol "href") . "http://a b c"))) + (test "href='http://a%tb%r%r%n%rc'" '() + `((,(string->symbol "href") . "http://a b c"))) + (test "_1 ='12&' _2= \"%r%n%t12 3\">" '() + `((_1 . "12&") (_2 . ,(unesc-string " 12%n3")))) + (test "%tAbc='<&>
'%nNext='12&ent;34' />" + '((ent . "<xx>")) + `((,(string->symbol "Abc") . ,(unesc-string "<&>%n")) + (,(string->symbol "Next") . "12<xx>34"))) + (test "%tAbc='<&>
'%nNext='12&ent;34' />" + '((ent . "<xx>")) + `((,(string->symbol "Abc") . ,(unesc-string "<&>%r")) + (,(string->symbol "Next") . "12<xx>34"))) + (test "%tAbc='<&>
'%nNext='12&en;34' />" + `((en . ,(lambda () (open-input-string ""xx'")))) + `((,(string->symbol "Abc") . ,(unesc-string "<&>%n")) + (,(string->symbol "Next") . "12\"xx'34"))) + (test "%tAbc='<&>
'%nNext='12&ent;34' />" + '((ent . "<&ent1;T;>") (ent1 . "&")) + `((,(string->symbol "Abc") . ,(unesc-string "<&>%n")) + (,(string->symbol "Next") . "12<&T;>34"))) + (assert (failed? + (test "%tAbc='<&>
'%nNext='12&ent;34' />" + '((ent . "<&ent1;T;>") (ent1 . "&")) '()))) + (assert (failed? + (test "%tAbc='<&>
'%nNext='12&ent;34' />" + '((ent . "<&ent;T;>") (ent1 . "&")) '()))) + (assert (failed? + (test "%tAbc='<&>
'%nNext='12&ent;34' />" + '((ent . "<&ent1;T;>") (ent1 . "&ent;")) '()))) + (test "html:href='http://a%tb%r%n%r%n%nc'" '() + `(((,(string->symbol "html") . ,(string->symbol "href")) + . "http://a b c"))) + (test "html:href='ref1' html:src='ref2'" '() + `(((,(string->symbol "html") . ,(string->symbol "href")) + . "ref1") + ((,(string->symbol "html") . ,(string->symbol "src")) + . "ref2"))) + (test "html:href='ref1' xml:html='ref2'" '() + `(((,(string->symbol "html") . ,(string->symbol "href")) + . "ref1") + ((,ssax:Prefix-XML . ,(string->symbol "html")) + . "ref2"))) + (assert (failed? (test "html:href='ref1' html:href='ref2'" '() '()))) + (assert (failed? (test "html:href='<' html:href='ref2'" '() '()))) + (assert (failed? (test "html:href='ref1' html:href='&ref2;'" '() '()))) +)) + +; ssax:resolve-name PORT UNRES-NAME NAMESPACES apply-default-ns? +; +; Convert an UNRES-NAME to a RES-NAME given the appropriate NAMESPACES +; declarations. +; the last parameter apply-default-ns? determines if the default +; namespace applies (for instance, it does not for attribute names) +; +; Per REC-xml-names/#nsc-NSDeclared, "xml" prefix is considered pre-declared +; and bound to the namespace name "http://www.w3.org/XML/1998/namespace". +; +; This procedure tests for the namespace constraints: +; http://www.w3.org/TR/REC-xml-names/#nsc-NSDeclared + +(define (ssax:resolve-name port unres-name namespaces apply-default-ns?) + (cond + ((pair? unres-name) ; it's a QNAME + (cons + (cond + ((assq (car unres-name) namespaces) => cadr) + ((eq? (car unres-name) ssax:Prefix-XML) ssax:Prefix-XML) + (else + (parser-error port "[nsc-NSDeclared] broken; prefix " (car unres-name)))) + (cdr unres-name))) + (apply-default-ns? ; Do apply the default namespace, if any + (let ((default-ns (assq '*DEFAULT* namespaces))) + (if (and default-ns (cadr default-ns)) + (cons (cadr default-ns) unres-name) + unres-name))) ; no default namespace declared + (else unres-name))) ; no prefix, don't apply the default-ns + + +(run-test + (let* ((namespaces + '((HTML UHTML . URN-HTML) + (HTML UHTML-1 . URN-HTML) + (A UHTML . URN-HTML))) + (namespaces-def + (cons + '(*DEFAULT* DEF . URN-DEF) namespaces)) + (namespaces-undef + (cons + '(*DEFAULT* #f . #f) namespaces-def)) + (port (current-input-port))) + + (assert (equal? 'ABC + (ssax:resolve-name port 'ABC namespaces #t))) + (assert (equal? '(DEF . ABC) + (ssax:resolve-name port 'ABC namespaces-def #t))) + (assert (equal? 'ABC + (ssax:resolve-name port 'ABC namespaces-def #f))) + (assert (equal? 'ABC + (ssax:resolve-name port 'ABC namespaces-undef #t))) + (assert (equal? '(UHTML . ABC) + (ssax:resolve-name port '(HTML . ABC) namespaces-def #t))) + (assert (equal? '(UHTML . ABC) + (ssax:resolve-name port '(HTML . ABC) namespaces-def #f))) + (assert (equal? `(,ssax:Prefix-XML . space) + (ssax:resolve-name port + `(,(string->symbol "xml") . space) namespaces-def #f))) + (assert (failed? + (ssax:resolve-name port '(XXX . ABC) namespaces-def #f))) +)) + + +; procedure+: ssax:uri-string->symbol URI-STR +; Convert a URI-STR to an appropriate symbol +(define (ssax:uri-string->symbol uri-str) + (string->symbol uri-str)) + +; procedure+: ssax:complete-start-tag TAG PORT ELEMS ENTITIES NAMESPACES +; +; This procedure is to complete parsing of a start-tag markup. The +; procedure must be called after the start tag token has been +; read. TAG is an UNRES-NAME. ELEMS is an instance of xml-decl::elems; +; it can be #f to tell the function to do _no_ validation of elements +; and their attributes. +; +; This procedure returns several values: +; ELEM-GI: a RES-NAME. +; ATTRIBUTES: element's attributes, an ATTLIST of (RES-NAME . STRING) +; pairs. The list does NOT include xmlns attributes. +; NAMESPACES: the input list of namespaces amended with namespace +; (re-)declarations contained within the start-tag under parsing +; ELEM-CONTENT-MODEL + +; On exit, the current position in PORT will be the first character after +; #\> that terminates the start-tag markup. +; +; Faults detected: +; VC: XML-Spec.html#enum +; VC: XML-Spec.html#RequiredAttr +; VC: XML-Spec.html#FixedAttr +; VC: XML-Spec.html#ValueType +; WFC: XML-Spec.html#uniqattspec (after namespaces prefixes are resolved) +; VC: XML-Spec.html#elementvalid +; WFC: REC-xml-names/#dt-NSName + +; Note, although XML Recommendation does not explicitly say it, +; xmlns and xmlns: attributes don't have to be declared (although they +; can be declared, to specify their default value) + +; Procedure: ssax:complete-start-tag tag-head port elems entities namespaces +(define ssax:complete-start-tag + + (let ((xmlns (string->symbol "xmlns")) + (largest-dummy-decl-attr (list ssax:largest-unres-name #f #f #f))) + + ; Scan through the attlist and validate it, against decl-attrs + ; Return an assoc list with added fixed or implied attrs. + ; Note that both attlist and decl-attrs are ATTLISTs, and therefore, + ; sorted + (define (validate-attrs port attlist decl-attrs) + + ; Check to see decl-attr is not of use type REQUIRED. Add + ; the association with the default value, if any declared + (define (add-default-decl decl-attr result) + (let*-values + (((attr-name content-type use-type default-value) + (apply values decl-attr))) + (and (eq? use-type 'REQUIRED) + (parser-error port "[RequiredAttr] broken for" attr-name)) + (if default-value + (cons (cons attr-name default-value) result) + result))) + + (let loop ((attlist attlist) (decl-attrs decl-attrs) (result '())) + (if (attlist-null? attlist) + (attlist-fold add-default-decl result decl-attrs) + (let*-values + (((attr attr-others) + (attlist-remove-top attlist)) + ((decl-attr other-decls) + (if (attlist-null? decl-attrs) + (values largest-dummy-decl-attr decl-attrs) + (attlist-remove-top decl-attrs))) + ) + (case (name-compare (car attr) (car decl-attr)) + ((<) + (if (or (eq? xmlns (car attr)) + (and (pair? (car attr)) (eq? xmlns (caar attr)))) + (loop attr-others decl-attrs (cons attr result)) + (parser-error port "[ValueType] broken for " attr))) + ((>) + (loop attlist other-decls + (add-default-decl decl-attr result))) + (else ; matched occurrence of an attr with its declaration + (let*-values + (((attr-name content-type use-type default-value) + (apply values decl-attr))) + ; Run some tests on the content of the attribute + (cond + ((eq? use-type 'FIXED) + (or (equal? (cdr attr) default-value) + (parser-error port "[FixedAttr] broken for " attr-name))) + ((eq? content-type 'CDATA) #t) ; everything goes + ((pair? content-type) + (or (member (cdr attr) content-type) + (parser-error port "[enum] broken for " attr-name "=" + (cdr attr)))) + (else + (ssax:warn port "declared content type " content-type + " not verified yet"))) + (loop attr-others other-decls (cons attr result))))) + )))) + + + ; Add a new namespace declaration to namespaces. + ; First we convert the uri-str to a uri-symbol and search namespaces for + ; an association (_ user-prefix . uri-symbol). + ; If found, we return the argument namespaces with an association + ; (prefix user-prefix . uri-symbol) prepended. + ; Otherwise, we prepend (prefix uri-symbol . uri-symbol) + (define (add-ns port prefix uri-str namespaces) + (and (equal? "" uri-str) + (parser-error port "[dt-NSName] broken for " prefix)) + (let ((uri-symbol (ssax:uri-string->symbol uri-str))) + (let loop ((nss namespaces)) + (cond + ((null? nss) + (cons (cons* prefix uri-symbol uri-symbol) namespaces)) + ((eq? uri-symbol (cddar nss)) + (cons (cons* prefix (cadar nss) uri-symbol) namespaces)) + (else (loop (cdr nss))))))) + + ; partition attrs into proper attrs and new namespace declarations + ; return two values: proper attrs and the updated namespace declarations + (define (adjust-namespace-decl port attrs namespaces) + (let loop ((attrs attrs) (proper-attrs '()) (namespaces namespaces)) + (cond + ((null? attrs) (values proper-attrs namespaces)) + ((eq? xmlns (caar attrs)) ; re-decl of the default namespace + (loop (cdr attrs) proper-attrs + (if (equal? "" (cdar attrs)) ; un-decl of the default ns + (cons (cons* '*DEFAULT* #f #f) namespaces) + (add-ns port '*DEFAULT* (cdar attrs) namespaces)))) + ((and (pair? (caar attrs)) (eq? xmlns (caaar attrs))) + (loop (cdr attrs) proper-attrs + (add-ns port (cdaar attrs) (cdar attrs) namespaces))) + (else + (loop (cdr attrs) (cons (car attrs) proper-attrs) namespaces))))) + + ; The body of the function + (lambda (tag-head port elems entities namespaces) + (let*-values + (((attlist) (ssax:read-attributes port entities)) + ((empty-el-tag?) + (begin + (ssax:skip-S port) + (and + (eqv? #\/ + (assert-curr-char '(#\> #\/) "XML [40], XML [44], no '>'" port)) + (assert-curr-char '(#\>) "XML [44], no '>'" port)))) + ((elem-content decl-attrs) ; see xml-decl for their type + (if elems ; elements declared: validate! + (cond + ((assoc tag-head elems) => + (lambda (decl-elem) ; of type xml-decl::decl-elem + (values + (if empty-el-tag? 'EMPTY-TAG (cadr decl-elem)) + (caddr decl-elem)))) + (else + (parser-error port "[elementvalid] broken, no decl for " tag-head))) + (values ; non-validating parsing + (if empty-el-tag? 'EMPTY-TAG 'ANY) + #f) ; no attributes declared + )) + ((merged-attrs) (if decl-attrs (validate-attrs port attlist decl-attrs) + (attlist->alist attlist))) + ((proper-attrs namespaces) + (adjust-namespace-decl port merged-attrs namespaces)) + ) + ;(cerr "proper attrs: " proper-attrs nl) + ; build the return value + (values + (ssax:resolve-name port tag-head namespaces #t) + (fold-right + (lambda (name-value attlist) + (or + (attlist-add attlist + (cons (ssax:resolve-name port (car name-value) namespaces #f) + (cdr name-value))) + (parser-error port "[uniqattspec] after NS expansion broken for " + name-value))) + (make-empty-attlist) + proper-attrs) + namespaces + elem-content))))) + +(run-test + (let* ((urn-a (string->symbol "urn:a")) + (urn-b (string->symbol "urn:b")) + (urn-html (string->symbol "http://w3c.org/html")) + (namespaces + `((#f '"UHTML" . ,urn-html) + ('"A" '"UA" . ,urn-a))) + (test + (lambda (tag-head-name elems str) + (call-with-input-string str + (lambda (port) + (call-with-values + (lambda () + (ssax:complete-start-tag + (call-with-input-string tag-head-name + (lambda (port) (ssax:read-QName port))) + port + elems '() namespaces)) + list)))))) + + ; First test with no validation of elements + ;(test "TAG1" #f "") + (assert (equal? `('"TAG1" () ,namespaces ANY) + (test "TAG1" #f ">"))) + (assert (equal? `('"TAG1" () ,namespaces EMPTY-TAG) + (test "TAG1" #f "/>"))) + (assert (equal? `('"TAG1" (('"HREF" . "a")) ,namespaces EMPTY-TAG) + (test "TAG1" #f "HREF='a'/>"))) + (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a")) + ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY) + (test "TAG1" #f "HREF='a' xmlns='urn:a'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "a")) + ,(cons '(*DEFAULT* #f . #f) namespaces) ANY) + (test "TAG1" #f "HREF='a' xmlns=''>"))) + (assert (failed? (test "UA:TAG1" #f "HREF='a' xmlns=''/>"))) + (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a")) + ,(cons '(*DEFAULT* #f . #f) namespaces) ANY) + (test "A:TAG1" #f "A:HREF='a' xmlns=''>"))) + (assert (equal? `(('"UA" . '"TAG1") ((('"UA" . '"HREF") . "a")) + ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) ANY) + (test "A:TAG1" #f "A:HREF='a' xmlns='urn:b'>"))) + (assert (failed? (test "B:TAG1" #f "A:HREF='a' xmlns:b=''/>"))) + (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a")) + ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY) + (test "B:TAG1" #f "A:HREF='a' xmlns:B='urn:b'>"))) + (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a") + ((,urn-b . '"SRC") . "b")) + ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY) + (test "B:TAG1" #f + "B:SRC='b' A:HREF='a' xmlns:B='urn:b'>"))) + (assert (equal? `((,urn-b . '"TAG1") ((('"UA" . '"HREF") . "a") + ((,urn-b . '"HREF") . "b")) + ,(cons `('"B" ,urn-b . ,urn-b) namespaces) ANY) + (test "B:TAG1" #f + "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:b'>"))) + ; must be an error! Duplicate attr + (assert (failed? (test "B:TAG1" #f + "HREF=\"b\" HREF='a' xmlns:B='urn:a'/>"))) + ; must be an error! Duplicate attr after ns expansion + (assert (failed? (test "B:TAG1" #f + "B:HREF=\"b\" A:HREF='a' xmlns:B='urn:a'/>"))) + (assert (equal? `(('"UA" . '"TAG1") (('"HREF" . "a") + (('"UA" . '"HREF") . "b")) + ,(cons `(*DEFAULT* '"UA" . ,urn-a) namespaces) ANY) + (test "TAG1" #f + "A:HREF=\"b\" HREF='a' xmlns='urn:a'>"))) + (assert (equal? `('"TAG1" ((('"UHTML" . '"HREF") . "a") + ((,urn-b . '"HREF") . "b")) + ,(append `( + ('"HTML" '"UHTML" . ,urn-html) + ('"B" ,urn-b . ,urn-b)) + namespaces) ANY) + (test "TAG1" #f + "B:HREF=\"b\" xmlns:B='urn:b' xmlns:HTML='http://w3c.org/html' HTML:HREF='a' >"))) + + ; Now test the validating parsing + ; No decl for tag1 + (assert (failed? (test "TAG1" '((TAG2 ANY ())) + "B:HREF='b' xmlns:B='urn:b'>"))) + ; No decl for HREF elem +;; (cond-expand +;; ((not (or scm mit-scheme)) ; Regretfully, SCM treats '() as #f +;; (assert (failed? +;; (test "TAG1" '(('"TAG1" ANY ())) +;; "B:HREF='b' xmlns:B='urn:b'>")))) +;; (else #t)) + ; No decl for HREF elem + (assert (failed? + (test "TAG1" '(('"TAG1" ANY (('"HREF1" CDATA IMPLIED #f)))) + "B:HREF='b' xmlns:B='urn:b'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces EMPTY-TAG) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)))) + "HREF='b'/>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)))) + "HREF='b'>"))) + ; Req'd attribute not given error + (assert (failed? + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f)))) + ">"))) + ; Wrong content-type of the attribute + (assert (failed? + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c") REQUIRED #f)))) + "HREF='b'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" ("c" "b") IMPLIED #f)))) + "HREF='b'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "c")))) + "HREF='b'>"))) + ; Bad fixed attribute + (assert (failed? + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "c")))) + "HREF='b'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b")))) + "HREF='b'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA FIXED "b")))) ">"))) + (assert (equal? `('"TAG1" (('"HREF" . "b")) ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED "b")))) ">"))) + (assert (equal? `('"TAG1" () ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA IMPLIED #f)))) ">"))) + ; Undeclared attr + (assert (failed? + (test "TAG1" + '(('"TAG1" PCDATA ((('"A" . '"HREF") CDATA IMPLIED "c")))) + "HREF='b'>"))) + (assert (equal? `('"TAG1" (('"HREF" . "b") (('"UA" . '"HREF") . "c")) + ,namespaces PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f) + (('"A" . '"HREF") CDATA IMPLIED "c")))) + "HREF='b'>"))) + (assert (equal? `(('"UA" . '"TAG1") + (('"HREF" . "b") (('"UA" . '"HREF") . "c")) + ,namespaces PCDATA) + (test "A:TAG1" '((('"A" . '"TAG1") PCDATA + (('"HREF" NMTOKEN REQUIRED #f) + (('"A" . '"HREF") CDATA IMPLIED "c")))) + "HREF='b'>"))) + (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b")) + ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA) + (test "B:TAG1" '((('"B" . '"TAG1") PCDATA (('"HREF" CDATA REQUIRED #f) + (('"xmlns" . '"B") CDATA IMPLIED "urn:b")))) + "HREF='b'>"))) + (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b")) + ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA) + (test "B:TAG1" '((('"B" . '"TAG1") PCDATA + ((('"B" . '"HREF") CDATA REQUIRED #f) + (('"xmlns" . '"B") CDATA IMPLIED "urn:b")))) + "B:HREF='b'>"))) + (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b")) + ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f) + ('"xmlns" CDATA IMPLIED "urn:b")))) + "HREF='b'>"))) + ; xmlns not declared + (assert (equal? `((,urn-b . '"TAG1") (('"HREF" . "b")) + ,(cons `(*DEFAULT* ,urn-b . ,urn-b) namespaces) PCDATA) + (test "TAG1" '(('"TAG1" PCDATA (('"HREF" CDATA REQUIRED #f) + ))) + "HREF='b' xmlns='urn:b'>"))) + ; xmlns:B not declared + (assert (equal? `((,urn-b . '"TAG1") (((,urn-b . '"HREF") . "b")) + ,(cons `('"B" ,urn-b . ,urn-b) namespaces) PCDATA) + (test "B:TAG1" '((('"B" . '"TAG1") PCDATA + ((('"B" . '"HREF") CDATA REQUIRED #f) + ))) + "B:HREF='b' xmlns:B='urn:b'>"))) +)) + +; procedure+: ssax:read-external-id PORT +; +; This procedure parses an ExternalID production: +; [75] ExternalID ::= 'SYSTEM' S SystemLiteral +; | 'PUBLIC' S PubidLiteral S SystemLiteral +; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") +; [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" +; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] +; | [-'()+,./:=?;!*#@$_%] +; +; This procedure is supposed to be called when an ExternalID is expected; +; that is, the current character must be either #\S or #\P that start +; correspondingly a SYSTEM or PUBLIC token. This procedure returns the +; SystemLiteral as a string. A PubidLiteral is disregarded if present. + +(define (ssax:read-external-id port) + (let ((discriminator (ssax:read-NCName port))) + (assert-curr-char ssax:S-chars "space after SYSTEM or PUBLIC" port) + (ssax:skip-S port) + (let ((delimiter + (assert-curr-char '(#\' #\" ) "XML [11], XML [12]" port))) + (cond + ((eq? discriminator (string->symbol "SYSTEM")) + (begin0 + (next-token '() (list delimiter) "XML [11]" port) + (read-char port) ; reading the closing delim + )) + ((eq? discriminator (string->symbol "PUBLIC")) + (skip-until (list delimiter) port) + (assert-curr-char ssax:S-chars "space after PubidLiteral" port) + (ssax:skip-S port) + (let* ((delimiter + (assert-curr-char '(#\' #\" ) "XML [11]" port)) + (systemid + (next-token '() (list delimiter) "XML [11]" port))) + (read-char port) ; reading the closing delim + systemid)) + (else + (parser-error port "XML [75], " discriminator + " rather than SYSTEM or PUBLIC")))))) + + +;----------------------------------------------------------------------------- +; Higher-level parsers and scanners +; +; They parse productions corresponding to the whole (document) entity +; or its higher-level pieces (prolog, root element, etc). + + +; Scan the Misc production in the context +; [1] document ::= prolog element Misc* +; [22] prolog ::= XMLDecl? Misc* (doctypedec l Misc*)? +; [27] Misc ::= Comment | PI | S +; +; The following function should be called in the prolog or epilog contexts. +; In these contexts, whitespaces are completely ignored. +; The return value from ssax:scan-Misc is either a PI-token, +; a DECL-token, a START token, or EOF. +; Comments are ignored and not reported. + +(define (ssax:scan-Misc port) + (let loop ((c (ssax:skip-S port))) + (cond + ((eof-object? c) c) + ((not (char=? c #\<)) + (parser-error port "XML [22], char '" c "' unexpected")) + (else + (let ((token (ssax:read-markup-token port))) + (case (xml-token-kind token) + ((COMMENT) (loop (ssax:skip-S port))) + ((PI DECL START) token) + (else + (parser-error port "XML [22], unexpected token of kind " + (xml-token-kind token) + )))))))) + +; procedure+: ssax:read-char-data PORT EXPECT-EOF? STR-HANDLER SEED +; +; This procedure is to read the character content of an XML document +; or an XML element. +; [43] content ::= +; (element | CharData | Reference | CDSect | PI +; | Comment)* +; To be more precise, the procedure reads CharData, expands CDSect +; and character entities, and skips comments. The procedure stops +; at a named reference, EOF, at the beginning of a PI or a start/end tag. +; +; port +; a PORT to read +; expect-eof? +; a boolean indicating if EOF is normal, i.e., the character +; data may be terminated by the EOF. EOF is normal +; while processing a parsed entity. +; str-handler +; a STR-HANDLER +; seed +; an argument passed to the first invocation of STR-HANDLER. +; +; The procedure returns two results: SEED and TOKEN. +; The SEED is the result of the last invocation of STR-HANDLER, or the +; original seed if STR-HANDLER was never called. +; +; TOKEN can be either an eof-object (this can happen only if +; expect-eof? was #t), or: +; - an xml-token describing a START tag or an END-tag; +; For a start token, the caller has to finish reading it. +; - an xml-token describing the beginning of a PI. It's up to an +; application to read or skip through the rest of this PI; +; - an xml-token describing a named entity reference. +; +; CDATA sections and character references are expanded inline and +; never returned. Comments are silently disregarded. +; +; As the XML Recommendation requires, all whitespace in character data +; must be preserved. However, a CR character (#xD) must be disregarded +; if it appears before a LF character (#xA), or replaced by a #xA character +; otherwise. See Secs. 2.10 and 2.11 of the XML Recommendation. See also +; the canonical XML Recommendation. + + ; ssax:read-char-data port expect-eof? str-handler seed +(define ssax:read-char-data + (let + ((terminators-usual (list #\< #\& char-return)) + (terminators-usual-eof (list #\< '*eof* #\& char-return)) + + (handle-fragment + (lambda (fragment str-handler seed) + (if (string-null? fragment) seed + (str-handler fragment "" seed)))) + ) + + (lambda (port expect-eof? str-handler seed) + + ; Very often, the first character we encounter is #\< + ; Therefore, we handle this case in a special, fast path + (if (eqv? #\< (peek-char port)) + + ; The fast path + (let ((token (ssax:read-markup-token port))) + (case (xml-token-kind token) + ((START END) ; The most common case + (values seed token)) + ((CDSECT) + (let ((seed (ssax:read-cdata-body port str-handler seed))) + (ssax:read-char-data port expect-eof? str-handler seed))) + ((COMMENT) (ssax:read-char-data port expect-eof? + str-handler seed)) + (else + (values seed token)))) + + + ; The slow path + (let ((char-data-terminators + (if expect-eof? terminators-usual-eof terminators-usual))) + + (let loop ((seed seed)) + (let* ((fragment + (next-token '() char-data-terminators + "reading char data" port)) + (term-char (peek-char port)) ; one of char-data-terminators + ) + (if (eof-object? term-char) + (values + (handle-fragment fragment str-handler seed) + term-char) + (case term-char + ((#\<) + (let ((token (ssax:read-markup-token port))) + (case (xml-token-kind token) + ((CDSECT) + (loop + (ssax:read-cdata-body port str-handler + (handle-fragment fragment str-handler seed)))) + ((COMMENT) + (loop (handle-fragment fragment str-handler seed))) + (else + (values + (handle-fragment fragment str-handler seed) + token))))) + ((#\&) + (case (peek-next-char port) + ((#\#) (read-char port) + (loop (str-handler fragment + (ssax:read-char-ref port) + seed))) + (else + (let ((name (ssax:read-NCName port))) + (assert-curr-char '(#\;) "XML [68]" port) + (values + (handle-fragment fragment str-handler seed) + (make-xml-token 'ENTITY-REF name)))))) + (else ; This must be a CR character + (if (eqv? (peek-next-char port) #\newline) + (read-char port)) + (loop (str-handler fragment (string #\newline) seed)))) + )))))))) + + +; a few lines of validation code +(run-test (letrec + ((a-tag (make-xml-token 'START (string->symbol "BR"))) + (a-ref (make-xml-token 'ENTITY-REF (string->symbol "lt"))) + (eof-object (lambda () eof-object)) ; a unique value + (str-handler (lambda (fragment foll-fragment seed) + (if (string-null? foll-fragment) (cons fragment seed) + (cons* foll-fragment fragment seed)))) + (test (lambda (str expect-eof? expected-data expected-token) + (newline) (display "body: ") (write str) + (newline) (display "Result: ") + (let*-values + (((seed token) + (call-with-input-string (unesc-string str) + (lambda (port) + (ssax:read-char-data port expect-eof? str-handler '())))) + ((result) (reverse seed))) + (write result) + (display " ") + (display token) + (assert (equal? result (map unesc-string expected-data)) + (if (eq? expected-token eof-object) + (eof-object? token) + (equal? token expected-token)))))) + ) + (test "" #t '() eof-object) + (assert (failed? (test "" #f '() eof-object))) + (test " " #t '(" ") eof-object) + (test "<BR/>" #f '() a-tag) + (test " <BR />" #f '(" ") a-tag) + + (test " <" #f '(" ") a-ref) + (test " a<" #f '(" a") a-ref) + (test " a <" #f '(" a ") a-ref) + + (test " <!-- comment--> a a<BR/>" #f '(" " " a a") a-tag) + (test " <!-- comment-->%ra a<BR/>" #f '(" " "" "%n" "a a") a-tag) + (test " <!-- comment-->%r%na a<BR/>" #f '(" " "" "%n" "a a") a-tag) + (test " <!-- comment-->%r%na%t%r%r%na<BR/>" #f + '(" " "" "%n" "a%t" "%n" "" "%n" "a") a-tag) + (test "a<!-- comment--> a a<BR/>" #f '("a" " a a") a-tag) + (test "!<BR/>" #f '("" "!") a-tag) + (test "!%n<BR/>" #f '("" "!" "%n") a-tag) + (test "%t!%n<BR/>" #f '("%t" "!" "%n") a-tag) + (test "%t!%na a<BR/>" #f '("%t" "!" "%na a") a-tag) + (test "%t!%ra a<BR/>" #f '("%t" "!" "" "%n" "a a") a-tag) + (test "%t!%r%na a<BR/>" #f '("%t" "!" "" "%n" "a a") a-tag) + + (test " %ta ! b <BR/>" #f '(" %ta " "!" " b ") a-tag) + (test " %ta   b <BR/>" #f '(" %ta " " " " b ") a-tag) + + (test "<![CDATA[<]]><BR/>" #f '("<") a-tag) + (test "<![CDATA[]]]><BR/>" #f '("]") a-tag) + (test "%t<![CDATA[<]]><BR/>" #f '("%t" "<") a-tag) + (test "%t<![CDATA[<]]>a b<BR/>" #f '("%t" "<" "a b") a-tag) + (test "%t<![CDATA[<]]> a b<BR/>" #f '("%t" "<" " a b") a-tag) + + (test "%td <![CDATA[ <%r%r%n]]> a b<BR/>" #f + '("%td " " <" "%n" "" "%n" " a b") a-tag) +)) + + + +; procedure+: ssax:assert-token TOKEN KIND GI +; Make sure that TOKEN is of anticipated KIND and has anticipated GI +; Note GI argument may actually be a pair of two symbols, Namespace +; URI or the prefix, and of the localname. +; If the assertion fails, error-cont is evaluated by passing it +; three arguments: token kind gi. The result of error-cont is returned. +(define (ssax:assert-token token kind gi error-cont) + (or + (and (xml-token? token) + (eq? kind (xml-token-kind token)) + (equal? gi (xml-token-head token))) + (error-cont token kind gi))) + +;======================================================================== +; Highest-level parsers: XML to SXML + +; These parsers are a set of syntactic forms to instantiate a SSAX parser. +; A user can instantiate the parser to do the full validation, or +; no validation, or any particular validation. The user specifies +; which PI he wants to be notified about. The user tells what to do +; with the parsed character and element data. The latter handlers +; determine if the parsing follows a SAX or a DOM model. + +; syntax: ssax:make-pi-parser my-pi-handlers +; Create a parser to parse and process one Processing Element (PI). + +; my-pi-handlers +; An assoc list of pairs (PI-TAG . PI-HANDLER) +; where PI-TAG is an NCName symbol, the PI target, and +; PI-HANDLER is a procedure PORT PI-TAG SEED +; where PORT points to the first symbol after the PI target. +; The handler should read the rest of the PI up to and including +; the combination '?>' that terminates the PI. The handler should +; return a new seed. +; One of the PI-TAGs may be the symbol *DEFAULT*. The corresponding +; handler will handle PIs that no other handler will. If the +; *DEFAULT* PI-TAG is not specified, ssax:make-pi-parser will assume +; the default handler that skips the body of the PI +; +; The output of the ssax:make-pi-parser is a procedure +; PORT PI-TAG SEED +; that will parse the current PI according to the user-specified handlers. +; +; The previous version of ssax:make-pi-parser was a low-level macro: +; (define-macro ssax:make-pi-parser +; (lambda (my-pi-handlers) +; `(lambda (port target seed) +; (case target +; ; Generate the body of the case statement +; ,@(let loop ((pi-handlers my-pi-handlers) (default #f)) +; (cond +; ((null? pi-handlers) +; (if default `((else (,default port target seed))) +; '((else +; (ssax:warn port "Skipping PI: " target nl) +; (ssax:skip-pi port) +; seed)))) +; ((eq? '*DEFAULT* (caar pi-handlers)) +; (loop (cdr pi-handlers) (cdar pi-handlers))) +; (else +; (cons +; `((,(caar pi-handlers)) (,(cdar pi-handlers) port target seed)) +; (loop (cdr pi-handlers) default))))))))) + +(define-syntax ssax:make-pi-parser + (syntax-rules () + ((ssax:make-pi-parser orig-handlers) + (letrec-syntax + ; Generate the clauses of the case statement + ((loop + (syntax-rules (*DEFAULT*) + ((loop () #f accum port target seed) ; no default + (make-case + ((else + (ssax:warn port "Skipping PI: " target nl) + (ssax:skip-pi port) + seed) + . accum) + () target)) + ((loop () default accum port target seed) + (make-case + ((else (default port target seed)) . accum) + () target)) + ((loop ((*DEFAULT* . default) . handlers) old-def accum + port target seed) + (loop handlers default accum port target seed)) + ((loop ((tag . handler) . handlers) default accum port target seed) + (loop handlers default + (((tag) (handler port target seed)) . accum) + port target seed)) + )) + (make-case ; Reverse the clauses, make the 'case' + (syntax-rules () + ((make-case () clauses target) + (case target . clauses)) + ((make-case (clause . clauses) accum target) + (make-case clauses (clause . accum) target))) + )) + (lambda (port target seed) + (loop orig-handlers #f () port target seed)) + )))) + +(run-test + (pp (ssax:make-pi-parser ())) + (pp (ssax:make-pi-parser ((xml . (lambda (port target seed) seed))))) + (pp (ssax:make-pi-parser ((xml . (lambda (port target seed) seed)) + (html . list) + (*DEFAULT* . ssax:warn)))) +) + +; syntax: ssax:make-elem-parser my-new-level-seed my-finish-element +; my-char-data-handler my-pi-handlers + +; Create a parser to parse and process one element, including its +; character content or children elements. The parser is typically +; applied to the root element of a document. + +; my-new-level-seed +; procedure ELEM-GI ATTRIBUTES NAMESPACES EXPECTED-CONTENT SEED +; where ELEM-GI is a RES-NAME of the element +; about to be processed. +; This procedure is to generate the seed to be passed +; to handlers that process the content of the element. +; This is the function identified as 'fdown' in the denotational +; semantics of the XML parser given in the title comments to this +; file. +; +; my-finish-element +; procedure ELEM-GI ATTRIBUTES NAMESPACES PARENT-SEED SEED +; This procedure is called when parsing of ELEM-GI is finished. +; The SEED is the result from the last content parser (or +; from my-new-level-seed if the element has the empty content). +; PARENT-SEED is the same seed as was passed to my-new-level-seed. +; The procedure is to generate a seed that will be the result +; of the element parser. +; This is the function identified as 'fup' in the denotational +; semantics of the XML parser given in the title comments to this +; file. +; +; my-char-data-handler +; A STR-HANDLER +; +; my-pi-handlers +; See ssax:make-pi-handler above +; + +; The generated parser is a +; procedure START-TAG-HEAD PORT ELEMS ENTITIES +; NAMESPACES PRESERVE-WS? SEED +; The procedure must be called after the start tag token has been +; read. START-TAG-HEAD is an UNRES-NAME from the start-element tag. +; ELEMS is an instance of xml-decl::elems. +; See ssax:complete-start-tag::preserve-ws? + +; Faults detected: +; VC: XML-Spec.html#elementvalid +; WFC: XML-Spec.html#GIMatch + + +(define-syntax ssax:make-elem-parser + (syntax-rules () + ((ssax:make-elem-parser my-new-level-seed my-finish-element + my-char-data-handler my-pi-handlers) + + (lambda (start-tag-head port elems entities namespaces + preserve-ws? seed) + + (define xml-space-gi (cons ssax:Prefix-XML + (string->symbol "space"))) + + (let handle-start-tag ((start-tag-head start-tag-head) + (port port) (entities entities) + (namespaces namespaces) + (preserve-ws? preserve-ws?) (parent-seed seed)) + (let*-values + (((elem-gi attributes namespaces expected-content) + (ssax:complete-start-tag start-tag-head port elems + entities namespaces)) + ((seed) + (my-new-level-seed elem-gi attributes + namespaces expected-content parent-seed))) + (case expected-content + ((EMPTY-TAG) + (my-finish-element + elem-gi attributes namespaces parent-seed seed)) + ((EMPTY) ; The end tag must immediately follow + (ssax:assert-token + (and (eqv? #\< (ssax:skip-S port)) (ssax:read-markup-token port)) + 'END start-tag-head + (lambda (token exp-kind exp-head) + (parser-error port "[elementvalid] broken for " token + " while expecting " + exp-kind exp-head))) + (my-finish-element + elem-gi attributes namespaces parent-seed seed)) + (else ; reading the content... + (let ((preserve-ws? ; inherit or set the preserve-ws? flag + (cond + ((assoc xml-space-gi attributes) => + (lambda (name-value) + (equal? "preserve" (cdr name-value)))) + (else preserve-ws?)))) + (let loop ((port port) (entities entities) + (expect-eof? #f) (seed seed)) + (let*-values + (((seed term-token) + (ssax:read-char-data port expect-eof? + my-char-data-handler seed))) + (if (eof-object? term-token) + seed + (case (xml-token-kind term-token) + ((END) + (ssax:assert-token term-token 'END start-tag-head + (lambda (token exp-kind exp-head) + (parser-error port "[GIMatch] broken for " + term-token " while expecting " + exp-kind exp-head))) + (my-finish-element + elem-gi attributes namespaces parent-seed seed)) + ((PI) + (let ((seed + ((ssax:make-pi-parser my-pi-handlers) + port (xml-token-head term-token) seed))) + (loop port entities expect-eof? seed))) + ((ENTITY-REF) + (let ((seed + (ssax:handle-parsed-entity + port (xml-token-head term-token) + entities + (lambda (port entities seed) + (loop port entities #t seed)) + my-char-data-handler + seed))) ; keep on reading the content after ent + (loop port entities expect-eof? seed))) + ((START) ; Start of a child element + (if (eq? expected-content 'PCDATA) + (parser-error port "[elementvalid] broken for " + elem-gi + " with char content only; unexpected token " + term-token)) + ; Do other validation of the element content + (let ((seed + (handle-start-tag + (xml-token-head term-token) + port entities namespaces + preserve-ws? seed))) + (loop port entities expect-eof? seed))) + (else + (parser-error port "XML [43] broken for " + term-token)))))))) + ))) +)))) + + +; syntax: ssax:make-parser user-handler-tag user-handler-proc ... +; +; Create an XML parser, an instance of the XML parsing framework. +; This will be a SAX, a DOM, or a specialized parser depending +; on the supplied user-handlers. + +; user-handler-tag is a symbol that identifies a procedural expression +; that follows the tag. Given below are tags and signatures of the +; corresponding procedures. Not all tags have to be specified. If some +; are omitted, reasonable defaults will apply. +; + +; tag: DOCTYPE +; handler-procedure: PORT DOCNAME SYSTEMID INTERNAL-SUBSET? SEED +; If internal-subset? is #t, the current position in the port +; is right after we have read #\[ that begins the internal DTD subset. +; We must finish reading of this subset before we return +; (or must call skip-internal-subset if we aren't interested in reading it). +; The port at exit must be at the first symbol after the whole +; DOCTYPE declaration. +; The handler-procedure must generate four values: +; ELEMS ENTITIES NAMESPACES SEED +; See xml-decl::elems for ELEMS. It may be #f to switch off the validation. +; NAMESPACES will typically contain USER-PREFIXes for selected URI-SYMBs. +; The default handler-procedure skips the internal subset, +; if any, and returns (values #f '() '() seed) + +; tag: UNDECL-ROOT +; handler-procedure: ELEM-GI SEED +; where ELEM-GI is an UNRES-NAME of the root element. This procedure +; is called when an XML document under parsing contains _no_ DOCTYPE +; declaration. +; The handler-procedure, as a DOCTYPE handler procedure above, +; must generate four values: +; ELEMS ENTITIES NAMESPACES SEED +; The default handler-procedure returns (values #f '() '() seed) + +; tag: DECL-ROOT +; handler-procedure: ELEM-GI SEED +; where ELEM-GI is an UNRES-NAME of the root element. This procedure +; is called when an XML document under parsing does contains the DOCTYPE +; declaration. +; The handler-procedure must generate a new SEED (and verify +; that the name of the root element matches the doctype, if the handler +; so wishes). +; The default handler-procedure is the identity function. + +; tag: NEW-LEVEL-SEED +; handler-procedure: see ssax:make-elem-parser, my-new-level-seed + +; tag: FINISH-ELEMENT +; handler-procedure: see ssax:make-elem-parser, my-finish-element + +; tag: CHAR-DATA-HANDLER +; handler-procedure: see ssax:make-elem-parser, my-char-data-handler + +; tag: PI +; handler-procedure: see ssax:make-pi-parser +; The default value is '() + +; The generated parser is a +; procedure PORT SEED + +; This procedure parses the document prolog and then exits to +; an element parser (created by ssax:make-elem-parser) to handle +; the rest. +; +; [1] document ::= prolog element Misc* +; [22] prolog ::= XMLDecl? Misc* (doctypedec | Misc*)? +; [27] Misc ::= Comment | PI | S +; +; [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? +; ('[' (markupdecl | PEReference | S)* ']' S?)? '>' +; [29] markupdecl ::= elementdecl | AttlistDecl +; | EntityDecl +; | NotationDecl | PI +; | Comment +; + + +; This is ssax:make-parser with all the (specialization) handlers given +; as positional arguments. It is called by ssax:make-parser, see below +(define-syntax ssax:make-parser/positional-args + (syntax-rules () + ((ssax:make-parser/positional-args + *handler-DOCTYPE + *handler-UNDECL-ROOT + *handler-DECL-ROOT + *handler-NEW-LEVEL-SEED + *handler-FINISH-ELEMENT + *handler-CHAR-DATA-HANDLER + *handler-PI) + (lambda (port seed) + + ; We must've just scanned the DOCTYPE token + ; Handle the doctype declaration and exit to + ; scan-for-significant-prolog-token-2, and eventually, to the + ; element parser. + (define (handle-decl port token-head seed) + (or (eq? (string->symbol "DOCTYPE") token-head) + (parser-error port "XML [22], expected DOCTYPE declaration, found " + token-head)) + (assert-curr-char ssax:S-chars "XML [28], space after DOCTYPE" port) + (ssax:skip-S port) + (let*-values + (((docname) (ssax:read-QName port)) + ((systemid) + (and (ssax:ncname-starting-char? (ssax:skip-S port)) + (ssax:read-external-id port))) + ((internal-subset?) + (begin (ssax:skip-S port) + (eqv? #\[ (assert-curr-char '(#\> #\[) + "XML [28], end-of-DOCTYPE" port)))) + ((elems entities namespaces seed) + (*handler-DOCTYPE port docname systemid + internal-subset? seed)) + ) + (scan-for-significant-prolog-token-2 port elems entities namespaces + seed))) + + + ; Scan the leading PIs until we encounter either a doctype declaration + ; or a start token (of the root element) + ; In the latter two cases, we exit to the appropriate continuation + (define (scan-for-significant-prolog-token-1 port seed) + (let ((token (ssax:scan-Misc port))) + (if (eof-object? token) + (parser-error port "XML [22], unexpected EOF") + (case (xml-token-kind token) + ((PI) + (let ((seed + ((ssax:make-pi-parser *handler-PI) + port (xml-token-head token) seed))) + (scan-for-significant-prolog-token-1 port seed))) + ((DECL) (handle-decl port (xml-token-head token) seed)) + ((START) + (let*-values + (((elems entities namespaces seed) + (*handler-UNDECL-ROOT (xml-token-head token) seed))) + (element-parser (xml-token-head token) port elems + entities namespaces #f seed))) + (else (parser-error port "XML [22], unexpected markup " + token)))))) + + + ; Scan PIs after the doctype declaration, till we encounter + ; the start tag of the root element. After that we exit + ; to the element parser + (define (scan-for-significant-prolog-token-2 port elems entities + namespaces seed) + (let ((token (ssax:scan-Misc port))) + (if (eof-object? token) + (parser-error port "XML [22], unexpected EOF") + (case (xml-token-kind token) + ((PI) + (let ((seed + ((ssax:make-pi-parser *handler-PI) + port (xml-token-head token) seed))) + (scan-for-significant-prolog-token-2 port elems entities + namespaces seed))) + ((START) + (element-parser (xml-token-head token) port elems + entities namespaces #f + (*handler-DECL-ROOT (xml-token-head token) seed))) + (else (parser-error port "XML [22], unexpected markup " + token)))))) + + + ; A procedure start-tag-head port elems entities namespaces + ; preserve-ws? seed + (define element-parser + (ssax:make-elem-parser *handler-NEW-LEVEL-SEED + *handler-FINISH-ELEMENT + *handler-CHAR-DATA-HANDLER + *handler-PI)) + + ; Get the ball rolling ... + (scan-for-significant-prolog-token-1 port seed) +)))) + + + +; The following meta-macro turns a regular macro (with positional +; arguments) into a form with keyword (labeled) arguments. We later +; use the meta-macro to convert ssax:make-parser/positional-args into +; ssax:make-parser. The latter provides a prettier (with labeled +; arguments and defaults) interface to +; ssax:make-parser/positional-args +; +; ssax:define-labeled-arg-macro LABELED-ARG-MACRO-NAME +; (POS-MACRO-NAME ARG-DESCRIPTOR ...) +; expands into the definition of a macro +; LABELED-ARG-MACRO-NAME KW-NAME KW-VALUE KW-NAME1 KW-VALUE1 ... +; which, in turn, expands into +; POS-MACRO-NAME ARG1 ARG2 ... +; where each ARG1 etc. comes either from KW-VALUE or from +; the deafult part of ARG-DESCRIPTOR. ARG1 corresponds to the first +; ARG-DESCRIPTOR, ARG2 corresponds to the second descriptor, etc. +; Here ARG-DESCRIPTOR describes one argument of the positional macro. +; It has the form +; (ARG-NAME DEFAULT-VALUE) +; or +; (ARG-NAME) +; In the latter form, the default value is not given, so that the invocation of +; LABELED-ARG-MACRO-NAME must mention the corresponding parameter. +; ARG-NAME can be anything: an identifier, a string, or even a number. + + +(define-syntax ssax:define-labeled-arg-macro + (syntax-rules () + ((ssax:define-labeled-arg-macro + labeled-arg-macro-name + (positional-macro-name + (arg-name . arg-def) ...)) + (define-syntax labeled-arg-macro-name + (syntax-rules () + ((labeled-arg-macro-name . kw-val-pairs) + (letrec-syntax + ((find + (syntax-rules (arg-name ...) + ((find k-args (arg-name . default) arg-name + val . others) ; found arg-name among kw-val-pairs + (next val . k-args)) ... + ((find k-args key arg-no-match-name val . others) + (find k-args key . others)) + ((find k-args (arg-name default)) ; default must be here + (next default . k-args)) ... + )) + (next ; pack the continuation to find + (syntax-rules () + ((next val vals key . keys) + (find ((val . vals) . keys) key . kw-val-pairs)) + ((next val vals) ; processed all arg-descriptors + (rev-apply (val) vals)))) + (rev-apply + (syntax-rules () + ((rev-apply form (x . xs)) + (rev-apply (x . form) xs)) + ((rev-apply form ()) form)))) + (next positional-macro-name () + (arg-name . arg-def) ...)))))))) + + +; The definition of ssax:make-parser +(ssax:define-labeled-arg-macro ssax:make-parser + (ssax:make-parser/positional-args + (DOCTYPE + (lambda (port docname systemid internal-subset? seed) + (when internal-subset? + (ssax:warn port "Internal DTD subset is not currently handled ") + (ssax:skip-internal-dtd port)) + (ssax:warn port "DOCTYPE DECL " docname " " + systemid " found and skipped") + (values #f '() '() seed) + )) + (UNDECL-ROOT + (lambda (elem-gi seed) (values #f '() '() seed))) + (DECL-ROOT + (lambda (elem-gi seed) seed)) + (NEW-LEVEL-SEED) ; required + (FINISH-ELEMENT) ; required + (CHAR-DATA-HANDLER) ; required + (PI ()) + )) + +(run-test + (letrec ((simple-parser + (lambda (str doctype-fn) + (call-with-input-string str + (lambda (port) + ((ssax:make-parser + NEW-LEVEL-SEED + (lambda (elem-gi attributes namespaces + expected-content seed) + '()) + + FINISH-ELEMENT + (lambda (elem-gi attributes namespaces parent-seed seed) + (let + ((seed (if (null? namespaces) (reverse seed) + (cons (list '*NAMESPACES* namespaces) + (reverse seed))))) + (let ((seed (if (attlist-null? attributes) seed + (cons + (cons '@ + (map (lambda (attr) + (list (car attr) (cdr attr))) + (attlist->alist attributes))) + seed)))) + (cons (cons elem-gi seed) parent-seed)))) + + CHAR-DATA-HANDLER + (lambda (string1 string2 seed) + (if (string-null? string2) (cons string1 seed) + (cons* string2 string1 seed))) + + DOCTYPE + (lambda (port docname systemid internal-subset? seed) + (when internal-subset? + (ssax:warn port + "Internal DTD subset is not currently handled ") + (ssax:skip-internal-dtd port)) + (ssax:warn port "DOCTYPE DECL " docname " " + systemid " found and skipped") + (doctype-fn docname seed)) + + UNDECL-ROOT + (lambda (elem-gi seed) + (doctype-fn elem-gi seed)) + ) + port '()))))) + + (dummy-doctype-fn (lambda (elem-gi seed) (values #f '() '() seed))) + (test + (lambda (str doctype-fn expected) + (cout nl "Parsing: " str nl) + (let ((result (simple-parser (unesc-string str) doctype-fn))) + (write result) + (assert (equal? result expected))))) + ) + + (test "<BR/>" dummy-doctype-fn '(('"BR"))) + (assert (failed? (test "<BR>" dummy-doctype-fn '()))) + (test "<BR></BR>" dummy-doctype-fn '(('"BR"))) + (assert (failed? (test "<BR></BB>" dummy-doctype-fn '()))) + + (test " <A HREF='URL'> link <I>itlink </I> &amp;</A>" + dummy-doctype-fn + '(('"A" (@ ('"HREF" "URL")) " link " ('"I" "itlink ") + " " "&" "amp;"))) + + (test + " <A HREF='URL' xml:space='preserve'> link <I>itlink </I> &amp;</A>" dummy-doctype-fn + '(('"A" (@ ('"HREF" "URL") (('"xml" . '"space") "preserve")) + " link " ('"I" "itlink ") " " "&" "amp;"))) + + (test " <A HREF='URL' xml:space='preserve'> link <I xml:space='default'>itlink </I> &amp;</A>" dummy-doctype-fn + '(('"A" (@ ('"HREF" "URL") (('"xml" . '"space") "preserve")) + " link " + ('"I" (@ (('"xml" . '"space") "default")) "itlink ") + " " "&" "amp;"))) + (test "<itemize><item>This is item 1 </item>%n<!-- Just:a comment --><item>Item 2</item>%n </itemize>" dummy-doctype-fn + `(('"itemize" ('"item" "This is item 1 ") + ,(unesc-string "%n") ('"item" "Item 2") ,(unesc-string "%n ")))) + (test " <P><![CDATA[<BR>%n<![CDATA[<BR>]]>]]></P>" + dummy-doctype-fn `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "" ">"))) + + (test " <P><![CDATA[<BR>%r<![CDATA[<BR>]]>]]></P>" + dummy-doctype-fn `(('"P" "<BR>" ,nl "<![CDATA[<BR>" "]]" "" ">"))) + + (test "<?xml version='1.0'?>%n%n<Reports TStamp='1'></Reports>" + dummy-doctype-fn '(('"Reports" (@ ('"TStamp" "1"))))) + (test "%n<?PI xxx?><!-- Comment %n -%r-->%n<?PI1 zzz?><T/>" + dummy-doctype-fn '(('"T"))) + (test "<!DOCTYPE T SYSTEM 'system1' ><!-- comment -->%n<T/>" + (lambda (elem-gi seed) (assert (equal? elem-gi ''"T")) + (values #f '() '() seed)) + '(('"T"))) + (test "<!DOCTYPE T PUBLIC '//EN/T' \"system1\" [ <!ELEMENT a 'aa'> ]>%n<?pi?><T/>" + (lambda (elem-gi seed) (assert (equal? elem-gi ''"T")) + (values #f '() '() seed)) + '(('"T"))) + (test "<BR/>" + (lambda (elem-gi seed) + (values '(('"BR" EMPTY ())) '() '() seed)) '(('"BR"))) + (test "<BR></BR>" + (lambda (elem-gi seed) + (values '(('"BR" EMPTY ())) '() '() seed)) '(('"BR"))) + (assert (failed? (test "<BR>aa</BR>" + (lambda (elem-gi seed) + (values '(('"BR" EMPTY ())) '() '() seed)) '()))) + (test "<BR>aa</BR>" + (lambda (elem-gi seed) + (values '(('"BR" PCDATA ())) '() '() seed)) '(('"BR" "aa"))) + (assert (failed? (test "<BR>a<I>a</I></BR>" + (lambda (elem-gi seed) + (values '(('"BR" PCDATA ())) '() '() seed)) '()))) + (test "<BR>a<I>a</I></BR>" + (lambda (elem-gi seed) + (values '(('"BR" ANY ()) ('"I" PCDATA ())) '() '() seed)) + '(('"BR" "a" ('"I" "a")))) + + + (test "<DIV>Example: \"&example;\"</DIV>" + (lambda (elem-gi seed) + (values #f '((example . "<P>An ampersand (&) may be escaped numerically (&#38;) or with a general entity (&amp;).</P>")) '() seed)) + '(('"DIV" "Example: \"" + ('"P" "An ampersand (" "&" ") may be escaped numerically (" "&" "#38;) or with a general entity (" "&" "amp;).") "\""))) + (test "<DIV>Example: \"&example;\" <P/></DIV>" + (lambda (elem-gi seed) + (values #f '(('"quote" . "<I>example:</I> ex") + ('"example" . "<Q>"e;!</Q>?")) '() seed)) + '(('"DIV" "Example: \"" ('"Q" ('"I" "example:") " ex" "!") "?" + "\" " ('"P")))) + (assert (failed? + (test "<DIV>Example: \"&example;\" <P/></DIV>" + (lambda (elem-gi seed) + (values #f '(('"quote" . "<I>example:") + ('"example" . "<Q>"e;</I>!</Q>?")) '() seed)) + '()))) + + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values #f '() '() seed)) + '((('"URI1" . '"DIV") (@ ('"B" "B") (('"URI1" . '"B") "A")) + (*NAMESPACES* (('"A" '"URI1" . '"URI1") + (*DEFAULT* '"URI1" . '"URI1"))) + (('"URI1" . '"P") + (*NAMESPACES* ((*DEFAULT* #f . #f) ('"A" '"URI1" . '"URI1") + (*DEFAULT* '"URI1" . '"URI1"))) + ('"BR" + (*NAMESPACES* ((*DEFAULT* #f . #f) + ('"A" '"URI1" . '"URI1") + (*DEFAULT* '"URI1" . '"URI1")))))))) + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values #f '() '((#f '"UA" . '"URI1")) seed)) + '((('"UA" . '"DIV") (@ ('"B" "B") (('"UA" . '"B") "A")) + (*NAMESPACES* (('"A" '"UA" . '"URI1") + (*DEFAULT* '"UA" . '"URI1") (#f '"UA" . '"URI1"))) + (('"UA" . '"P") + (*NAMESPACES* ((*DEFAULT* #f . #f) ('"A" '"UA" . '"URI1") + (*DEFAULT* '"UA" . '"URI1") (#f '"UA" . '"URI1"))) + ('"BR" + (*NAMESPACES* ((*DEFAULT* #f . #f) ('"A" '"UA" . '"URI1") + (*DEFAULT* '"UA" . '"URI1") + (#f '"UA" . '"URI1")))))))) + ; uniqattr should fail + (assert (failed? + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values + `(('"DIV" ANY (('"B" CDATA IMPLIED #f) + (('"A" . '"B") CDATA IMPLIED #f) + (('"C" . '"B") CDATA IMPLIED "xx") + (('"xmlns" . '"C") CDATA IMPLIED "URI1") + )) + (('"A" . '"P") ANY ()) ('"BR" '"EMPTY" ())) + '() '((#f '"UA" . '"URI1")) seed)) + '()))) + ; prefix C undeclared + (assert (failed? + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values + '(('"DIV" ANY (('"B" CDATA IMPLIED #f) + ('"xmlns" CDATA IMPLIED "URI1") + (('"A" . '"B") CDATA IMPLIED #f) + (('"C" . '"B") CDATA IMPLIED "xx") + )) + (('"A" . '"P") ANY ()) ('"BR" EMPTY ())) + '() '((#f '"UA" . '"URI1")) seed)) + '()))) + + ; contradiction to xmlns declaration + (assert (failed? + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values + '(('"DIV" ANY (('"B" CDATA IMPLIED #f) + ('"xmlns" CDATA FIXED "URI2") + (('"A" . '"B") CDATA IMPLIED #f) + )) + (('"A" . '"P") ANY ()) ('"BR" EMPTY ())) + '() '((#f '"UA" . '"URI1")) seed)) + '()))) + + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values + '(('"DIV" ANY (('"B" CDATA IMPLIED #f) + ('"xmlns" CDATA FIXED "URI1") + (('"A" . '"B") CDATA IMPLIED #f) + )) + (('"A" . '"P") ANY ()) ('"BR" EMPTY ())) + '() '((#f '"UA" . '"URI1")) seed)) + '((('"UA" . '"DIV") (@ ('"B" "B") (('"UA" . '"B") "A")) + (*NAMESPACES* ((*DEFAULT* '"UA" . '"URI1") + ('"A" '"UA" . '"URI1") (#f '"UA" . '"URI1"))) + (('"UA" . '"P") + (*NAMESPACES* ((*DEFAULT* #f . #f) + (*DEFAULT* '"UA" . '"URI1") + ('"A" '"UA" . '"URI1") (#f '"UA" . '"URI1"))) + ('"BR" + (*NAMESPACES* ((*DEFAULT* #f . #f) (*DEFAULT* '"UA" . '"URI1") + ('"A" '"UA" . '"URI1") (#f '"UA" . '"URI1")))))))) + + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" + (lambda (elem-gi seed) + (values + '(('"DIV" ANY (('"B" CDATA IMPLIED #f) + (('"A" . '"B") CDATA IMPLIED #f) + (('"C" . '"B") CDATA IMPLIED "xx") + (('"xmlns" . '"C") CDATA IMPLIED "URI2") + )) + (('"A" . '"P") ANY ()) ('"BR" EMPTY ())) + '() '((#f '"UA" . '"URI1")) seed)) + '((('"UA" . '"DIV") (@ ('"B" "B") (('"UA" . '"B") "A") + (('"URI2" . '"B") "xx")) + (*NAMESPACES* ((*DEFAULT* '"UA" . '"URI1") + ('"A" '"UA" . '"URI1") + ('"C" '"URI2" . '"URI2") + (#f '"UA" . '"URI1"))) + (('"UA" . '"P") + (*NAMESPACES* ((*DEFAULT* #f . #f) (*DEFAULT* '"UA" . '"URI1") + ('"A" '"UA" . '"URI1") + ('"C" '"URI2" . '"URI2") (#f '"UA" . '"URI1"))) + ('"BR" + (*NAMESPACES* ((*DEFAULT* #f . #f) + (*DEFAULT* '"UA" . '"URI1") + ('"A" '"UA" . '"URI1") + ('"C" '"URI2" . '"URI2") + (#f '"UA" . '"URI1")))))))) +)) + + + +;======================================================================== +; Highest-level parsers: XML to SXML +; + +; First, a few utility procedures that turned out useful + +; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS +; given the list of fragments (some of which are text strings) +; reverse the list and concatenate adjacent text strings. +; We can prove from the general case below that if LIST-OF-FRAGS +; has zero or one element, the result of the procedure is equal? +; to its argument. This fact justifies the shortcut evaluation below. +(define (ssax:reverse-collect-str fragments) + (cond + ((null? fragments) '()) ; a shortcut + ((null? (cdr fragments)) fragments) ; see the comment above + (else + (let loop ((fragments fragments) (result '()) (strs '())) + (cond + ((null? fragments) + (if (null? strs) result + (cons (string-concatenate/shared strs) result))) + ((string? (car fragments)) + (loop (cdr fragments) result (cons (car fragments) strs))) + (else + (loop (cdr fragments) + (cons + (car fragments) + (if (null? strs) result + (cons (string-concatenate/shared strs) result))) + '()))))))) + + +; ssax:reverse-collect-str-drop-ws LIST-OF-FRAGS -> LIST-OF-FRAGS +; given the list of fragments (some of which are text strings) +; reverse the list and concatenate adjacent text strings. +; We also drop "unsignificant" whitespace, that is, whitespace +; in front, behind and between elements. The whitespace that +; is included in character data is not affected. +; We use this procedure to "intelligently" drop "insignificant" +; whitespace in the parsed SXML. If the strict compliance with +; the XML Recommendation regarding the whitespace is desired, please +; use the ssax:reverse-collect-str procedure instead. + +(define (ssax:reverse-collect-str-drop-ws fragments) + (cond + ((null? fragments) '()) ; a shortcut + ((null? (cdr fragments)) ; another shortcut + (if (and (string? (car fragments)) (string-whitespace? (car fragments))) + '() fragments)) ; remove trailing ws + (else + (let loop ((fragments fragments) (result '()) (strs '()) + (all-whitespace? #t)) + (cond + ((null? fragments) + (if all-whitespace? result ; remove leading ws + (cons (string-concatenate/shared strs) result))) + ((string? (car fragments)) + (loop (cdr fragments) result (cons (car fragments) strs) + (and all-whitespace? + (string-whitespace? (car fragments))))) + (else + (loop (cdr fragments) + (cons + (car fragments) + (if all-whitespace? result + (cons (string-concatenate/shared strs) result))) + '() #t))))))) + + +; procedure: ssax:xml->sxml PORT NAMESPACE-PREFIX-ASSIG +; +; This is an instance of a SSAX parser above that returns an SXML +; representation of the XML document to be read from PORT. +; NAMESPACE-PREFIX-ASSIG is a list of (USER-PREFIX . URI-STRING) +; that assigns USER-PREFIXes to certain namespaces identified by +; particular URI-STRINGs. It may be an empty list. +; The procedure returns an SXML tree. The port points out to the +; first character after the root element. + +(define (ssax:xml->sxml port namespace-prefix-assig) + (letrec + ((namespaces + (map (lambda (el) + (cons* #f (car el) (ssax:uri-string->symbol (cdr el)))) + namespace-prefix-assig)) + + (RES-NAME->SXML + (lambda (res-name) + (string->symbol + (string-append + (symbol->string (car res-name)) + ":" + (symbol->string (cdr res-name)))))) + + ) + (let ((result + (reverse + ((ssax:make-parser + NEW-LEVEL-SEED + (lambda (elem-gi attributes namespaces + expected-content seed) + '()) + + FINISH-ELEMENT + (lambda (elem-gi attributes namespaces parent-seed seed) + (let ((seed (ssax:reverse-collect-str seed)) + (attrs + (attlist-fold + (lambda (attr accum) + (cons (list + (if (symbol? (car attr)) (car attr) + (RES-NAME->SXML (car attr))) + (cdr attr)) accum)) + '() attributes))) + (cons + (cons + (if (symbol? elem-gi) elem-gi + (RES-NAME->SXML elem-gi)) + (if (null? attrs) seed + (cons (cons '@ attrs) seed))) + parent-seed))) + + CHAR-DATA-HANDLER + (lambda (string1 string2 seed) + (if (string-null? string2) (cons string1 seed) + (cons* string2 string1 seed))) + + DOCTYPE + (lambda (port docname systemid internal-subset? seed) + (when internal-subset? + (ssax:warn port + "Internal DTD subset is not currently handled ") + (ssax:skip-internal-dtd port)) + (ssax:warn port "DOCTYPE DECL " docname " " + systemid " found and skipped") + (values #f '() namespaces seed)) + + UNDECL-ROOT + (lambda (elem-gi seed) + (values #f '() namespaces seed)) + + PI + ((*DEFAULT* . + (lambda (port pi-tag seed) + (cons + (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) + seed)))) + ) + port '())))) + (cons '*TOP* + (if (null? namespace-prefix-assig) result + (cons + (list '@ (cons '*NAMESPACES* + (map (lambda (ns) (list (car ns) (cdr ns))) + namespace-prefix-assig))) + result))) +))) + +; For backwards compatibility +(define SSAX:XML->SXML ssax:xml->sxml) + + +; a few lines of validation code +(run-test (letrec + ((test (lambda (str namespace-assig expected-res) + (newline) (display "input: ") + (write (unesc-string str)) (newline) (display "Result: ") + (let ((result + (call-with-input-string (unesc-string str) + (lambda (port) + (ssax:xml->sxml port namespace-assig))))) + (pp result) + (assert (equal_? result expected-res)))))) + + (test " <BR/>" '() '(*TOP* (BR))) + (test "<BR></BR>" '() '(*TOP* (BR))) + (test " <BR CLEAR='ALL'%nCLASS='Class1'/>" '() + '(*TOP* (BR (@ (CLEAR "ALL") (CLASS "Class1"))))) + (test " <A HREF='URL'> link <I>itlink </I> &amp;</A>" '() + '(*TOP* (A (@ (HREF "URL")) " link " (I "itlink ") " &"))) + (test " <A HREF='URL' xml:space='preserve'> link <I>itlink </I> &amp;</A>" '() + '(*TOP* (A (@ (xml:space "preserve") (HREF "URL")) + " link " (I "itlink ") " &"))) + (test " <A HREF='URL' xml:space='preserve'> link <I xml:space='default'>itlink </I> &amp;</A>" '() + '(*TOP* (A (@ (xml:space "preserve") (HREF "URL")) + " link " (I (@ (xml:space "default")) + "itlink ") " &"))) + (test " <P><?pi1 p1 content ?>?<?pi2 pi2? content? ??></P>" '() + '(*TOP* (P (*PI* pi1 "p1 content ") "?" + (*PI* pi2 "pi2? content? ?")))) + (test " <P>some text <![CDATA[<]]>1%n"<B>strong</B>"%r</P>" + '() + `(*TOP* (P ,(unesc-string "some text <1%n\"") + (B "strong") ,(unesc-string "\"%n")))) + (test " <P><![CDATA[<BR>%n<![CDATA[<BR>]]>]]></P>" '() + `(*TOP* (P ,(unesc-string "<BR>%n<![CDATA[<BR>]]>")))) +; (test "<T1><T2>it's%r%nand that%n</T2>%r%n%r%n%n</T1>" '() +; '(*TOP* (T1 (T2 "it's%nand that%n") "%n%n%n"))) + (test "<T1><T2>it's%r%nand that%n</T2>%r%n%r%n%n</T1>" '() + `(*TOP* (T1 (T2 ,(unesc-string "it's%nand that%n")) ,(unesc-string "%n%n%n")))) + (test "<T1><T2>it's%rand that%n</T2>%r%n%r%n%n</T1>" '() + `(*TOP* (T1 (T2 ,(unesc-string "it's%nand that%n")) ,(unesc-string "%n%n%n")))) + (test "<!DOCTYPE T SYSTEM 'system1' ><!-- comment -->%n<T/>" '() + '(*TOP* (T))) + (test "<?xml version='1.0'?>%n<WEIGHT unit=\"pound\">%n<NET certified='certified'> 67 </NET>%n<GROSS> 95 </GROSS>%n</WEIGHT>" '() + `(*TOP* (*PI* xml "version='1.0'") (WEIGHT (@ (unit "pound")) + ,nl (NET (@ (certified "certified")) " 67 ") ,nl + (GROSS " 95 ") ,nl) + )) +; (test "<?xml version='1.0'?>%n<WEIGHT unit=\"pound\">%n<NET certified='certified'> 67 </NET>%n<GROSS> 95 </GROSS>%n</WEIGHT>" '() +; '(*TOP* (*PI* xml "version='1.0'") (WEIGHT (@ (unit "pound")) +; "%n" (NET (@ (certified "certified")) " 67 ") +; "%n" (GROSS " 95 ") "%n") +; )) + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" '() + '(*TOP* (URI1:DIV (@ (URI1:B "A") (B "B")) (URI1:P (BR))))) + (test "<DIV A:B='A' B='B' xmlns:A='URI1' xmlns='URI1'><A:P xmlns=''><BR/></A:P></DIV>" '((UA . "URI1")) + '(*TOP* (@ (*NAMESPACES* (UA "URI1"))) + (UA:DIV (@ (UA:B "A") (B "B")) (UA:P (BR))))) + + ; A few tests from XML Namespaces Recommendation + (test (string-append + "<x xmlns:edi='http://ecommerce.org/schema'>" + "<!-- the 'taxClass' attribute's ns http://ecommerce.org/schema -->" + "<lineItem edi:taxClass='exempt'>Baby food</lineItem>" nl + "</x>") '() + `(*TOP* + (x (lineItem + (@ (http://ecommerce.org/schema:taxClass "exempt")) + "Baby food") ,nl))) + (test (string-append + "<x xmlns:edi='http://ecommerce.org/schema'>" + "<!-- the 'taxClass' attribute's ns http://ecommerce.org/schema -->" + "<lineItem edi:taxClass='exempt'>Baby food</lineItem>" + "</x>") '((EDI . "http://ecommerce.org/schema")) + '(*TOP* + (@ (*NAMESPACES* (EDI "http://ecommerce.org/schema"))) + (x (lineItem + (@ (EDI:taxClass "exempt")) + "Baby food")))) + + (test (string-append + "<bk:book xmlns:bk='urn:loc.gov:books' " + "xmlns:isbn='urn:ISBN:0-395-36341-6'>" + "<bk:title>Cheaper by the Dozen</bk:title>" + "<isbn:number>1568491379</isbn:number></bk:book>") + '() + '(*TOP* (urn:loc.gov:books:book + (urn:loc.gov:books:title "Cheaper by the Dozen") + (urn:ISBN:0-395-36341-6:number "1568491379")))) + + (test (string-append + "<!-- initially, the default namespace is 'books' -->" + "<book xmlns='urn:loc.gov:books' " + "xmlns:isbn='urn:ISBN:0-395-36341-6'>" + "<title>Cheaper by the Dozen</title>" + "<isbn:number>1568491379</isbn:number>" + "<notes>" + "<!-- make HTML the default namespace for some commentary -->" + "<p xmlns='urn:w3-org-ns:HTML'>" + "This is a <i>funny</i> book!" + "</p>" + "</notes>" + "</book>") '() + '(*TOP* (urn:loc.gov:books:book + (urn:loc.gov:books:title "Cheaper by the Dozen") + (urn:ISBN:0-395-36341-6:number "1568491379") + (urn:loc.gov:books:notes + (urn:w3-org-ns:HTML:p + "This is a " (urn:w3-org-ns:HTML:i "funny") + " book!"))))) + + (test (string-append + "<Beers>" + "<!-- the default namespace is now that of HTML -->" + "<table xmlns='http://www.w3.org/TR/REC-html40'>" + "<th><td>Name</td><td>Origin</td><td>Description</td></th>" + "<tr>" + "<!-- no default namespace inside table cells -->" + "<td><brandName xmlns=\"\">Huntsman</brandName></td>" + "<td><origin xmlns=''>Bath, UK</origin></td>" + "<td>" + "<details xmlns=''><class>Bitter</class><hop>Fuggles</hop>" + "<pro>Wonderful hop, light alcohol, good summer beer</pro>" + "<con>Fragile; excessive variance pub to pub</con>" + "</details>" + "</td>" + "</tr>" + "</table>" + "</Beers>") + '((html . "http://www.w3.org/TR/REC-html40")) + '(*TOP* + (@ (*NAMESPACES* (html "http://www.w3.org/TR/REC-html40"))) + (Beers (html:table + (html:th (html:td "Name") + (html:td "Origin") + (html:td "Description")) + (html:tr (html:td (brandName "Huntsman")) + (html:td (origin "Bath, UK")) + (html:td + (details + (class "Bitter") + (hop "Fuggles") + (pro "Wonderful hop, light alcohol, good summer beer") + (con "Fragile; excessive variance pub to pub")))))))) + + (test (string-append + "<!-- 1 --><RESERVATION xmlns:HTML='http://www.w3.org/TR/REC-html40'>" + "<!-- 2 --><NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>" + "<!-- 3 --><SEAT CLASS='Y' HTML:CLASS=\"largeMonotype\">33B</SEAT>" + "<!-- 4 --><HTML:A HREF='/cgi-bin/ResStatus'>Check Status</HTML:A>" + "<!-- 5 --><DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>") + '((HTML . "http://www.w3.org/TR/REC-html40")) + '(*TOP* + (@ (*NAMESPACES* (HTML "http://www.w3.org/TR/REC-html40"))) + (RESERVATION + (NAME (@ (HTML:CLASS "largeSansSerif")) "Layman, A") + (SEAT (@ (HTML:CLASS "largeMonotype") (CLASS "Y")) "33B") + (HTML:A (@ (HREF "/cgi-bin/ResStatus")) "Check Status") + (DEPARTURE "1997-05-24T07:55:00+1")))) + ; Part of RDF from the XML Infoset + (test (string-concatenate/shared '( + "<?xml version='1.0' encoding='utf-8' standalone='yes'?>" + "<!-- this can be decoded as US-ASCII or iso-8859-1 as well," + " since it contains no characters outside the US-ASCII repertoire -->" + "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'" + " xmlns:rdfs='http://www.w3.org/2000/01/rdf-schema#'" + " xmlns='http://www.w3.org/2001/02/infoset#'>" + "<rdfs:Class ID='Boolean'/>" + "<Boolean ID='Boolean.true'/>" + "<Boolean ID='Boolean.false'/>" + "<!--Info item classes-->" + "<rdfs:Class ID='InfoItem'/>" + "<rdfs:Class ID='Document' rdfs:subClassOf='#InfoItem'/>" + "<rdfs:Class ID='Element' rdfs:subClassOf='#InfoItem'/>" + "<rdfs:Class ID='Attribute' rdfs:subClassOf='#InfoItem'/>" + "<rdfs:Class ID='InfoItemSet' + rdfs:subClassOf='http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag'/>" + "<rdfs:Class ID='AttributeSet' rdfs:subClassOf='#InfoItemSet'/>" + "<!--Info item properties-->" + "<rdfs:Property ID='allDeclarationsProcessed'>" + "<rdfs:domain resource='#Document'/>" + "<rdfs:range resource='#Boolean'/></rdfs:Property>" + "<rdfs:Property ID='attributes'>" + "<rdfs:domain resource='#Element'/>" + "<rdfs:range resource='#AttributeSet'/>" + "</rdfs:Property>" + "</rdf:RDF>")) + '((RDF . "http://www.w3.org/1999/02/22-rdf-syntax-ns#") + (RDFS . "http://www.w3.org/2000/01/rdf-schema#") + (ISET . "http://www.w3.org/2001/02/infoset#")) + '(*TOP* (@ (*NAMESPACES* + (RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#") + (RDFS "http://www.w3.org/2000/01/rdf-schema#") + (ISET "http://www.w3.org/2001/02/infoset#"))) + (*PI* xml "version='1.0' encoding='utf-8' standalone='yes'") + (RDF:RDF + (RDFS:Class (@ (ID "Boolean"))) + (ISET:Boolean (@ (ID "Boolean.true"))) + (ISET:Boolean (@ (ID "Boolean.false"))) + (RDFS:Class (@ (ID "InfoItem"))) + (RDFS:Class (@ (RDFS:subClassOf "#InfoItem") (ID "Document"))) + (RDFS:Class (@ (RDFS:subClassOf "#InfoItem") (ID "Element"))) + (RDFS:Class (@ (RDFS:subClassOf "#InfoItem") (ID "Attribute"))) + (RDFS:Class + (@ (RDFS:subClassOf + "http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag") + (ID "InfoItemSet"))) + (RDFS:Class + (@ (RDFS:subClassOf "#InfoItemSet") (ID "AttributeSet"))) + (RDFS:Property + (@ (ID "allDeclarationsProcessed")) + (RDFS:domain (@ (resource "#Document"))) + (RDFS:range (@ (resource "#Boolean")))) + (RDFS:Property + (@ (ID "attributes")) + (RDFS:domain (@ (resource "#Element"))) + (RDFS:range (@ (resource "#AttributeSet"))))))) + + ; Part of RDF from RSS of the Daemon News Mall + (test (string-concatenate/shared (list-intersperse '( + "<?xml version='1.0'?><rdf:RDF " + "xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' " + "xmlns='http://my.netscape.com/rdf/simple/0.9/'>" + "<channel>" + "<title>Daemon News Mall</title>" + "<link>http://mall.daemonnews.org/</link>" + "<description>Central source for all your BSD needs</description>" + "</channel>" + "<item>" + "<title>Daemon News Jan/Feb Issue NOW Available! Subscribe $24.95</title>" + "<link>http://mall.daemonnews.org/?page=shop/flypage&product_id=880</link>" + "</item>" + "<item>" + "<title>The Design and Implementation of the 4.4BSD Operating System $54.95</title>" + "<link>http://mall.daemonnews.org/?page=shop/flypage&product_id=912&category_id=1761</link>" + "</item>" + "</rdf:RDF>") + (string #\newline) + )) + '((RDF . "http://www.w3.org/1999/02/22-rdf-syntax-ns#") + (RSS . "http://my.netscape.com/rdf/simple/0.9/") + (ISET . "http://www.w3.org/2001/02/infoset#")) + `(*TOP* (@ (*NAMESPACES* + (RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#") + (RSS "http://my.netscape.com/rdf/simple/0.9/") + (ISET "http://www.w3.org/2001/02/infoset#"))) + (*PI* xml "version='1.0'") + (RDF:RDF ,nl + (RSS:channel ,nl + (RSS:title "Daemon News Mall") ,nl + (RSS:link "http://mall.daemonnews.org/") ,nl + (RSS:description "Central source for all your BSD needs") ,nl) ,nl + (RSS:item ,nl + (RSS:title + "Daemon News Jan/Feb Issue NOW Available! Subscribe $24.95") ,nl + (RSS:link + "http://mall.daemonnews.org/?page=shop/flypage&product_id=880") ,nl) ,nl + (RSS:item ,nl + (RSS:title + "The Design and Implementation of the 4.4BSD Operating System $54.95") ,nl + (RSS:link + "http://mall.daemonnews.org/?page=shop/flypage&product_id=912&category_id=1761") ,nl) ,nl))) + + (test (string-concatenate/shared + '("<Forecasts TStamp='958082142'>" + "<TAF TStamp='958066200' LatLon='36.583, -121.850' BId='724915'" + " SName='KMRY, MONTEREY PENINSULA'>" + "<VALID TRange='958068000, 958154400'>111730Z 111818</VALID>" + "<PERIOD TRange='958068000, 958078800'>" + "<PREVAILING>31010KT P6SM FEW030</PREVAILING>" + "</PERIOD>" + "<PERIOD TRange='958078800, 958104000' Title='FM2100'>" + "<PREVAILING>29016KT P6SM FEW040</PREVAILING>" + "</PERIOD>" + "<PERIOD TRange='958104000, 958154400' Title='FM0400'>" + "<PREVAILING>29010KT P6SM SCT200</PREVAILING>" + "<VAR Title='BECMG 0708' TRange='958114800, 958118400'>VRB05KT</VAR>" + "</PERIOD></TAF>" + "</Forecasts>")) + '() + '(*TOP* (Forecasts + (@ (TStamp "958082142")) + (TAF (@ (TStamp "958066200") + (SName "KMRY, MONTEREY PENINSULA") + (LatLon "36.583, -121.850") + (BId "724915")) + (VALID (@ (TRange "958068000, 958154400")) "111730Z 111818") + (PERIOD (@ (TRange "958068000, 958078800")) + (PREVAILING "31010KT P6SM FEW030")) + (PERIOD (@ (Title "FM2100") (TRange "958078800, 958104000")) + (PREVAILING "29016KT P6SM FEW040")) + (PERIOD (@ (Title "FM0400") (TRange "958104000, 958154400")) + (PREVAILING "29010KT P6SM SCT200") + (VAR (@ (Title "BECMG 0708") + (TRange "958114800, 958118400")) + "VRB05KT")))))) +)) + +(run-test + (newline) + (display "All tests passed") + (newline) +) diff --git a/module/sxml/upstream/SXML-tree-trans.scm b/module/sxml/upstream/SXML-tree-trans.scm new file mode 100644 index 000000000..f2c3293ca --- /dev/null +++ b/module/sxml/upstream/SXML-tree-trans.scm @@ -0,0 +1,249 @@ +; XML/HTML processing in Scheme +; SXML expression tree transformers +; +; IMPORT +; A prelude appropriate for your Scheme system +; (myenv-bigloo.scm, myenv-mit.scm, etc.) +; +; EXPORT +; (provide SRV:send-reply +; post-order pre-post-order replace-range) +; +; See vSXML-tree-trans.scm for the validation code, which also +; serves as usage examples. +; +; $Id: SXML-tree-trans.scm,v 1.6 2003/04/25 19:16:15 oleg Exp $ + + +; Output the 'fragments' +; The fragments are a list of strings, characters, +; numbers, thunks, #f, #t -- and other fragments. +; The function traverses the tree depth-first, writes out +; strings and characters, executes thunks, and ignores +; #f and '(). +; The function returns #t if anything was written at all; +; otherwise the result is #f +; If #t occurs among the fragments, it is not written out +; but causes the result of SRV:send-reply to be #t + +(define (SRV:send-reply . fragments) + (let loop ((fragments fragments) (result #f)) + (cond + ((null? fragments) result) + ((not (car fragments)) (loop (cdr fragments) result)) + ((null? (car fragments)) (loop (cdr fragments) result)) + ((eq? #t (car fragments)) (loop (cdr fragments) #t)) + ((pair? (car fragments)) + (loop (cdr fragments) (loop (car fragments) result))) + ((procedure? (car fragments)) + ((car fragments)) + (loop (cdr fragments) #t)) + (else + (display (car fragments)) + (loop (cdr fragments) #t))))) + + + +;------------------------------------------------------------------------ +; Traversal of an SXML tree or a grove: +; a <Node> or a <Nodelist> +; +; A <Node> and a <Nodelist> are mutually-recursive datatypes that +; underlie the SXML tree: +; <Node> ::= (name . <Nodelist>) | "text string" +; An (ordered) set of nodes is just a list of the constituent nodes: +; <Nodelist> ::= (<Node> ...) +; Nodelists, and Nodes other than text strings are both lists. A +; <Nodelist> however is either an empty list, or a list whose head is +; not a symbol (an atom in general). A symbol at the head of a node is +; either an XML name (in which case it's a tag of an XML element), or +; an administrative name such as '@'. +; See SXPath.scm and SSAX.scm for more information on SXML. + + +; Pre-Post-order traversal of a tree and creation of a new tree: +; pre-post-order:: <tree> x <bindings> -> <new-tree> +; where +; <bindings> ::= (<binding> ...) +; <binding> ::= (<trigger-symbol> *preorder* . <handler>) | +; (<trigger-symbol> *macro* . <handler>) | +; (<trigger-symbol> <new-bindings> . <handler>) | +; (<trigger-symbol> . <handler>) +; <trigger-symbol> ::= XMLname | *text* | *default* +; <handler> :: <trigger-symbol> x [<tree>] -> <new-tree> +; +; The pre-post-order function visits the nodes and nodelists +; pre-post-order (depth-first). For each <Node> of the form (name +; <Node> ...) it looks up an association with the given 'name' among +; its <bindings>. If failed, pre-post-order tries to locate a +; *default* binding. It's an error if the latter attempt fails as +; well. Having found a binding, the pre-post-order function first +; checks to see if the binding is of the form +; (<trigger-symbol> *preorder* . <handler>) +; If it is, the handler is 'applied' to the current node. Otherwise, +; the pre-post-order function first calls itself recursively for each +; child of the current node, with <new-bindings> prepended to the +; <bindings> in effect. The result of these calls is passed to the +; <handler> (along with the head of the current <Node>). To be more +; precise, the handler is _applied_ to the head of the current node +; and its processed children. The result of the handler, which should +; also be a <tree>, replaces the current <Node>. If the current <Node> +; is a text string or other atom, a special binding with a symbol +; *text* is looked up. +; +; A binding can also be of a form +; (<trigger-symbol> *macro* . <handler>) +; This is equivalent to *preorder* described above. However, the result +; is re-processed again, with the current stylesheet. + +(define (pre-post-order tree bindings) + (let* ((default-binding (assq '*default* bindings)) + (text-binding (or (assq '*text* bindings) default-binding)) + (text-handler ; Cache default and text bindings + (and text-binding + (if (procedure? (cdr text-binding)) + (cdr text-binding) (cddr text-binding))))) + (let loop ((tree tree)) + (cond + ((null? tree) '()) + ((not (pair? tree)) + (let ((trigger '*text*)) + (if text-handler (text-handler trigger tree) + (error "Unknown binding for " trigger " and no default")))) + ((not (symbol? (car tree))) (map loop tree)) ; tree is a nodelist + (else ; tree is an SXML node + (let* ((trigger (car tree)) + (binding (or (assq trigger bindings) default-binding))) + (cond + ((not binding) + (error "Unknown binding for " trigger " and no default")) + ((not (pair? (cdr binding))) ; must be a procedure: handler + (apply (cdr binding) trigger (map loop (cdr tree)))) + ((eq? '*preorder* (cadr binding)) + (apply (cddr binding) tree)) + ((eq? '*macro* (cadr binding)) + (loop (apply (cddr binding) tree))) + (else ; (cadr binding) is a local binding + (apply (cddr binding) trigger + (pre-post-order (cdr tree) (append (cadr binding) bindings))) + )))))))) + +; post-order is a strict subset of pre-post-order without *preorder* +; (let alone *macro*) traversals. +; Now pre-post-order is actually faster than the old post-order. +; The function post-order is deprecated and is aliased below for +; backward compatibility. +(define post-order pre-post-order) + +;------------------------------------------------------------------------ +; Extended tree fold +; tree = atom | (node-name tree ...) +; +; foldts fdown fup fhere seed (Leaf str) = fhere seed str +; foldts fdown fup fhere seed (Nd kids) = +; fup seed $ foldl (foldts fdown fup fhere) (fdown seed) kids + +; procedure fhere: seed -> atom -> seed +; procedure fdown: seed -> node -> seed +; procedure fup: parent-seed -> last-kid-seed -> node -> seed +; foldts returns the final seed + +(define (foldts fdown fup fhere seed tree) + (cond + ((null? tree) seed) + ((not (pair? tree)) ; An atom + (fhere seed tree)) + (else + (let loop ((kid-seed (fdown seed tree)) (kids (cdr tree))) + (if (null? kids) + (fup seed kid-seed tree) + (loop (foldts fdown fup fhere kid-seed (car kids)) + (cdr kids))))))) + +;------------------------------------------------------------------------ +; Traverse a forest depth-first and cut/replace ranges of nodes. +; +; The nodes that define a range don't have to have the same immediate +; parent, don't have to be on the same level, and the end node of a +; range doesn't even have to exist. A replace-range procedure removes +; nodes from the beginning node of the range up to (but not including) +; the end node of the range. In addition, the beginning node of the +; range can be replaced by a node or a list of nodes. The range of +; nodes is cut while depth-first traversing the forest. If all +; branches of the node are cut a node is cut as well. The procedure +; can cut several non-overlapping ranges from a forest. + +; replace-range:: BEG-PRED x END-PRED x FOREST -> FOREST +; where +; type FOREST = (NODE ...) +; type NODE = Atom | (Name . FOREST) | FOREST +; +; The range of nodes is specified by two predicates, beg-pred and end-pred. +; beg-pred:: NODE -> #f | FOREST +; end-pred:: NODE -> #f | FOREST +; The beg-pred predicate decides on the beginning of the range. The node +; for which the predicate yields non-#f marks the beginning of the range +; The non-#f value of the predicate replaces the node. The value can be a +; list of nodes. The replace-range procedure then traverses the tree and skips +; all the nodes, until the end-pred yields non-#f. The value of the end-pred +; replaces the end-range node. The new end node and its brothers will be +; re-scanned. +; The predicates are evaluated pre-order. We do not descend into a node that +; is marked as the beginning of the range. + +(define (replace-range beg-pred end-pred forest) + + ; loop forest keep? new-forest + ; forest is the forest to traverse + ; new-forest accumulates the nodes we will keep, in the reverse + ; order + ; If keep? is #t, keep the curr node if atomic. If the node is not atomic, + ; traverse its children and keep those that are not in the skip range. + ; If keep? is #f, skip the current node if atomic. Otherwise, + ; traverse its children. If all children are skipped, skip the node + ; as well. + + (define (loop forest keep? new-forest) + (if (null? forest) (values (reverse new-forest) keep?) + (let ((node (car forest))) + (if keep? + (cond ; accumulate mode + ((beg-pred node) => ; see if the node starts the skip range + (lambda (repl-branches) ; if so, skip/replace the node + (loop (cdr forest) #f + (append (reverse repl-branches) new-forest)))) + ((not (pair? node)) ; it's an atom, keep it + (loop (cdr forest) keep? (cons node new-forest))) + (else + (let*-values + (((node?) (symbol? (car node))) ; or is it a nodelist? + ((new-kids keep?) ; traverse its children + (loop (if node? (cdr node) node) #t '()))) + (loop (cdr forest) keep? + (cons + (if node? (cons (car node) new-kids) new-kids) + new-forest))))) + ; skip mode + (cond + ((end-pred node) => ; end the skip range + (lambda (repl-branches) ; repl-branches will be re-scanned + (loop (append repl-branches (cdr forest)) #t + new-forest))) + ((not (pair? node)) ; it's an atom, skip it + (loop (cdr forest) keep? new-forest)) + (else + (let*-values + (((node?) (symbol? (car node))) ; or is it a nodelist? + ((new-kids keep?) ; traverse its children + (loop (if node? (cdr node) node) #f '()))) + (loop (cdr forest) keep? + (if (or keep? (pair? new-kids)) + (cons + (if node? (cons (car node) new-kids) new-kids) + new-forest) + new-forest) ; if all kids are skipped + )))))))) ; skip the node too + + (let*-values (((new-forest keep?) (loop forest #t '()))) + new-forest)) + diff --git a/module/sxml/upstream/SXPath-old.scm b/module/sxml/upstream/SXPath-old.scm new file mode 100644 index 000000000..cf4526ed6 --- /dev/null +++ b/module/sxml/upstream/SXPath-old.scm @@ -0,0 +1,1216 @@ +; XML processing in Scheme +; SXPath -- SXML Query Language +; +; SXPath is a query language for SXML, an instance of XML Information +; set (Infoset) in the form of s-expressions. See SSAX.scm for the +; definition of SXML and more details. SXPath is also a translation into +; Scheme of an XML Path Language, XPath: +; http://www.w3.org/TR/xpath +; XPath and SXPath describe means of selecting a set of Infoset's items +; or their properties. +; +; To facilitate queries, XPath maps the XML Infoset into an explicit +; tree, and introduces important notions of a location path and a +; current, context node. A location path denotes a selection of a set of +; nodes relative to a context node. Any XPath tree has a distinguished, +; root node -- which serves as the context node for absolute location +; paths. Location path is recursively defined as a location step joined +; with a location path. A location step is a simple query of the +; database relative to a context node. A step may include expressions +; that further filter the selected set. Each node in the resulting set +; is used as a context node for the adjoining location path. The result +; of the step is a union of the sets returned by the latter location +; paths. +; +; The SXML representation of the XML Infoset (see SSAX.scm) is rather +; suitable for querying as it is. Bowing to the XPath specification, +; we will refer to SXML information items as 'Nodes': +; <Node> ::= <Element> | <attributes-coll> | <attrib> +; | "text string" | <PI> +; This production can also be described as +; <Node> ::= (name . <Nodeset>) | "text string" +; An (ordered) set of nodes is just a list of the constituent nodes: +; <Nodeset> ::= (<Node> ...) +; Nodesets, and Nodes other than text strings are both lists. A +; <Nodeset> however is either an empty list, or a list whose head is not +; a symbol. A symbol at the head of a node is either an XML name (in +; which case it's a tag of an XML element), or an administrative name +; such as '@'. This uniform list representation makes processing rather +; simple and elegant, while avoiding confusion. The multi-branch tree +; structure formed by the mutually-recursive datatypes <Node> and +; <Nodeset> lends itself well to processing by functional languages. +; +; A location path is in fact a composite query over an XPath tree or +; its branch. A singe step is a combination of a projection, selection +; or a transitive closure. Multiple steps are combined via join and +; union operations. This insight allows us to _elegantly_ implement +; XPath as a sequence of projection and filtering primitives -- +; converters -- joined by _combinators_. Each converter takes a node +; and returns a nodeset which is the result of the corresponding query +; relative to that node. A converter can also be called on a set of +; nodes. In that case it returns a union of the corresponding queries over +; each node in the set. The union is easily implemented as a list +; append operation as all nodes in a SXML tree are considered +; distinct, by XPath conventions. We also preserve the order of the +; members in the union. Query combinators are high-order functions: +; they take converter(s) (which is a Node|Nodeset -> Nodeset function) +; and compose or otherwise combine them. We will be concerned with +; only relative location paths [XPath]: an absolute location path is a +; relative path applied to the root node. +; +; Similarly to XPath, SXPath defines full and abbreviated notations +; for location paths. In both cases, the abbreviated notation can be +; mechanically expanded into the full form by simple rewriting +; rules. In case of SXPath the corresponding rules are given as +; comments to a sxpath function, below. The regression test suite at +; the end of this file shows a representative sample of SXPaths in +; both notations, juxtaposed with the corresponding XPath +; expressions. Most of the samples are borrowed literally from the +; XPath specification, while the others are adjusted for our running +; example, tree1. +; +; To do: +; Rename filter to node-filter or ns-filter +; Use ;=== for chapters, ;--- for sections, and ;^^^ for end sections +; +; $Id: SXPath-old.scm,v 1.4 2004/07/07 16:02:31 sperber Exp $ + + + ; See http://pobox.com/~oleg/ftp/Scheme/myenv.scm + ; See http://pobox.com/~oleg/ftp/Scheme/myenv-scm.scm + ; See http://pobox.com/~oleg/ftp/Scheme/myenv-bigloo.scm +;(module SXPath +; (include "myenv-bigloo.scm")) ; For use with Bigloo 2.2b +;(load "myenv-scm.scm") ; For use with SCM v5d2 +;(include "myenv.scm") ; For use with Gambit-C 3.0 + + + +(define (nodeset? x) + (or (and (pair? x) (not (symbol? (car x)))) (null? x))) + +;------------------------- +; Basic converters and applicators +; A converter is a function +; type Converter = Node|Nodeset -> Nodeset +; A converter can also play a role of a predicate: in that case, if a +; converter, applied to a node or a nodeset, yields a non-empty +; nodeset, the converter-predicate is deemed satisfied. Throughout +; this file a nil nodeset is equivalent to #f in denoting a failure. + +; The following function implements a 'Node test' as defined in +; Sec. 2.3 of XPath document. A node test is one of the components of a +; location step. It is also a converter-predicate in SXPath. +; +; The function node-typeof? takes a type criterion and returns a function, +; which, when applied to a node, will tell if the node satisfies +; the test. +; node-typeof? :: Crit -> Node -> Boolean +; +; The criterion 'crit' is a symbol, one of the following: +; id - tests if the Node has the right name (id) +; @ - tests if the Node is an <attributes-coll> +; * - tests if the Node is an <Element> +; *text* - tests if the Node is a text node +; *PI* - tests if the Node is a PI node +; *any* - #t for any type of Node + +(define (node-typeof? crit) + (lambda (node) + (case crit + ((*) (and (pair? node) (not (memq (car node) '(@ *PI*))))) + ((*any*) #t) + ((*text*) (string? node)) + (else + (and (pair? node) (eq? crit (car node)))) +))) + + +; Curried equivalence converter-predicates +(define (node-eq? other) + (lambda (node) + (eq? other node))) + +(define (node-equal? other) + (lambda (node) + (equal? other node))) + +; node-pos:: N -> Nodeset -> Nodeset, or +; node-pos:: N -> Converter +; Select the N'th element of a Nodeset and return as a singular Nodeset; +; Return an empty nodeset if the Nth element does not exist. +; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset, +; if exists; ((node-pos 2) Nodeset) selects the Node after that, if +; exists. +; N can also be a negative number: in that case the node is picked from +; the tail of the list. +; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset; +; ((node-pos -2) Nodeset) selects the last but one node, if exists. + +(define (node-pos n) + (lambda (nodeset) + (cond + ((not (nodeset? nodeset)) '()) + ((null? nodeset) nodeset) + ((eqv? n 1) (list (car nodeset))) + ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset)) + (else + (assert (positive? n)) + ((node-pos (dec n)) (cdr nodeset)))))) + +; filter:: Converter -> Converter +; A filter applicator, which introduces a filtering context. The argument +; converter is considered a predicate, with either #f or nil result meaning +; failure. +(define (filter pred?) + (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) + (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '())) + (if (null? lst) + (reverse res) + (let ((pred-result (pred? (car lst)))) + (loop (cdr lst) + (if (and pred-result (not (null? pred-result))) + (cons (car lst) res) + res))))))) + +; take-until:: Converter -> Converter, or +; take-until:: Pred -> Node|Nodeset -> Nodeset +; Given a converter-predicate and a nodeset, apply the predicate to +; each element of the nodeset, until the predicate yields anything but #f or +; nil. Return the elements of the input nodeset that have been processed +; till that moment (that is, which fail the predicate). +; take-until is a variation of the filter above: take-until passes +; elements of an ordered input set till (but not including) the first +; element that satisfies the predicate. +; The nodeset returned by ((take-until (not pred)) nset) is a subset -- +; to be more precise, a prefix -- of the nodeset returned by +; ((filter pred) nset) + +(define (take-until pred?) + (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) + (let loop ((lst (if (nodeset? lst) lst (list lst)))) + (if (null? lst) lst + (let ((pred-result (pred? (car lst)))) + (if (and pred-result (not (null? pred-result))) + '() + (cons (car lst) (loop (cdr lst))))) + )))) + + +; take-after:: Converter -> Converter, or +; take-after:: Pred -> Node|Nodeset -> Nodeset +; Given a converter-predicate and a nodeset, apply the predicate to +; each element of the nodeset, until the predicate yields anything but #f or +; nil. Return the elements of the input nodeset that have not been processed: +; that is, return the elements of the input nodeset that follow the first +; element that satisfied the predicate. +; take-after along with take-until partition an input nodeset into three +; parts: the first element that satisfies a predicate, all preceding +; elements and all following elements. + +(define (take-after pred?) + (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) + (let loop ((lst (if (nodeset? lst) lst (list lst)))) + (if (null? lst) lst + (let ((pred-result (pred? (car lst)))) + (if (and pred-result (not (null? pred-result))) + (cdr lst) + (loop (cdr lst)))) + )))) + +; Apply proc to each element of lst and return the list of results. +; if proc returns a nodeset, splice it into the result +; +; From another point of view, map-union is a function Converter->Converter, +; which places an argument-converter in a joining context. + +(define (map-union proc lst) + (if (null? lst) lst + (let ((proc-res (proc (car lst)))) + ((if (nodeset? proc-res) append cons) + proc-res (map-union proc (cdr lst)))))) + +; node-reverse :: Converter, or +; node-reverse:: Node|Nodeset -> Nodeset +; Reverses the order of nodes in the nodeset +; This basic converter is needed to implement a reverse document order +; (see the XPath Recommendation). +(define node-reverse + (lambda (node-or-nodeset) + (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset) + (reverse node-or-nodeset)))) + +; node-trace:: String -> Converter +; (node-trace title) is an identity converter. In addition it prints out +; a node or nodeset it is applied to, prefixed with the 'title'. +; This converter is very useful for debugging. + +(define (node-trace title) + (lambda (node-or-nodeset) + (cout nl "-->") + (display title) + (display " :") + (pretty-print node-or-nodeset) + node-or-nodeset)) + + +;------------------------- +; Converter combinators +; +; Combinators are higher-order functions that transmogrify a converter +; or glue a sequence of converters into a single, non-trivial +; converter. The goal is to arrive at converters that correspond to +; XPath location paths. +; +; From a different point of view, a combinator is a fixed, named +; _pattern_ of applying converters. Given below is a complete set of +; such patterns that together implement XPath location path +; specification. As it turns out, all these combinators can be built +; from a small number of basic blocks: regular functional composition, +; map-union and filter applicators, and the nodeset union. + + + +; select-kids:: Pred -> Node -> Nodeset +; Given a Node, return an (ordered) subset its children that satisfy +; the Pred (a converter, actually) +; select-kids:: Pred -> Nodeset -> Nodeset +; The same as above, but select among children of all the nodes in +; the Nodeset +; +; More succinctly, the signature of this function is +; select-kids:: Converter -> Converter + +(define (select-kids test-pred?) + (lambda (node) ; node or node-set + (cond + ((null? node) node) + ((not (pair? node)) '()) ; No children + ((symbol? (car node)) + ((filter test-pred?) (cdr node))) ; it's a single node + (else (map-union (select-kids test-pred?) node))))) + + +; node-self:: Pred -> Node -> Nodeset, or +; node-self:: Converter -> Converter +; Similar to select-kids but apply to the Node itself rather +; than to its children. The resulting Nodeset will contain either one +; component, or will be empty (if the Node failed the Pred). +(define node-self filter) + + +; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or +; node-join:: [Converter] -> Converter +; join the sequence of location steps or paths as described +; in the title comments above. +(define (node-join . selectors) + (lambda (nodeset) ; Nodeset or node + (let loop ((nodeset nodeset) (selectors selectors)) + (if (null? selectors) nodeset + (loop + (if (nodeset? nodeset) + (map-union (car selectors) nodeset) + ((car selectors) nodeset)) + (cdr selectors)))))) + + +; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or +; node-reduce:: [Converter] -> Converter +; A regular functional composition of converters. +; From a different point of view, +; ((apply node-reduce converters) nodeset) +; is equivalent to +; (foldl apply nodeset converters) +; i.e., folding, or reducing, a list of converters with the nodeset +; as a seed. +(define (node-reduce . converters) + (lambda (nodeset) ; Nodeset or node + (let loop ((nodeset nodeset) (converters converters)) + (if (null? converters) nodeset + (loop ((car converters) nodeset) (cdr converters)))))) + + +; node-or:: [Converter] -> Converter +; This combinator applies all converters to a given node and +; produces the union of their results. +; This combinator corresponds to a union, '|' operation for XPath +; location paths. +; (define (node-or . converters) +; (lambda (node-or-nodeset) +; (if (null? converters) node-or-nodeset +; (append +; ((car converters) node-or-nodeset) +; ((apply node-or (cdr converters)) node-or-nodeset))))) +; More optimal implementation follows +(define (node-or . converters) + (lambda (node-or-nodeset) + (let loop ((result '()) (converters converters)) + (if (null? converters) result + (loop (append result (or ((car converters) node-or-nodeset) '())) + (cdr converters)))))) + + +; node-closure:: Converter -> Converter +; Select all _descendants_ of a node that satisfy a converter-predicate. +; This combinator is similar to select-kids but applies to +; grand... children as well. +; This combinator implements the "descendant::" XPath axis +; Conceptually, this combinator can be expressed as +; (define (node-closure f) +; (node-or +; (select-kids f) +; (node-reduce (select-kids (node-typeof? '*)) (node-closure f)))) +; This definition, as written, looks somewhat like a fixpoint, and it +; will run forever. It is obvious however that sooner or later +; (select-kids (node-typeof? '*)) will return an empty nodeset. At +; this point further iterations will no longer affect the result and +; can be stopped. + +(define (node-closure test-pred?) + (lambda (node) ; Nodeset or node + (let loop ((parent node) (result '())) + (if (null? parent) result + (loop ((select-kids (node-typeof? '*)) parent) + (append result + ((select-kids test-pred?) parent))) + )))) + +; node-parent:: RootNode -> Converter +; (node-parent rootnode) yields a converter that returns a parent of a +; node it is applied to. If applied to a nodeset, it returns the list +; of parents of nodes in the nodeset. The rootnode does not have +; to be the root node of the whole SXML tree -- it may be a root node +; of a branch of interest. +; Given the notation of Philip Wadler's paper on semantics of XSLT, +; parent(x) = { y | y=subnode*(root), x=subnode(y) } +; Therefore, node-parent is not the fundamental converter: it can be +; expressed through the existing ones. Yet node-parent is a rather +; convenient converter. It corresponds to a parent:: axis of SXPath. +; Note that the parent:: axis can be used with an attribute node as well! + +(define (node-parent rootnode) + (lambda (node) ; Nodeset or node + (if (nodeset? node) (map-union (node-parent rootnode) node) + (let ((pred + (node-or + (node-reduce + (node-self (node-typeof? '*)) + (select-kids (node-eq? node))) + (node-join + (select-kids (node-typeof? '@)) + (select-kids (node-eq? node)))))) + ((node-or + (node-self pred) + (node-closure pred)) + rootnode))))) + +;------------------------- +; Evaluate an abbreviated SXPath +; sxpath:: AbbrPath -> Converter, or +; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset +; AbbrPath is a list. It is translated to the full SXPath according +; to the following rewriting rules +; (sxpath '()) -> (node-join) +; (sxpath '(path-component ...)) -> +; (node-join (sxpath1 path-component) (sxpath '(...))) +; (sxpath1 '//) -> (node-or +; (node-self (node-typeof? '*any*)) +; (node-closure (node-typeof? '*any*))) +; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x)) +; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x)) +; (sxpath1 ?symbol) -> (select-kids (node-typeof? ?symbol) +; (sxpath1 procedure) -> procedure +; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...)) +; (sxpath1 '(path reducer ...)) -> +; (node-reduce (sxpath path) (sxpathr reducer) ...) +; (sxpathr number) -> (node-pos number) +; (sxpathr path-filter) -> (filter (sxpath path-filter)) + +(define (sxpath path) + (lambda (nodeset) + (let loop ((nodeset nodeset) (path path)) + (cond + ((null? path) nodeset) + ((nodeset? nodeset) + (map-union (sxpath path) nodeset)) + ((procedure? (car path)) + (loop ((car path) nodeset) (cdr path))) + ((eq? '// (car path)) + (loop + ((if (nodeset? nodeset) append cons) nodeset + ((node-closure (node-typeof? '*any*)) nodeset)) + (cdr path))) + ((symbol? (car path)) + (loop ((select-kids (node-typeof? (car path))) nodeset) + (cdr path))) + ((and (pair? (car path)) (eq? 'equal? (caar path))) + (loop ((select-kids (apply node-equal? (cdar path))) nodeset) + (cdr path))) + ((and (pair? (car path)) (eq? 'eq? (caar path))) + (loop ((select-kids (apply node-eq? (cdar path))) nodeset) + (cdr path))) + ((pair? (car path)) + (let reducer ((nodeset + (if (symbol? (caar path)) + ((select-kids (node-typeof? (caar path))) nodeset) + (loop nodeset (caar path)))) + (reducing-path (cdar path))) + (cond + ((null? reducing-path) (loop nodeset (cdr path))) + ((number? (car reducing-path)) + (reducer ((node-pos (car reducing-path)) nodeset) + (cdr reducing-path))) + (else + (reducer ((filter (sxpath (car reducing-path))) nodeset) + (cdr reducing-path)))))) + (else + (error "Invalid path step: " (car path))) +)))) + +;------------------------------------------------------------------------ +; Sample XPath/SXPath expressions: regression test suite for the +; implementation above. + +; A running example + +(define tree1 + '(html + (head (title "Slides")) + (body + (p (@ (align "center")) + (table (@ (style "font-size: x-large")) + (tr + (td (@ (align "right")) "Talks ") + (td (@ (align "center")) " = ") + (td " slides + transition")) + (tr (td) + (td (@ (align "center")) " = ") + (td " data + control")) + (tr (td) + (td (@ (align "center")) " = ") + (td " programs")))) + (ul + (li (a (@ (href "slides/slide0001.gif")) "Introduction")) + (li (a (@ (href "slides/slide0010.gif")) "Summary"))) + ))) + + +; Example from a posting "Re: DrScheme and XML", +; Shriram Krishnamurthi, comp.lang.scheme, Nov. 26. 1999. +; http://www.deja.com/getdoc.xp?AN=553507805 +(define tree3 + '(poem (@ (title "The Lovesong of J. Alfred Prufrock") + (poet "T. S. Eliot")) + (stanza + (line "Let us go then, you and I,") + (line "When the evening is spread out against the sky") + (line "Like a patient etherized upon a table:")) + (stanza + (line "In the room the women come and go") + (line "Talking of Michaelangelo.")))) + +; Validation Test harness + +(define-syntax run-test + (syntax-rules (define) + ((run-test "scan-exp" (define vars body)) + (define vars (run-test "scan-exp" body))) + ((run-test "scan-exp" ?body) + (letrec-syntax + ((scan-exp ; (scan-exp body k) + (syntax-rules (quote quasiquote !) + ((scan-exp '() (k-head ! . args)) + (k-head '() . args)) + ((scan-exp (quote (hd . tl)) k) + (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k))) + ((scan-exp (quasiquote (hd . tl)) k) + (scan-lit-lst (hd . tl) (do-wrap ! quasiquote k))) + ((scan-exp (quote x) (k-head ! . args)) + (k-head + (if (string? (quote x)) (string->symbol (quote x)) (quote x)) + . args)) + ((scan-exp (hd . tl) k) + (scan-exp hd (do-tl ! scan-exp tl k))) + ((scan-exp x (k-head ! . args)) + (k-head x . args)))) + (do-tl + (syntax-rules (!) + ((do-tl processed-hd fn () (k-head ! . args)) + (k-head (processed-hd) . args)) + ((do-tl processed-hd fn old-tl k) + (fn old-tl (do-cons ! processed-hd k))))) + (do-cons + (syntax-rules (!) + ((do-cons processed-tl processed-hd (k-head ! . args)) + (k-head (processed-hd . processed-tl) . args)))) + (do-wrap + (syntax-rules (!) + ((do-wrap val fn (k-head ! . args)) + (k-head (fn val) . args)))) + (do-finish + (syntax-rules () + ((do-finish new-body) new-body))) + + (scan-lit-lst ; scan literal list + (syntax-rules (quote unquote unquote-splicing !) + ((scan-lit-lst '() (k-head ! . args)) + (k-head '() . args)) + ((scan-lit-lst (quote (hd . tl)) k) + (do-tl quote scan-lit-lst ((hd . tl)) k)) + ((scan-lit-lst (unquote x) k) + (scan-exp x (do-wrap ! unquote k))) + ((scan-lit-lst (unquote-splicing x) k) + (scan-exp x (do-wrap ! unquote-splicing k))) + ((scan-lit-lst (quote x) (k-head ! . args)) + (k-head + ,(if (string? (quote x)) (string->symbol (quote x)) (quote x)) + . args)) + ((scan-lit-lst (hd . tl) k) + (scan-lit-lst hd (do-tl ! scan-lit-lst tl k))) + ((scan-lit-lst x (k-head ! . args)) + (k-head x . args)))) + ) + (scan-exp ?body (do-finish !)))) + ((run-test body ...) + (begin + (run-test "scan-exp" body) ...)) +)) + +; Overwrite the above macro to switch the tests off +; (define-macro (run-test selector node expected-result) #f) + +; Location path, full form: child::para +; Location path, abbreviated form: para +; selects the para element children of the context node + +(let ((tree + '(elem (@) (para (@) "para") (br (@)) "cdata" (para (@) "second par")) + ) + (expected '((para (@) "para") (para (@) "second par"))) + ) + (run-test (select-kids (node-typeof? 'para)) tree expected) + (run-test (sxpath '(para)) tree expected) +) + +; Location path, full form: child::* +; Location path, abbreviated form: * +; selects all element children of the context node + +(let ((tree + '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par")) + ) + (expected + '((para (@) "para") (br (@)) (para "second par"))) + ) + (run-test (select-kids (node-typeof? '*)) tree expected) + (run-test (sxpath '(*)) tree expected) +) + + + +; Location path, full form: child::text() +; Location path, abbreviated form: text() +; selects all text node children of the context node +(let ((tree + '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par")) + ) + (expected + '("cdata")) + ) + (run-test (select-kids (node-typeof? '*text*)) tree expected) + (run-test (sxpath '(*text*)) tree expected) +) + + +; Location path, full form: child::node() +; Location path, abbreviated form: node() +; selects all the children of the context node, whatever their node type +(let* ((tree + '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par")) + ) + (expected (cdr tree)) + ) + (run-test (select-kids (node-typeof? '*any*)) tree expected) + (run-test (sxpath '(*any*)) tree expected) +) + +; Location path, full form: child::*/child::para +; Location path, abbreviated form: */para +; selects all para grandchildren of the context node + +(let ((tree + '(elem (@) (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para "third para"))) + ) + (expected + '((para "third para"))) + ) + (run-test + (node-join (select-kids (node-typeof? '*)) + (select-kids (node-typeof? 'para))) + tree expected) + (run-test (sxpath '(* para)) tree expected) +) + + +; Location path, full form: attribute::name +; Location path, abbreviated form: @name +; selects the 'name' attribute of the context node + +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para (@) "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (expected + '((name "elem"))) + ) + (run-test + (node-join (select-kids (node-typeof? '@)) + (select-kids (node-typeof? 'name))) + tree expected) + (run-test (sxpath '(@ name)) tree expected) +) + +; Location path, full form: attribute::* +; Location path, abbreviated form: @* +; selects all the attributes of the context node +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (expected + '((name "elem") (id "idz"))) + ) + (run-test + (node-join (select-kids (node-typeof? '@)) + (select-kids (node-typeof? '*))) + tree expected) + (run-test (sxpath '(@ *)) tree expected) +) + + +; Location path, full form: descendant::para +; Location path, abbreviated form: .//para +; selects the para element descendants of the context node + +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (expected + '((para (@) "para") (para "second par") (para (@) "third para"))) + ) + (run-test + (node-closure (node-typeof? 'para)) + tree expected) + (run-test (sxpath '(// para)) tree expected) +) + +; Location path, full form: self::para +; Location path, abbreviated form: _none_ +; selects the context node if it is a para element; otherwise selects nothing + +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + ) + (run-test (node-self (node-typeof? 'para)) tree '()) + (run-test (node-self (node-typeof? 'elem)) tree (list tree)) +) + +; Location path, full form: descendant-or-self::node() +; Location path, abbreviated form: // +; selects the context node, all the children (including attribute nodes) +; of the context node, and all the children of all the (element) +; descendants of the context node. +; This is _almost_ a powerset of the context node. +(let* ((tree + '(para (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (expected + (cons tree + (append (cdr tree) + '((@) "para" (@) "second par" + (@ (name "aa")) (para (@) "third para") + (@) "third para")))) + ) + (run-test + (node-or + (node-self (node-typeof? '*any*)) + (node-closure (node-typeof? '*any*))) + tree expected) + (run-test (sxpath '(//)) tree expected) +) + +; Location path, full form: ancestor::div +; Location path, abbreviated form: _none_ +; selects all div ancestors of the context node +; This Location expression is equivalent to the following: +; /descendant-or-self::div[descendant::node() = curr_node] +; This shows that the ancestor:: axis is actually redundant. Still, +; it can be emulated as the following SXPath expression demonstrates. + +; The insight behind "ancestor::div" -- selecting all "div" ancestors +; of the current node -- is +; S[ancestor::div] context_node = +; { y | y=subnode*(root), context_node=subnode(subnode*(y)), +; isElement(y), name(y) = "div" } +; We observe that +; { y | y=subnode*(root), pred(y) } +; can be expressed in SXPath as +; ((node-or (node-self pred) (node-closure pred)) root-node) +; The composite predicate 'isElement(y) & name(y) = "div"' corresponds to +; (node-self (node-typeof? 'div)) in SXPath. Finally, filter +; context_node=subnode(subnode*(y)) is tantamount to +; (node-closure (node-eq? context-node)), whereas node-reduce denotes the +; the composition of converters-predicates in the filtering context. + +(let* + ((root + '(div (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para (@) "second par") + (div (@ (name "aa")) (para (@) "third para")))) + (context-node ; /descendant::any()[child::text() == "third para"] + (car + ((node-closure + (select-kids + (node-equal? "third para"))) + root))) + (pred + (node-reduce (node-self (node-typeof? 'div)) + (node-closure (node-eq? context-node)) + )) + ) + (run-test + (node-or + (node-self pred) + (node-closure pred)) + root + (cons root + '((div (@ (name "aa")) (para (@) "third para"))))) +) + + + +; Location path, full form: child::div/descendant::para +; Location path, abbreviated form: div//para +; selects the para element descendants of the div element +; children of the context node + +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para") + (div (para "fourth para")))) + ) + (expected + '((para (@) "third para") (para "fourth para"))) + ) + (run-test + (node-join + (select-kids (node-typeof? 'div)) + (node-closure (node-typeof? 'para))) + tree expected) + (run-test (sxpath '(div // para)) tree expected) +) + + +; Location path, full form: /descendant::olist/child::item +; Location path, abbreviated form: //olist/item +; selects all the item elements that have an olist parent (which is not root) +; and that are in the same document as the context node +; See the following test. + +; Location path, full form: /descendant::td/attribute::align +; Location path, abbreviated form: //td/@align +; Selects 'align' attributes of all 'td' elements in tree1 +(let ((tree tree1) + (expected + '((align "right") (align "center") (align "center") (align "center")) + )) + (run-test + (node-join + (node-closure (node-typeof? 'td)) + (select-kids (node-typeof? '@)) + (select-kids (node-typeof? 'align))) + tree expected) + (run-test (sxpath '(// td @ align)) tree expected) +) + + +; Location path, full form: /descendant::td[attribute::align] +; Location path, abbreviated form: //td[@align] +; Selects all td elements that have an attribute 'align' in tree1 +(let ((tree tree1) + (expected + '((td (@ (align "right")) "Talks ") (td (@ (align "center")) " = ") + (td (@ (align "center")) " = ") (td (@ (align "center")) " = ")) + )) + (run-test + (node-reduce + (node-closure (node-typeof? 'td)) + (filter + (node-join + (select-kids (node-typeof? '@)) + (select-kids (node-typeof? 'align))))) + tree expected) + (run-test (sxpath `(// td ,(node-self (sxpath '(@ align))))) tree expected) + (run-test (sxpath '(// (td (@ align)))) tree expected) + (run-test (sxpath '(// ((td) (@ align)))) tree expected) + ; note! (sxpath ...) is a converter. Therefore, it can be used + ; as any other converter, for example, in the full-form SXPath. + ; Thus we can mix the full and abbreviated form SXPath's freely. + (run-test + (node-reduce + (node-closure (node-typeof? 'td)) + (filter + (sxpath '(@ align)))) + tree expected) +) + + +; Location path, full form: /descendant::td[attribute::align = "right"] +; Location path, abbreviated form: //td[@align = "right"] +; Selects all td elements that have an attribute align = "right" in tree1 +(let ((tree tree1) + (expected + '((td (@ (align "right")) "Talks ")) + )) + (run-test + (node-reduce + (node-closure (node-typeof? 'td)) + (filter + (node-join + (select-kids (node-typeof? '@)) + (select-kids (node-equal? '(align "right")))))) + tree expected) + (run-test (sxpath '(// (td (@ (equal? (align "right")))))) tree expected) +) + +; Location path, full form: child::para[position()=1] +; Location path, abbreviated form: para[1] +; selects the first para child of the context node +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (expected + '((para (@) "para")) + )) + (run-test + (node-reduce + (select-kids (node-typeof? 'para)) + (node-pos 1)) + tree expected) + (run-test (sxpath '((para 1))) tree expected) +) + +; Location path, full form: child::para[position()=last()] +; Location path, abbreviated form: para[last()] +; selects the last para child of the context node +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (expected + '((para "second par")) + )) + (run-test + (node-reduce + (select-kids (node-typeof? 'para)) + (node-pos -1)) + tree expected) + (run-test (sxpath '((para -1))) tree expected) +) + +; Illustrating the following Note of Sec 2.5 of XPath: +; "NOTE: The location path //para[1] does not mean the same as the +; location path /descendant::para[1]. The latter selects the first +; descendant para element; the former selects all descendant para +; elements that are the first para children of their parents." + +(let ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + ) + (run-test + (node-reduce ; /descendant::para[1] in SXPath + (node-closure (node-typeof? 'para)) + (node-pos 1)) + tree '((para (@) "para"))) + (run-test (sxpath '(// (para 1))) tree + '((para (@) "para") (para (@) "third para"))) +) + +; Location path, full form: parent::node() +; Location path, abbreviated form: .. +; selects the parent of the context node. The context node may be +; an attribute node! +; For the last test: +; Location path, full form: parent::*/attribute::name +; Location path, abbreviated form: ../@name +; Selects the name attribute of the parent of the context node + +(let* ((tree + '(elem (@ (name "elem") (id "idz")) + (para (@) "para") (br (@)) "cdata" (para "second par") + (div (@ (name "aa")) (para (@) "third para"))) + ) + (para1 ; the first para node + (car ((sxpath '(para)) tree))) + (para3 ; the third para node + (car ((sxpath '(div para)) tree))) + (div ; div node + (car ((sxpath '(// div)) tree))) + ) + (run-test + (node-parent tree) + para1 (list tree)) + (run-test + (node-parent tree) + para3 (list div)) + (run-test ; checking the parent of an attribute node + (node-parent tree) + ((sxpath '(@ name)) div) (list div)) + (run-test + (node-join + (node-parent tree) + (select-kids (node-typeof? '@)) + (select-kids (node-typeof? 'name))) + para3 '((name "aa"))) + (run-test + (sxpath `(,(node-parent tree) @ name)) + para3 '((name "aa"))) +) + +; Location path, full form: following-sibling::chapter[position()=1] +; Location path, abbreviated form: none +; selects the next chapter sibling of the context node +; The path is equivalent to +; let cnode = context-node +; in +; parent::* / child::chapter [take-after node_eq(self::*,cnode)] +; [position()=1] +(let* ((tree + '(document + (preface "preface") + (chapter (@ (id "one")) "Chap 1 text") + (chapter (@ (id "two")) "Chap 2 text") + (chapter (@ (id "three")) "Chap 3 text") + (chapter (@ (id "four")) "Chap 4 text") + (epilogue "Epilogue text") + (appendix (@ (id "A")) "App A text") + (References "References")) + ) + (a-node ; to be used as a context node + (car ((sxpath '(// (chapter (@ (equal? (id "two")))))) tree))) + (expected + '((chapter (@ (id "three")) "Chap 3 text"))) + ) + (run-test + (node-reduce + (node-join + (node-parent tree) + (select-kids (node-typeof? 'chapter))) + (take-after (node-eq? a-node)) + (node-pos 1) + ) + a-node expected) +) + +; preceding-sibling::chapter[position()=1] +; selects the previous chapter sibling of the context node +; The path is equivalent to +; let cnode = context-node +; in +; parent::* / child::chapter [take-until node_eq(self::*,cnode)] +; [position()=-1] +(let* ((tree + '(document + (preface "preface") + (chapter (@ (id "one")) "Chap 1 text") + (chapter (@ (id "two")) "Chap 2 text") + (chapter (@ (id "three")) "Chap 3 text") + (chapter (@ (id "four")) "Chap 4 text") + (epilogue "Epilogue text") + (appendix (@ (id "A")) "App A text") + (References "References")) + ) + (a-node ; to be used as a context node + (car ((sxpath '(// (chapter (@ (equal? (id "three")))))) tree))) + (expected + '((chapter (@ (id "two")) "Chap 2 text"))) + ) + (run-test + (node-reduce + (node-join + (node-parent tree) + (select-kids (node-typeof? 'chapter))) + (take-until (node-eq? a-node)) + (node-pos -1) + ) + a-node expected) +) + + +; /descendant::figure[position()=42] +; selects the forty-second figure element in the document +; See the next example, which is more general. + +; Location path, full form: +; child::table/child::tr[position()=2]/child::td[position()=3] +; Location path, abbreviated form: table/tr[2]/td[3] +; selects the third td of the second tr of the table +(let ((tree ((node-closure (node-typeof? 'p)) tree1)) + (expected + '((td " data + control")) + )) + (run-test + (node-join + (select-kids (node-typeof? 'table)) + (node-reduce (select-kids (node-typeof? 'tr)) + (node-pos 2)) + (node-reduce (select-kids (node-typeof? 'td)) + (node-pos 3))) + tree expected) + (run-test (sxpath '(table (tr 2) (td 3))) tree expected) +) + + +; Location path, full form: +; child::para[attribute::type='warning'][position()=5] +; Location path, abbreviated form: para[@type='warning'][5] +; selects the fifth para child of the context node that has a type +; attribute with value warning +(let ((tree + '(chapter + (para "para1") + (para (@ (type "warning")) "para 2") + (para (@ (type "warning")) "para 3") + (para (@ (type "warning")) "para 4") + (para (@ (type "warning")) "para 5") + (para (@ (type "warning")) "para 6")) + ) + (expected + '((para (@ (type "warning")) "para 6")) + )) + (run-test + (node-reduce + (select-kids (node-typeof? 'para)) + (filter + (node-join + (select-kids (node-typeof? '@)) + (select-kids (node-equal? '(type "warning"))))) + (node-pos 5)) + tree expected) + (run-test (sxpath '( (((para (@ (equal? (type "warning"))))) 5 ) )) + tree expected) + (run-test (sxpath '( (para (@ (equal? (type "warning"))) 5 ) )) + tree expected) +) + + +; Location path, full form: +; child::para[position()=5][attribute::type='warning'] +; Location path, abbreviated form: para[5][@type='warning'] +; selects the fifth para child of the context node if that child has a 'type' +; attribute with value warning +(let ((tree + '(chapter + (para "para1") + (para (@ (type "warning")) "para 2") + (para (@ (type "warning")) "para 3") + (para (@ (type "warning")) "para 4") + (para (@ (type "warning")) "para 5") + (para (@ (type "warning")) "para 6")) + ) + (expected + '((para (@ (type "warning")) "para 5")) + )) + (run-test + (node-reduce + (select-kids (node-typeof? 'para)) + (node-pos 5) + (filter + (node-join + (select-kids (node-typeof? '@)) + (select-kids (node-equal? '(type "warning")))))) + tree expected) + (run-test (sxpath '( (( (para 5)) (@ (equal? (type "warning")))))) + tree expected) + (run-test (sxpath '( (para 5 (@ (equal? (type "warning")))) )) + tree expected) +) + +; Location path, full form: +; child::*[self::chapter or self::appendix] +; Location path, semi-abbreviated form: *[self::chapter or self::appendix] +; selects the chapter and appendix children of the context node +(let ((tree + '(document + (preface "preface") + (chapter (@ (id "one")) "Chap 1 text") + (chapter (@ (id "two")) "Chap 2 text") + (chapter (@ (id "three")) "Chap 3 text") + (epilogue "Epilogue text") + (appendix (@ (id "A")) "App A text") + (References "References")) + ) + (expected + '((chapter (@ (id "one")) "Chap 1 text") + (chapter (@ (id "two")) "Chap 2 text") + (chapter (@ (id "three")) "Chap 3 text") + (appendix (@ (id "A")) "App A text")) + )) + (run-test + (node-join + (select-kids (node-typeof? '*)) + (filter + (node-or + (node-self (node-typeof? 'chapter)) + (node-self (node-typeof? 'appendix))))) + tree expected) + (run-test (sxpath `(* ,(node-or (node-self (node-typeof? 'chapter)) + (node-self (node-typeof? 'appendix))))) + tree expected) +) + + +; Location path, full form: child::chapter[child::title='Introduction'] +; Location path, abbreviated form: chapter[title = 'Introduction'] +; selects the chapter children of the context node that have one or more +; title children with string-value equal to Introduction +; See a similar example: //td[@align = "right"] above. + +; Location path, full form: child::chapter[child::title] +; Location path, abbreviated form: chapter[title] +; selects the chapter children of the context node that have one or +; more title children +; See a similar example //td[@align] above. + +(cerr nl "Example with tree3: extracting the first lines of every stanza" nl) +(let ((tree tree3) + (expected + '("Let us go then, you and I," "In the room the women come and go") + )) + (run-test + (node-join + (node-closure (node-typeof? 'stanza)) + (node-reduce + (select-kids (node-typeof? 'line)) (node-pos 1)) + (select-kids (node-typeof? '*text*))) + tree expected) + (run-test (sxpath '(// stanza (line 1) *text*)) tree expected) +) + diff --git a/module/sxml/upstream/assert.scm b/module/sxml/upstream/assert.scm new file mode 100644 index 000000000..e9e983d5b --- /dev/null +++ b/module/sxml/upstream/assert.scm @@ -0,0 +1,35 @@ +; +; syntax: assert ?expr ?expr ... [report: ?r-exp ?r-exp ...] +; +; If (and ?expr ?expr ...) evaluates to anything but #f, the result +; is the value of that expression. +; If (and ?expr ?expr ...) evaluates to #f, an error is reported. +; The error message will show the failed expressions, as well +; as the values of selected variables (or expressions, in general). +; The user may explicitly specify the expressions whose +; values are to be printed upon assertion failure -- as ?r-exp that +; follow the identifier 'report:' +; Typically, ?r-exp is either a variable or a string constant. +; If the user specified no ?r-exp, the values of variables that are +; referenced in ?expr will be printed upon the assertion failure. + +(define-syntax assert + (syntax-rules (report:) + ((assert "doit" (expr ...) (r-exp ...)) + (cond + ((and expr ...) => (lambda (x) x)) + (else + (error "assertion failure: ~a" (list '(and expr ...) r-exp ...))))) + ((assert "collect" (expr ...)) + (assert "doit" (expr ...) ())) + ((assert "collect" (expr ...) report: r-exp ...) + (assert "doit" (expr ...) (r-exp ...))) + ((assert "collect" (expr ...) expr1 stuff ...) + (assert "collect" (expr ... expr1) stuff ...)) + ((assert stuff ...) + (assert "collect" () stuff ...)))) + +(define-syntax assure + (syntax-rules () + ((assure exp error-msg) + (assert exp report: error-msg))))
\ No newline at end of file diff --git a/module/sxml/upstream/input-parse.scm b/module/sxml/upstream/input-parse.scm new file mode 100644 index 000000000..e0bccfda8 --- /dev/null +++ b/module/sxml/upstream/input-parse.scm @@ -0,0 +1,326 @@ +;**************************************************************************** +; Simple Parsing of input +; +; The following simple functions surprisingly often suffice to parse +; an input stream. They either skip, or build and return tokens, +; according to inclusion or delimiting semantics. The list of +; characters to expect, include, or to break at may vary from one +; invocation of a function to another. This allows the functions to +; easily parse even context-sensitive languages. +; +; EOF is generally frowned on, and thrown up upon if encountered. +; Exceptions are mentioned specifically. The list of expected characters +; (characters to skip until, or break-characters) may include an EOF +; "character", which is to be coded as symbol *eof* +; +; The input stream to parse is specified as a PORT, which is usually +; the last (and optional) argument. It defaults to the current input +; port if omitted. +; +; IMPORT +; This package relies on a function parser-error, which must be defined +; by a user of the package. The function has the following signature: +; parser-error PORT MESSAGE SPECIALISING-MSG* +; Many procedures of this package call parser-error to report a parsing +; error. The first argument is a port, which typically points to the +; offending character or its neighborhood. Most of the Scheme systems +; let the user query a PORT for the current position. MESSAGE is the +; description of the error. Other arguments supply more details about +; the problem. +; myenv.scm, myenv-bigloo.scm or a similar prelude is assumed. +; From SRFI-13, string-concatenate-reverse +; If a particular implementation lacks SRFI-13 support, please +; include the file srfi-13-local.scm +; +; $Id: input-parse.scm,v 1.7 2004/07/07 16:02:31 sperber Exp $ + +;------------------------------------------------------------------------ + +; -- procedure+: peek-next-char [PORT] +; advances to the next character in the PORT and peeks at it. +; This function is useful when parsing LR(1)-type languages +; (one-char-read-ahead). +; The optional argument PORT defaults to the current input port. + +(define-opt (peek-next-char (optional (port (current-input-port)))) + (read-char port) + (peek-char port)) + + +;------------------------------------------------------------------------ + +; -- procedure+: assert-curr-char CHAR-LIST STRING [PORT] +; Reads a character from the PORT and looks it up +; in the CHAR-LIST of expected characters +; If the read character was found among expected, it is returned +; Otherwise, the procedure writes a nasty message using STRING +; as a comment, and quits. +; The optional argument PORT defaults to the current input port. +; +(define-opt (assert-curr-char expected-chars comment + (optional (port (current-input-port)))) + (let ((c (read-char port))) + (if (memv c expected-chars) c + (parser-error port "Wrong character " c + " (0x" (if (eof-object? c) "*eof*" + (number->string (char->integer c) 16)) ") " + comment ". " expected-chars " expected")))) + + +; -- procedure+: skip-until CHAR-LIST [PORT] +; Reads and skips characters from the PORT until one of the break +; characters is encountered. This break character is returned. +; The break characters are specified as the CHAR-LIST. This list +; may include EOF, which is to be coded as a symbol *eof* +; +; -- procedure+: skip-until NUMBER [PORT] +; Skips the specified NUMBER of characters from the PORT and returns #f +; +; The optional argument PORT defaults to the current input port. + + +(define-opt (skip-until arg (optional (port (current-input-port))) ) + (cond + ((number? arg) ; skip 'arg' characters + (do ((i arg (dec i))) + ((not (positive? i)) #f) + (if (eof-object? (read-char port)) + (parser-error port "Unexpected EOF while skipping " + arg " characters")))) + (else ; skip until break-chars (=arg) + (let loop ((c (read-char port))) + (cond + ((memv c arg) c) + ((eof-object? c) + (if (memq '*eof* arg) c + (parser-error port "Unexpected EOF while skipping until " arg))) + (else (loop (read-char port)))))))) + + +; -- procedure+: skip-while CHAR-LIST [PORT] +; Reads characters from the PORT and disregards them, +; as long as they are mentioned in the CHAR-LIST. +; The first character (which may be EOF) peeked from the stream +; that is NOT a member of the CHAR-LIST is returned. This character +; is left on the stream. +; The optional argument PORT defaults to the current input port. + +(define-opt (skip-while skip-chars (optional (port (current-input-port))) ) + (do ((c (peek-char port) (peek-char port))) + ((not (memv c skip-chars)) c) + (read-char port))) + +; whitespace const + +;------------------------------------------------------------------------ +; Stream tokenizers + + +; -- procedure+: +; next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT] +; skips any number of the prefix characters (members of the +; PREFIX-CHAR-LIST), if any, and reads the sequence of characters +; up to (but not including) a break character, one of the +; BREAK-CHAR-LIST. +; The string of characters thus read is returned. +; The break character is left on the input stream +; The list of break characters may include EOF, which is to be coded as +; a symbol *eof*. Otherwise, EOF is fatal, generating an error message +; including a specified COMMENT-STRING (if any) +; +; The optional argument PORT defaults to the current input port. +; +; Note: since we can't tell offhand how large the token being read is +; going to be, we make a guess, pre-allocate a string, and grow it by +; quanta if necessary. The quantum is always the length of the string +; before it was extended the last time. Thus the algorithm does +; a Fibonacci-type extension, which has been proven optimal. +; Note, explicit port specification in read-char, peek-char helps. + +; Procedure: input-parse:init-buffer +; returns an initial buffer for next-token* procedures. +; The input-parse:init-buffer may allocate a new buffer per each invocation: +; (define (input-parse:init-buffer) (make-string 32)) +; Size 32 turns out to be fairly good, on average. +; That policy is good only when a Scheme system is multi-threaded with +; preemptive scheduling, or when a Scheme system supports shared substrings. +; In all the other cases, it's better for input-parse:init-buffer to +; return the same static buffer. next-token* functions return a copy +; (a substring) of accumulated data, so the same buffer can be reused. +; We shouldn't worry about an incoming token being too large: +; next-token will use another chunk automatically. Still, +; the best size for the static buffer is to allow most of the tokens to fit in. +; Using a static buffer _dramatically_ reduces the amount of produced garbage +; (e.g., during XML parsing). + +(define input-parse:init-buffer + (let ((buffer (make-string 512))) + (lambda () buffer))) + + + ; See a better version below +(define-opt (next-token-old prefix-skipped-chars break-chars + (optional (comment "") (port (current-input-port))) ) + (let* ((buffer (input-parse:init-buffer)) + (curr-buf-len (string-length buffer)) + (quantum curr-buf-len)) + (let loop ((i 0) (c (skip-while prefix-skipped-chars port))) + (cond + ((memv c break-chars) (substring buffer 0 i)) + ((eof-object? c) + (if (memq '*eof* break-chars) + (substring buffer 0 i) ; was EOF expected? + (parser-error port "EOF while reading a token " comment))) + (else + (if (>= i curr-buf-len) ; make space for i-th char in buffer + (begin ; -> grow the buffer by the quantum + (set! buffer (string-append buffer (make-string quantum))) + (set! quantum curr-buf-len) + (set! curr-buf-len (string-length buffer)))) + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (inc i) (peek-char port)) + ))))) + + +; A better version of next-token, which accumulates the characters +; in chunks, and later on reverse-concatenates them, using +; SRFI-13 if available. +; The overhead of copying characters is only 100% (or even smaller: bulk +; string copying might be well-optimised), compared to the (hypothetical) +; circumstance if we had known the size of the token beforehand. +; For small tokens, the code performs just as above. For large +; tokens, we expect an improvement. Note, the code also has no +; assignments. +; See next-token-comp.scm + +(define-opt (next-token prefix-skipped-chars break-chars + (optional (comment "") (port (current-input-port))) ) + (let outer ((buffer (input-parse:init-buffer)) (filled-buffer-l '()) + (c (skip-while prefix-skipped-chars port))) + (let ((curr-buf-len (string-length buffer))) + (let loop ((i 0) (c c)) + (cond + ((memv c break-chars) + (if (null? filled-buffer-l) (substring buffer 0 i) + (string-concatenate-reverse filled-buffer-l buffer i))) + ((eof-object? c) + (if (memq '*eof* break-chars) ; was EOF expected? + (if (null? filled-buffer-l) (substring buffer 0 i) + (string-concatenate-reverse filled-buffer-l buffer i)) + (parser-error port "EOF while reading a token " comment))) + ((>= i curr-buf-len) + (outer (make-string curr-buf-len) + (cons buffer filled-buffer-l) c)) + (else + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (inc i) (peek-char port)))))))) + +; -- procedure+: next-token-of INC-CHARSET [PORT] +; Reads characters from the PORT that belong to the list of characters +; INC-CHARSET. The reading stops at the first character which is not +; a member of the set. This character is left on the stream. +; All the read characters are returned in a string. +; +; -- procedure+: next-token-of PRED [PORT] +; Reads characters from the PORT for which PRED (a procedure of one +; argument) returns non-#f. The reading stops at the first character +; for which PRED returns #f. That character is left on the stream. +; All the results of evaluating of PRED up to #f are returned in a +; string. +; +; PRED is a procedure that takes one argument (a character +; or the EOF object) and returns a character or #f. The returned +; character does not have to be the same as the input argument +; to the PRED. For example, +; (next-token-of (lambda (c) +; (cond ((eof-object? c) #f) +; ((char-alphabetic? c) (char-downcase c)) +; (else #f)))) +; will try to read an alphabetic token from the current +; input port, and return it in lower case. +; +; The optional argument PORT defaults to the current input port. +; +; This procedure is similar to next-token but only it implements +; an inclusion rather than delimiting semantics. + +(define-opt (next-token-of incl-list/pred + (optional (port (current-input-port))) ) + (let* ((buffer (input-parse:init-buffer)) + (curr-buf-len (string-length buffer))) + (if (procedure? incl-list/pred) + (let outer ((buffer buffer) (filled-buffer-l '())) + (let loop ((i 0)) + (if (>= i curr-buf-len) ; make sure we have space + (outer (make-string curr-buf-len) (cons buffer filled-buffer-l)) + (let ((c (incl-list/pred (peek-char port)))) + (if c + (begin + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (inc i))) + ; incl-list/pred decided it had had enough + (if (null? filled-buffer-l) (substring buffer 0 i) + (string-concatenate-reverse filled-buffer-l buffer i))))))) + + ; incl-list/pred is a list of allowed characters + (let outer ((buffer buffer) (filled-buffer-l '())) + (let loop ((i 0)) + (if (>= i curr-buf-len) ; make sure we have space + (outer (make-string curr-buf-len) (cons buffer filled-buffer-l)) + (let ((c (peek-char port))) + (cond + ((not (memv c incl-list/pred)) + (if (null? filled-buffer-l) (substring buffer 0 i) + (string-concatenate-reverse filled-buffer-l buffer i))) + (else + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (inc i)))))))) + ))) + + +; -- procedure+: read-text-line [PORT] +; Reads one line of text from the PORT, and returns it as a string. +; A line is a (possibly empty) sequence of characters terminated +; by CR, CRLF or LF (or even the end of file). +; The terminating character (or CRLF combination) is removed from +; the input stream. The terminating character(s) is not a part +; of the return string either. +; If EOF is encountered before any character is read, the return +; value is EOF. +; +; The optional argument PORT defaults to the current input port. + +(define *read-line-breaks* (list char-newline char-return '*eof*)) + +(define-opt (read-text-line (optional (port (current-input-port))) ) + (if (eof-object? (peek-char port)) (peek-char port) + (let* ((line + (next-token '() *read-line-breaks* + "reading a line" port)) + (c (read-char port))) ; must be either \n or \r or EOF + (and (eqv? c char-return) (eqv? (peek-char port) #\newline) + (read-char port)) ; skip \n that follows \r + line))) + + +; -- procedure+: read-string N [PORT] +; Reads N characters from the PORT, and returns them in a string. +; If EOF is encountered before N characters are read, a shorter string +; will be returned. +; If N is not positive, an empty string will be returned. +; The optional argument PORT defaults to the current input port. + +(define-opt (read-string n (optional (port (current-input-port))) ) + (if (not (positive? n)) "" + (let ((buffer (make-string n))) + (let loop ((i 0) (c (read-char port))) + (if (eof-object? c) (substring buffer 0 i) + (let ((i1 (inc i))) + (string-set! buffer i c) + (if (= i1 n) buffer + (loop i1 (read-char port))))))))) + diff --git a/module/sxml/xpath.scm b/module/sxml/xpath.scm new file mode 100644 index 000000000..bdf4ae98a --- /dev/null +++ b/module/sxml/xpath.scm @@ -0,0 +1,493 @@ +;;;; (sxml xpath) -- SXPath +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. +;;;; Written 2001 by Oleg Kiselyov <oleg at pobox dot com> SXPath.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;@heading SXPath: SXML Query Language +;; +;; SXPath is a query language for SXML, an instance of XML Information +;; set (Infoset) in the form of s-expressions. See @code{(sxml ssax)} +;; for the definition of SXML and more details. SXPath is also a +;; translation into Scheme of an XML Path Language, +;; @uref{http://www.w3.org/TR/xpath,XPath}. XPath and SXPath describe +;; means of selecting a set of Infoset's items or their properties. +;; +;; To facilitate queries, XPath maps the XML Infoset into an explicit +;; tree, and introduces important notions of a location path and a +;; current, context node. A location path denotes a selection of a set of +;; nodes relative to a context node. Any XPath tree has a distinguished, +;; root node -- which serves as the context node for absolute location +;; paths. Location path is recursively defined as a location step joined +;; with a location path. A location step is a simple query of the +;; database relative to a context node. A step may include expressions +;; that further filter the selected set. Each node in the resulting set +;; is used as a context node for the adjoining location path. The result +;; of the step is a union of the sets returned by the latter location +;; paths. +;; +;; The SXML representation of the XML Infoset (see SSAX.scm) is rather +;; suitable for querying as it is. Bowing to the XPath specification, +;; we will refer to SXML information items as 'Nodes': +;;@example +;; <Node> ::= <Element> | <attributes-coll> | <attrib> +;; | "text string" | <PI> +;;@end example +;; This production can also be described as +;;@example +;; <Node> ::= (name . <Nodeset>) | "text string" +;;@end example +;; An (ordered) set of nodes is just a list of the constituent nodes: +;;@example +;; <Nodeset> ::= (<Node> ...) +;;@end example +;; Nodesets, and Nodes other than text strings are both lists. A +;; <Nodeset> however is either an empty list, or a list whose head is not +;; a symbol. A symbol at the head of a node is either an XML name (in +;; which case it's a tag of an XML element), or an administrative name +;; such as '@@'. This uniform list representation makes processing rather +;; simple and elegant, while avoiding confusion. The multi-branch tree +;; structure formed by the mutually-recursive datatypes <Node> and +;; <Nodeset> lends itself well to processing by functional languages. +;; +;; A location path is in fact a composite query over an XPath tree or +;; its branch. A singe step is a combination of a projection, selection +;; or a transitive closure. Multiple steps are combined via join and +;; union operations. This insight allows us to @emph{elegantly} +;; implement XPath as a sequence of projection and filtering primitives +;; -- converters -- joined by @dfn{combinators}. Each converter takes a +;; node and returns a nodeset which is the result of the corresponding +;; query relative to that node. A converter can also be called on a set +;; of nodes. In that case it returns a union of the corresponding +;; queries over each node in the set. The union is easily implemented as +;; a list append operation as all nodes in a SXML tree are considered +;; distinct, by XPath conventions. We also preserve the order of the +;; members in the union. Query combinators are high-order functions: +;; they take converter(s) (which is a Node|Nodeset -> Nodeset function) +;; and compose or otherwise combine them. We will be concerned with only +;; relative location paths [XPath]: an absolute location path is a +;; relative path applied to the root node. +;; +;; Similarly to XPath, SXPath defines full and abbreviated notations +;; for location paths. In both cases, the abbreviated notation can be +;; mechanically expanded into the full form by simple rewriting +;; rules. In case of SXPath the corresponding rules are given as +;; comments to a sxpath function, below. The regression test suite at +;; the end of this file shows a representative sample of SXPaths in +;; both notations, juxtaposed with the corresponding XPath +;; expressions. Most of the samples are borrowed literally from the +;; XPath specification, while the others are adjusted for our running +;; example, tree1. +;; +;;; Code: + +(define-module (sxml xpath) + #:use-module (ice-9 pretty-print) + #:export (nodeset? node-typeof? node-eq? node-equal? node-pos + filter take-until take-after map-union node-reverse + node-trace select-kids node-self node-join node-reduce + node-or node-closure node-parent + sxpath)) + +;; Upstream version: +; $Id: SXPath.scm,v 3.5 2001/01/12 23:20:35 oleg Exp oleg $ + +(define (nodeset? x) + (or (and (pair? x) (not (symbol? (car x)))) (null? x))) + +;------------------------- +; Basic converters and applicators +; A converter is a function +; type Converter = Node|Nodeset -> Nodeset +; A converter can also play a role of a predicate: in that case, if a +; converter, applied to a node or a nodeset, yields a non-empty +; nodeset, the converter-predicate is deemed satisfied. Throughout +; this file a nil nodeset is equivalent to #f in denoting a failure. + +; The following function implements a 'Node test' as defined in +; Sec. 2.3 of XPath document. A node test is one of the components of a +; location step. It is also a converter-predicate in SXPath. +; +; The function node-typeof? takes a type criterion and returns a function, +; which, when applied to a node, will tell if the node satisfies +; the test. +; node-typeof? :: Crit -> Node -> Boolean +; +; The criterion 'crit' is a symbol, one of the following: +; id - tests if the Node has the right name (id) +; @ - tests if the Node is an <attributes-coll> +; * - tests if the Node is an <Element> +; *text* - tests if the Node is a text node +; *PI* - tests if the Node is a PI node +; *any* - #t for any type of Node + +(define (node-typeof? crit) + (lambda (node) + (case crit + ((*) (and (pair? node) (not (memq (car node) '(@ *PI*))))) + ((*any*) #t) + ((*text*) (string? node)) + (else + (and (pair? node) (eq? crit (car node)))) +))) + + +; Curried equivalence converter-predicates +(define (node-eq? other) + (lambda (node) + (eq? other node))) + +(define (node-equal? other) + (lambda (node) + (equal? other node))) + +; node-pos:: N -> Nodeset -> Nodeset, or +; node-pos:: N -> Converter +; Select the N'th element of a Nodeset and return as a singular Nodeset; +; Return an empty nodeset if the Nth element does not exist. +; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset, +; if exists; ((node-pos 2) Nodeset) selects the Node after that, if +; exists. +; N can also be a negative number: in that case the node is picked from +; the tail of the list. +; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset; +; ((node-pos -2) Nodeset) selects the last but one node, if exists. + +(define (node-pos n) + (lambda (nodeset) + (cond + ((not (nodeset? nodeset)) '()) + ((null? nodeset) nodeset) + ((eqv? n 1) (list (car nodeset))) + ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset)) + (else + (or (positive? n) (error "yikes!")) + ((node-pos (1- n)) (cdr nodeset)))))) + +; filter:: Converter -> Converter +; A filter applicator, which introduces a filtering context. The argument +; converter is considered a predicate, with either #f or nil result meaning +; failure. +(define (filter pred?) + (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) + (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '())) + (if (null? lst) + (reverse res) + (let ((pred-result (pred? (car lst)))) + (loop (cdr lst) + (if (and pred-result (not (null? pred-result))) + (cons (car lst) res) + res))))))) + +; take-until:: Converter -> Converter, or +; take-until:: Pred -> Node|Nodeset -> Nodeset +; Given a converter-predicate and a nodeset, apply the predicate to +; each element of the nodeset, until the predicate yields anything but #f or +; nil. Return the elements of the input nodeset that have been processed +; till that moment (that is, which fail the predicate). +; take-until is a variation of the filter above: take-until passes +; elements of an ordered input set till (but not including) the first +; element that satisfies the predicate. +; The nodeset returned by ((take-until (not pred)) nset) is a subset -- +; to be more precise, a prefix -- of the nodeset returned by +; ((filter pred) nset) + +(define (take-until pred?) + (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) + (let loop ((lst (if (nodeset? lst) lst (list lst)))) + (if (null? lst) lst + (let ((pred-result (pred? (car lst)))) + (if (and pred-result (not (null? pred-result))) + '() + (cons (car lst) (loop (cdr lst))))) + )))) + + +; take-after:: Converter -> Converter, or +; take-after:: Pred -> Node|Nodeset -> Nodeset +; Given a converter-predicate and a nodeset, apply the predicate to +; each element of the nodeset, until the predicate yields anything but #f or +; nil. Return the elements of the input nodeset that have not been processed: +; that is, return the elements of the input nodeset that follow the first +; element that satisfied the predicate. +; take-after along with take-until partition an input nodeset into three +; parts: the first element that satisfies a predicate, all preceding +; elements and all following elements. + +(define (take-after pred?) + (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) + (let loop ((lst (if (nodeset? lst) lst (list lst)))) + (if (null? lst) lst + (let ((pred-result (pred? (car lst)))) + (if (and pred-result (not (null? pred-result))) + (cdr lst) + (loop (cdr lst)))) + )))) + +; Apply proc to each element of lst and return the list of results. +; if proc returns a nodeset, splice it into the result +; +; From another point of view, map-union is a function Converter->Converter, +; which places an argument-converter in a joining context. + +(define (map-union proc lst) + (if (null? lst) lst + (let ((proc-res (proc (car lst)))) + ((if (nodeset? proc-res) append cons) + proc-res (map-union proc (cdr lst)))))) + +; node-reverse :: Converter, or +; node-reverse:: Node|Nodeset -> Nodeset +; Reverses the order of nodes in the nodeset +; This basic converter is needed to implement a reverse document order +; (see the XPath Recommendation). +(define node-reverse + (lambda (node-or-nodeset) + (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset) + (reverse node-or-nodeset)))) + +; node-trace:: String -> Converter +; (node-trace title) is an identity converter. In addition it prints out +; a node or nodeset it is applied to, prefixed with the 'title'. +; This converter is very useful for debugging. + +(define (node-trace title) + (lambda (node-or-nodeset) + (display "\n-->") + (display title) + (display " :") + (pretty-print node-or-nodeset) + node-or-nodeset)) + + +;------------------------- +; Converter combinators +; +; Combinators are higher-order functions that transmogrify a converter +; or glue a sequence of converters into a single, non-trivial +; converter. The goal is to arrive at converters that correspond to +; XPath location paths. +; +; From a different point of view, a combinator is a fixed, named +; _pattern_ of applying converters. Given below is a complete set of +; such patterns that together implement XPath location path +; specification. As it turns out, all these combinators can be built +; from a small number of basic blocks: regular functional composition, +; map-union and filter applicators, and the nodeset union. + + + +; select-kids:: Pred -> Node -> Nodeset +; Given a Node, return an (ordered) subset its children that satisfy +; the Pred (a converter, actually) +; select-kids:: Pred -> Nodeset -> Nodeset +; The same as above, but select among children of all the nodes in +; the Nodeset +; +; More succinctly, the signature of this function is +; select-kids:: Converter -> Converter + +(define (select-kids test-pred?) + (lambda (node) ; node or node-set + (cond + ((null? node) node) + ((not (pair? node)) '()) ; No children + ((symbol? (car node)) + ((filter test-pred?) (cdr node))) ; it's a single node + (else (map-union (select-kids test-pred?) node))))) + + +; node-self:: Pred -> Node -> Nodeset, or +; node-self:: Converter -> Converter +; Similar to select-kids but apply to the Node itself rather +; than to its children. The resulting Nodeset will contain either one +; component, or will be empty (if the Node failed the Pred). +(define node-self filter) + + +; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or +; node-join:: [Converter] -> Converter +; join the sequence of location steps or paths as described +; in the title comments above. +(define (node-join . selectors) + (lambda (nodeset) ; Nodeset or node + (let loop ((nodeset nodeset) (selectors selectors)) + (if (null? selectors) nodeset + (loop + (if (nodeset? nodeset) + (map-union (car selectors) nodeset) + ((car selectors) nodeset)) + (cdr selectors)))))) + + +; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or +; node-reduce:: [Converter] -> Converter +; A regular functional composition of converters. +; From a different point of view, +; ((apply node-reduce converters) nodeset) +; is equivalent to +; (foldl apply nodeset converters) +; i.e., folding, or reducing, a list of converters with the nodeset +; as a seed. +(define (node-reduce . converters) + (lambda (nodeset) ; Nodeset or node + (let loop ((nodeset nodeset) (converters converters)) + (if (null? converters) nodeset + (loop ((car converters) nodeset) (cdr converters)))))) + + +; node-or:: [Converter] -> Converter +; This combinator applies all converters to a given node and +; produces the union of their results. +; This combinator corresponds to a union, '|' operation for XPath +; location paths. +; (define (node-or . converters) +; (lambda (node-or-nodeset) +; (if (null? converters) node-or-nodeset +; (append +; ((car converters) node-or-nodeset) +; ((apply node-or (cdr converters)) node-or-nodeset))))) +; More optimal implementation follows +(define (node-or . converters) + (lambda (node-or-nodeset) + (let loop ((result '()) (converters converters)) + (if (null? converters) result + (loop (append result (or ((car converters) node-or-nodeset) '())) + (cdr converters)))))) + + +; node-closure:: Converter -> Converter +; Select all _descendants_ of a node that satisfy a converter-predicate. +; This combinator is similar to select-kids but applies to +; grand... children as well. +; This combinator implements the "descendant::" XPath axis +; Conceptually, this combinator can be expressed as +; (define (node-closure f) +; (node-or +; (select-kids f) +; (node-reduce (select-kids (node-typeof? '*)) (node-closure f)))) +; This definition, as written, looks somewhat like a fixpoint, and it +; will run forever. It is obvious however that sooner or later +; (select-kids (node-typeof? '*)) will return an empty nodeset. At +; this point further iterations will no longer affect the result and +; can be stopped. + +(define (node-closure test-pred?) + (lambda (node) ; Nodeset or node + (let loop ((parent node) (result '())) + (if (null? parent) result + (loop ((select-kids (node-typeof? '*)) parent) + (append result + ((select-kids test-pred?) parent))) + )))) + +; node-parent:: RootNode -> Converter +; (node-parent rootnode) yields a converter that returns a parent of a +; node it is applied to. If applied to a nodeset, it returns the list +; of parents of nodes in the nodeset. The rootnode does not have +; to be the root node of the whole SXML tree -- it may be a root node +; of a branch of interest. +; Given the notation of Philip Wadler's paper on semantics of XSLT, +; parent(x) = { y | y=subnode*(root), x=subnode(y) } +; Therefore, node-parent is not the fundamental converter: it can be +; expressed through the existing ones. Yet node-parent is a rather +; convenient converter. It corresponds to a parent:: axis of SXPath. +; Note that the parent:: axis can be used with an attribute node as well! + +(define (node-parent rootnode) + (lambda (node) ; Nodeset or node + (if (nodeset? node) (map-union (node-parent rootnode) node) + (let ((pred + (node-or + (node-reduce + (node-self (node-typeof? '*)) + (select-kids (node-eq? node))) + (node-join + (select-kids (node-typeof? '@)) + (select-kids (node-eq? node)))))) + ((node-or + (node-self pred) + (node-closure pred)) + rootnode))))) + +;------------------------- +; Evaluate an abbreviated SXPath +; sxpath:: AbbrPath -> Converter, or +; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset +; AbbrPath is a list. It is translated to the full SXPath according +; to the following rewriting rules +; (sxpath '()) -> (node-join) +; (sxpath '(path-component ...)) -> +; (node-join (sxpath1 path-component) (sxpath '(...))) +; (sxpath1 '//) -> (node-or +; (node-self (node-typeof? '*any*)) +; (node-closure (node-typeof? '*any*))) +; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x)) +; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x)) +; (sxpath1 ?symbol) -> (select-kids (node-typeof? ?symbol) +; (sxpath1 procedure) -> procedure +; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...)) +; (sxpath1 '(path reducer ...)) -> +; (node-reduce (sxpath path) (sxpathr reducer) ...) +; (sxpathr number) -> (node-pos number) +; (sxpathr path-filter) -> (filter (sxpath path-filter)) + +(define (sxpath path) + (lambda (nodeset) + (let loop ((nodeset nodeset) (path path)) + (cond + ((null? path) nodeset) + ((nodeset? nodeset) + (map-union (sxpath path) nodeset)) + ((procedure? (car path)) + (loop ((car path) nodeset) (cdr path))) + ((eq? '// (car path)) + (loop + ((if (nodeset? nodeset) append cons) nodeset + ((node-closure (node-typeof? '*any*)) nodeset)) + (cdr path))) + ((symbol? (car path)) + (loop ((select-kids (node-typeof? (car path))) nodeset) + (cdr path))) + ((and (pair? (car path)) (eq? 'equal? (caar path))) + (loop ((select-kids (apply node-equal? (cdar path))) nodeset) + (cdr path))) + ((and (pair? (car path)) (eq? 'eq? (caar path))) + (loop ((select-kids (apply node-eq? (cdar path))) nodeset) + (cdr path))) + ((pair? (car path)) + (let reducer ((nodeset + (if (symbol? (caar path)) + ((select-kids (node-typeof? (caar path))) nodeset) + (loop nodeset (caar path)))) + (reducing-path (cdar path))) + (cond + ((null? reducing-path) (loop nodeset (cdr path))) + ((number? (car reducing-path)) + (reducer ((node-pos (car reducing-path)) nodeset) + (cdr reducing-path))) + (else + (reducer ((filter (sxpath (car reducing-path))) nodeset) + (cdr reducing-path)))))) + (else + (error "Invalid path step: " (car path))))))) + +;;; arch-tag: c4e57abf-6b61-4612-a6aa-d1536d440774 +;;; xpath.scm ends here diff --git a/module/texinfo.scm b/module/texinfo.scm new file mode 100644 index 000000000..76d79dfab --- /dev/null +++ b/module/texinfo.scm @@ -0,0 +1,1217 @@ +;;;; (texinfo) -- parsing of texinfo into SXML +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> +;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com> +;;;; +;;;; This file is based on SSAX's SSAX.scm. +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +;;; Commentary: +;; +;; @subheading Texinfo processing in scheme +;; +;; This module parses texinfo into SXML. TeX will always be the +;; processor of choice for print output, of course. However, although +;; @code{makeinfo} works well for info, its output in other formats is +;; not very customizable, and the program is not extensible as a whole. +;; This module aims to provide an extensible framework for texinfo +;; processing that integrates texinfo into the constellation of SXML +;; processing tools. +;; +;; @subheading Notes on the SXML vocabulary +;; +;; Consider the following texinfo fragment: +;; +;;@example +;; @@deffn Primitive set-car! pair value +;; This function... +;; @@end deffn +;;@end example +;; +;; Logically, the category (Primitive), name (set-car!), and arguments +;; (pair value) are ``attributes'' of the deffn, with the description as +;; the content. However, texinfo allows for @@-commands within the +;; arguments to an environment, like @code{@@deffn}, which means that +;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, +;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are +;; called ``arguments'', and are grouped under the special element, `%'. +;; +;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In +;; the interests of interoperability, this module provides a conversion +;; function to replace the `%' with `texinfo-arguments'. +;; +;;; Code: + +;; Comparison to xml output of texinfo (which is rather undocumented): +;; Doesn't conform to texinfo dtd +;; No DTD at all, in fact :-/ +;; Actually outputs valid xml, after transforming % +;; Slower (although with caching the SXML that problem can go away) +;; Doesn't parse menus (although menus are shite) +;; Args go in a dedicated element, FBOFW +;; Definitions are handled a lot better +;; Does parse comments +;; Outputs only significant line breaks (a biggie!) +;; Nodes are treated as anchors, rather than content organizers (a biggie) +;; (more book-like, less info-like) + +;; TODO +;; Integration: help, indexing, plain text + +(define-module (texinfo) + #:use-module (sxml simple) + #:use-module (sxml transform) + #:use-module (sxml ssax input-parse) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-13) + #:export (call-with-file-and-dir + texi-command-specs + texi-command-depth + texi-fragment->stexi + texi->stexi + stexi->sxml)) + +;; Some utilities + +(define (parser-error port message . rest) + (apply error port message rest)) + +(define (call-with-file-and-dir filename proc) + "Call the one-argument procedure @var{proc} with an input port that +reads from @var{filename}. During the dynamic extent of @var{proc}'s +execution, the current directory will be @code{(dirname +@var{filename})}. This is useful for parsing documents that can include +files by relative path name." + (let ((current-dir (getcwd))) + (dynamic-wind + (lambda () (chdir (dirname filename))) + (lambda () + (call-with-input-file (basename filename) proc)) + (lambda () (chdir current-dir))))) + +;; Define this version here, because (srfi srfi-11)'s definition uses +;; syntax-rules, which is really damn slow +(define-macro (let*-values bindings . body) + (if (null? bindings) (cons 'begin body) + (apply + (lambda (vars initializer) + (let ((cont + (cons 'let*-values + (cons (cdr bindings) body)))) + (cond + ((not (pair? vars)) ; regular let case, a single var + `(let ((,vars ,initializer)) ,cont)) + ((null? (cdr vars)) ; single var, see the prev case + `(let ((,(car vars) ,initializer)) ,cont)) + (else ; the most generic case + `(call-with-values (lambda () ,initializer) + (lambda ,vars ,cont)))))) + (car bindings)))) + +;;======================================================================== +;; Reflection on the XML vocabulary + +(define texi-command-specs + #; +"A list of (@var{name} @var{content-model} . @var{args}) + +@table @var +@item name +The name of an @@-command, as a symbol. + +@item content-model +A symbol indicating the syntactic type of the @@-command: +@table @code +@item EMPTY-COMMAND +No content, and no @code{@@end} is coming +@item EOL-ARGS +Unparsed arguments until end of line +@item EOL-TEXT +Parsed arguments until end of line +@item INLINE-ARGS +Unparsed arguments ending with @code{#\\@}} +@item INLINE-TEXT +Parsed arguments ending with @code{#\\@}} +@item ENVIRON +The tag is an environment tag, expect @code{@@end foo}. +@item TABLE-ENVIRON +Like ENVIRON, but with special parsing rules for its arguments. +@item FRAGMENT +For @code{*fragment*}, the command used for parsing fragments of +texinfo documents. +@end table + +@code{INLINE-TEXT} commands will receive their arguments within their +bodies, whereas the @code{-ARGS} commands will receive them in their +attribute list. + +@code{EOF-TEXT} receives its arguments in its body. + +@code{ENVIRON} commands have both: parsed arguments until the end of +line, received through their attribute list, and parsed text until the +@code{@@end}, received in their bodies. + +@code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in +@code{ENVIRON}. + +There are four @@-commands that are treated specially. @code{@@include} +is a low-level token that will not be seen by higher-level parsers, so +it has no content-model. @code{@@para} is the paragraph command, which +is only implicit in the texinfo source. @code{@@item} has special +syntax, as noted above, and @code{@@entry} is how this parser treats +@code{@@item} commands within @code{@@table}, @code{@@ftable}, and +@code{@@vtable}. + +Also, indexing commands (@code{@@cindex}, etc.) are treated specially. +Their arguments are parsed, but they are needed before entering the +element so that an anchor can be inserted into the text before the index +entry. + +@item args +Named arguments to the command, in the same format as the formals for a +lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, +@code{ENVIRON}, @code{TABLE-ENVIRON} commands. +@end table" + '(;; Special commands + (include #f) ;; this is a low-level token + (para PARAGRAPH) + (item ITEM) + (entry ENTRY . heading) + (noindent EMPTY-COMMAND) + (*fragment* FRAGMENT) + + ;; Inline text commands + (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant + (bold INLINE-TEXT) + (sample INLINE-TEXT) + (samp INLINE-TEXT) + (code INLINE-TEXT) + (kbd INLINE-TEXT) + (key INLINE-TEXT) + (var INLINE-TEXT) + (env INLINE-TEXT) + (file INLINE-TEXT) + (command INLINE-TEXT) + (option INLINE-TEXT) + (dfn INLINE-TEXT) + (cite INLINE-TEXT) + (acro INLINE-TEXT) + (url INLINE-TEXT) + (email INLINE-TEXT) + (emph INLINE-TEXT) + (strong INLINE-TEXT) + (sample INLINE-TEXT) + (sc INLINE-TEXT) + (titlefont INLINE-TEXT) + (asis INLINE-TEXT) + (b INLINE-TEXT) + (i INLINE-TEXT) + (r INLINE-TEXT) + (sansserif INLINE-TEXT) + (slanted INLINE-TEXT) + (t INLINE-TEXT) + + ;; Inline args commands + (value INLINE-ARGS . (key)) + (ref INLINE-ARGS . (node #:opt name section info-file manual)) + (xref INLINE-ARGS . (node #:opt name section info-file manual)) + (pxref INLINE-ARGS . (node #:opt name section info-file manual)) + (uref INLINE-ARGS . (url #:opt title replacement)) + (anchor INLINE-ARGS . (name)) + (dots INLINE-ARGS . ()) + (result INLINE-ARGS . ()) + (bullet INLINE-ARGS . ()) + (copyright INLINE-ARGS . ()) + (tie INLINE-ARGS . ()) + (image INLINE-ARGS . (file #:opt width height alt-text extension)) + + ;; EOL args elements + (node EOL-ARGS . (name #:opt next previous up)) + (c EOL-ARGS . all) + (comment EOL-ARGS . all) + (setchapternewpage EOL-ARGS . all) + (sp EOL-ARGS . all) + (page EOL-ARGS . ()) + (vskip EOL-ARGS . all) + (syncodeindex EOL-ARGS . all) + (contents EOL-ARGS . ()) + (shortcontents EOL-ARGS . ()) + (summarycontents EOL-ARGS . ()) + (insertcopying EOL-ARGS . ()) + (dircategory EOL-ARGS . (category)) + (top EOL-ARGS . (title)) + (printindex EOL-ARGS . (type)) + + ;; EOL text commands + (*ENVIRON-ARGS* EOL-TEXT) + (itemx EOL-TEXT) + (set EOL-TEXT) + (center EOL-TEXT) + (title EOL-TEXT) + (subtitle EOL-TEXT) + (author EOL-TEXT) + (chapter EOL-TEXT) + (section EOL-TEXT) + (appendix EOL-TEXT) + (appendixsec EOL-TEXT) + (unnumbered EOL-TEXT) + (unnumberedsec EOL-TEXT) + (subsection EOL-TEXT) + (subsubsection EOL-TEXT) + (appendixsubsec EOL-TEXT) + (appendixsubsubsec EOL-TEXT) + (unnumberedsubsec EOL-TEXT) + (unnumberedsubsubsec EOL-TEXT) + (chapheading EOL-TEXT) + (majorheading EOL-TEXT) + (heading EOL-TEXT) + (subheading EOL-TEXT) + (subsubheading EOL-TEXT) + + (deftpx EOL-TEXT-ARGS . (category name . attributes)) + (defcvx EOL-TEXT-ARGS . (category class name)) + (defivarx EOL-TEXT-ARGS . (class name)) + (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) + (defopx EOL-TEXT-ARGS . (category class name . arguments)) + (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) + (defmethodx EOL-TEXT-ARGS . (class name . arguments)) + (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) + (defoptx EOL-TEXT-ARGS . (name)) + (defvrx EOL-TEXT-ARGS . (category name)) + (defvarx EOL-TEXT-ARGS . (name)) + (deftypevrx EOL-TEXT-ARGS . (category data-type name)) + (deftypevarx EOL-TEXT-ARGS . (data-type name)) + (deffnx EOL-TEXT-ARGS . (category name . arguments)) + (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) + (defspecx EOL-TEXT-ARGS . (name . arguments)) + (defmacx EOL-TEXT-ARGS . (name . arguments)) + (defunx EOL-TEXT-ARGS . (name . arguments)) + (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) + + ;; Indexing commands + (cindex INDEX . entry) + (findex INDEX . entry) + (vindex INDEX . entry) + (kindex INDEX . entry) + (pindex INDEX . entry) + (tindex INDEX . entry) + + ;; Environment commands (those that need @end) + (texinfo ENVIRON . title) + (ignore ENVIRON . ()) + (ifinfo ENVIRON . ()) + (iftex ENVIRON . ()) + (ifhtml ENVIRON . ()) + (ifxml ENVIRON . ()) + (ifplaintext ENVIRON . ()) + (ifnotinfo ENVIRON . ()) + (ifnottex ENVIRON . ()) + (ifnothtml ENVIRON . ()) + (ifnotxml ENVIRON . ()) + (ifnotplaintext ENVIRON . ()) + (titlepage ENVIRON . ()) + (menu ENVIRON . ()) + (direntry ENVIRON . ()) + (copying ENVIRON . ()) + (example ENVIRON . ()) + (smallexample ENVIRON . ()) + (display ENVIRON . ()) + (smalldisplay ENVIRON . ()) + (verbatim ENVIRON . ()) + (format ENVIRON . ()) + (smallformat ENVIRON . ()) + (lisp ENVIRON . ()) + (smalllisp ENVIRON . ()) + (cartouche ENVIRON . ()) + (quotation ENVIRON . ()) + + (deftp ENVIRON . (category name . attributes)) + (defcv ENVIRON . (category class name)) + (defivar ENVIRON . (class name)) + (deftypeivar ENVIRON . (class data-type name)) + (defop ENVIRON . (category class name . arguments)) + (deftypeop ENVIRON . (category class data-type name . arguments)) + (defmethod ENVIRON . (class name . arguments)) + (deftypemethod ENVIRON . (class data-type name . arguments)) + (defopt ENVIRON . (name)) + (defvr ENVIRON . (category name)) + (defvar ENVIRON . (name)) + (deftypevr ENVIRON . (category data-type name)) + (deftypevar ENVIRON . (data-type name)) + (deffn ENVIRON . (category name . arguments)) + (deftypefn ENVIRON . (category data-type name . arguments)) + (defspec ENVIRON . (name . arguments)) + (defmac ENVIRON . (name . arguments)) + (defun ENVIRON . (name . arguments)) + (deftypefun ENVIRON . (data-type name . arguments)) + + (table TABLE-ENVIRON . (formatter)) + (itemize TABLE-ENVIRON . (formatter)) + (enumerate TABLE-ENVIRON . (start)) + (ftable TABLE-ENVIRON . (formatter)) + (vtable TABLE-ENVIRON . (formatter)))) + +(define command-depths + '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) + (top . 0) (unnumbered . 1) (unnumberedsec . 2) + (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) + (appendix . 1) (appendixsec . 2) (appendixsection . 2) + (appendixsubsec . 3) (appendixsubsubsec . 4))) +(define (texi-command-depth command max-depth) + "Given the texinfo command @var{command}, return its nesting level, or +@code{#f} if it nests too deep for @var{max-depth}. + +Examples: +@example +(texi-command-depth 'chapter 4) @result{} 1 +(texi-command-depth 'top 4) @result{} 0 +(texi-command-depth 'subsection 4) @result{} 3 +(texi-command-depth 'appendixsubsec 4) @result{} 3 +(texi-command-depth 'subsection 2) @result{} #f +@end example" + (let ((depth (and=> (assq command command-depths) cdr))) + (and depth (<= depth max-depth) depth))) + +;; The % is for arguments +(define (space-significant? command) + (memq command + '(example smallexample verbatim lisp smalllisp menu %))) + +;; Like a DTD for texinfo +(define (command-spec command) + (or (assq command texi-command-specs) + (parser-error #f "Unknown command" command))) + +(define (inline-content? content) + (or (eq? content 'INLINE-TEXT) (eq? content 'INLINE-ARGS))) + + +;;======================================================================== +;; Lower-level parsers and scanners +;; +;; They deal with primitive lexical units (Names, whitespaces, tags) and +;; with pieces of more generic productions. Most of these parsers must +;; be called in appropriate context. For example, complete-start-command +;; must be called only when the @-command start has been detected and +;; its name token has been read. + +;; Test if a string is made of only whitespace +;; An empty string is considered made of whitespace as well +(define (string-whitespace? str) + (or (string-null? str) + (string-every char-whitespace? str))) + +;; Like read-text-line, but allows EOF. +(define read-eof-breaks '(*eof* #\return #\newline)) +(define (read-eof-line port) + (if (eof-object? (peek-char port)) + (peek-char port) + (let* ((line (next-token '() read-eof-breaks + "reading a line" port)) + (c (read-char port))) ; must be either \n or \r or EOF + (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) + (read-char port)) ; skip \n that follows \r + line))) + +(define ascii->char integer->char) + +(define (skip-whitespace port) + (skip-while '(#\space #\tab #\return #\newline) port)) + +(define (skip-horizontal-whitespace port) + (skip-while '(#\space #\tab) port)) + +;; command ::= Letter+ + +;; procedure: read-command PORT +;; +;; Read a command starting from the current position in the PORT and +;; return it as a symbol. +(define (read-command port) + (let ((first-char (peek-char port))) + (or (char-alphabetic? first-char) + (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) + (string->symbol + (next-token-of + (lambda (c) + (cond + ((eof-object? c) #f) + ((char-alphabetic? c) c) + (else #f))) + port))) + +;; A token is a primitive lexical unit. It is a record with two fields, +;; token-head and token-kind. +;; +;; Token types: +;; END The end of a texinfo command. If the command is ended by }, +;; token-head will be #f. Otherwise if the command is ended by +;; @end COMMAND, token-head will be COMMAND. As a special case, +;; @bye is the end of a special @texinfo command. +;; START The start of a texinfo command. The token-head will be a +;; symbol of the @-command name. +;; INCLUDE An @include directive. The token-head will be empty -- the +;; caller is responsible for reading the include file name. +;; ITEM @item commands have an irregular syntax. They end at the +;; next @item, or at the end of the environment. For that +;; read-command-token treats them specially. + +(define (make-token kind head) (cons kind head)) +(define token? pair?) +(define token-kind car) +(define token-head cdr) + +;; procedure: read-command-token PORT +;; +;; This procedure starts parsing of a command token. The current +;; position in the stream must be #\@. This procedure scans enough of +;; the input stream to figure out what kind of a command token it is +;; seeing. The procedure returns a token structure describing the token. + +(define (read-command-token port) + (assert-curr-char '(#\@) "start of the command" port) + (let ((peeked (peek-char port))) + (cond + ((memq peeked '(#\! #\. #\? #\@ #\\ #\{ #\})) + ;; @-commands that escape characters + (make-token 'STRING (string (read-char port)))) + (else + (let ((name (read-command port))) + (case name + ((end) + ;; got an ending tag + (let ((command (string-trim-both + (read-eof-line port)))) + (or (and (not (string-null? command)) + (string-every char-alphabetic? command)) + (parser-error port "malformed @end" command)) + (make-token 'END (string->symbol command)))) + ((bye) + ;; the end of the top + (make-token 'END 'texinfo)) + ((item) + (make-token 'ITEM 'item)) + ((include) + (make-token 'INCLUDE #f)) + (else + (make-token 'START name)))))))) + +;; procedure+: read-verbatim-body PORT STR-HANDLER SEED +;; +;; This procedure must be called after we have read a string +;; "@verbatim\n" that begins a verbatim section. The current position +;; must be the first position of the verbatim body. This function reads +;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a +;; character data consumer. +;; +;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. +;; The first STRING1 argument to STR-HANDLER never contains a newline. +;; The second STRING2 argument often will. On the first invocation of the +;; STR-HANDLER, the seed is the one passed to read-verbatim-body +;; as the third argument. The result of this first invocation will be +;; passed as the seed argument to the second invocation of the line +;; consumer, and so on. The result of the last invocation of the +;; STR-HANDLER is returned by the read-verbatim-body. Note a +;; similarity to the fundamental 'fold' iterator. +;; +;; Within a verbatim section all characters are taken at their face +;; value. It ends with "\n@end verbatim(\r)?\n". + +;; Must be called right after the newline after @verbatim. +(define (read-verbatim-body port str-handler seed) + (let loop ((seed seed)) + (let ((fragment (next-token '() '(#\newline) + "reading verbatim" port))) + ;; We're reading the char after the 'fragment', which is + ;; #\newline. + (read-char port) + (if (string=? fragment "@end verbatim") + seed + (loop (str-handler fragment "\n" seed)))))) + +;; procedure+: read-arguments PORT +;; +;; This procedure reads and parses a production ArgumentList. +;; ArgumentList ::= S* Argument (S* , S* Argument)* S* +;; Argument ::= ([^@{},])* +;; +;; Arguments are the things in braces, i.e @ref{my node} has one +;; argument, "my node". Most commands taking braces actually don't have +;; arguments, they process text. For example, in +;; @emph{@strong{emphasized}}, the emph takes text, because the parse +;; continues into the braces. +;; +;; Any whitespace within Argument is replaced with a single space. +;; Whitespace around an Argument is trimmed. +;; +;; The procedure returns a list of arguments. Afterwards the current +;; character will be after the final #\}. + +(define (read-arguments port stop-char) + (define (split str) + (read-char port) ;; eat the delimiter + (let ((ret (map (lambda (x) (if (string-null? x) #f x)) + (map string-trim-both (string-split str #\,))))) + (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) + '() + ret))) + (split (next-token '() (list stop-char) + "arguments of @-command" port))) + +;; procedure+: complete-start-command COMMAND PORT +;; +;; This procedure is to complete parsing of an @-command. The procedure +;; must be called after the command token has been read. COMMAND is a +;; TAG-NAME. +;; +;; This procedure returns several values: +;; COMMAND: a symbol. +;; ARGUMENTS: command's arguments, as an alist. +;; CONTENT-MODEL: the content model of the command. +;; +;; On exit, the current position in PORT will depend on the CONTENT-MODEL. +;; +;; Content model Port position +;; ============= ============= +;; INLINE-TEXT One character after the #\{. +;; INLINE-ARGS The first character after the #\}. +;; EOL-TEXT The first non-whitespace character after the command. +;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT +;; The first character on the next line. +;; PARAGRAPH, ITEM, EMPTY-COMMAND +;; The first character after the command. + +(define (arguments->attlist port args arg-names) + (let loop ((in args) (names arg-names) (opt? #f) (out '())) + (cond + ((symbol? names) ;; a rest arg + (reverse (if (null? in) out (acons names in out)))) + ((and (not (null? names)) (eq? (car names) #:opt)) + (loop in (cdr names) #t out)) + ((null? in) + (if (or (null? names) opt?) + (reverse out) + (parser-error port "@-command expected more arguments:" + args arg-names names))) + ((null? names) + (parser-error port "@-command didn't expect more arguments:" in)) + ((not (car in)) + (or (and opt? (loop (cdr in) (cdr names) opt? out)) + (parser-error "@-command missing required argument" + (car names)))) + (else + (loop (cdr in) (cdr names) opt? + (cons (list (car names) (car in)) out)))))) + +(define (parse-table-args command port) + (let* ((line (string-trim-both (read-text-line port))) + (length (string-length line))) + (define (get-formatter) + (or (and (not (zero? length)) + (eq? (string-ref line 0) #\@) + (let ((f (string->symbol (substring line 1)))) + (or (inline-content? (cadr (command-spec f))) + (parser-error + port "@item formatter must be INLINE" f)) + f)) + (parser-error "Invalid @item formatter" line))) + (case command + ((enumerate) + (if (zero? length) + '() + `((start + ,(if (or (and (eq? length 1) + (char-alphabetic? (string-ref line 0))) + (string-every char-numeric? line)) + line + (parser-error + port "Invalid enumerate start" line)))))) + ((itemize) + `((bullet + ,(or (and (eq? length 1) line) + (and (string-null? line) '(bullet)) + (list (get-formatter)))))) + (else ;; tables of various varieties + `((formatter (,(get-formatter)))))))) + +(define (complete-start-command command port) + (define (get-arguments type arg-names stop-char) + (arguments->attlist port (read-arguments port stop-char) arg-names)) + + (let* ((spec (command-spec command)) + (type (cadr spec)) + (arg-names (cddr spec))) + (case type + ((INLINE-TEXT) + (assert-curr-char '(#\{) "Inline element lacks {" port) + (values command '() type)) + ((INLINE-ARGS) + (assert-curr-char '(#\{) "Inline element lacks {" port) + (values command (get-arguments type arg-names #\}) type)) + ((EOL-ARGS) + (values command (get-arguments type arg-names #\newline) type)) + ((ENVIRON ENTRY INDEX) + (skip-horizontal-whitespace port) + (values command (parse-environment-args command port) type)) + ((TABLE-ENVIRON) + (skip-horizontal-whitespace port) + (values command (parse-table-args command port) type)) + ((EOL-TEXT) + (skip-horizontal-whitespace port) + (values command '() type)) + ((EOL-TEXT-ARGS) + (skip-horizontal-whitespace port) + (values command (parse-eol-text-args command port) type)) + ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) + (values command '() type)) + (else ;; INCLUDE shouldn't get here + (parser-error port "can't happen"))))) + +;;----------------------------------------------------------------------------- +;; Higher-level parsers and scanners +;; +;; They parse productions corresponding entire @-commands. + +;; Only reads @settitle, leaves it to the command parser to finish +;; reading the title. +(define (take-until-settitle port) + (or (find-string-from-port? "\n@settitle " port) + (parser-error port "No \\n@settitle found")) + (skip-horizontal-whitespace port) + (and (eq? (peek-char port) #\newline) + (parser-error port "You have a @settitle, but no title"))) + +;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED +;; +;; This procedure is to read the CharData of a texinfo document. +;; +;; text ::= (CharData | Command)* +;; +;; The procedure reads CharData and stops at @-commands (or +;; environments). It also stops at an open or close brace. +;; +;; port +;; a PORT to read +;; expect-eof? +;; a boolean indicating if EOF is normal, i.e., the character +;; data may be terminated by the EOF. EOF is normal +;; while processing the main document. +;; preserve-ws? +;; a boolean indicating if we are within a whitespace-preserving +;; environment. If #t, suppress paragraph detection. +;; str-handler +;; a STR-HANDLER, see read-verbatim-body +;; seed +;; an argument passed to the first invocation of STR-HANDLER. +;; +;; The procedure returns two results: SEED and TOKEN. The SEED is the +;; result of the last invocation of STR-HANDLER, or the original seed if +;; STR-HANDLER was never called. +;; +;; TOKEN can be either an eof-object (this can happen only if expect-eof? +;; was #t), or a texinfo token denoting the start or end of a tag. + +;; read-char-data port expect-eof? preserve-ws? str-handler seed +(define read-char-data + (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) + (define (handle str-handler str1 str2 seed) + (if (and (string-null? str1) (string-null? str2)) + seed + (str-handler str1 str2 seed))) + + (lambda (port expect-eof? preserve-ws? str-handler seed) + (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) + (let loop ((seed seed)) + (let* ((fragment (next-token '() end-chars "reading char data" port)) + (term-char (peek-char port))) ; one of end-chars + (cond + ((eof-object? term-char) ; only if expect-eof? + (values (handle str-handler fragment "" seed) term-char)) + ((memq term-char '(#\@ #\{ #\})) + (values (handle str-handler fragment "" seed) + (case term-char + ((#\@) (read-command-token port)) + ((#\{) (make-token 'START '*braces*)) + ((#\}) (read-char port) (make-token 'END #f))))) + ((eq? term-char #\newline) + ;; Always significant, unless directly before an end token. + (let ((c (peek-next-char port))) + (cond + ((eof-object? c) + (or expect-eof? + (parser-error port "EOF while reading char data")) + (values (handle str-handler fragment "" seed) c)) + ((eq? c #\@) + (let* ((token (read-command-token port)) + (end? (eq? (token-kind token) 'END))) + (values + (handle str-handler fragment (if end? "" " ") seed) + token))) + ((and (not preserve-ws?) (eq? c #\newline)) + ;; paragraph-separator ::= #\newline #\newline+ + (skip-while '(#\newline) port) + (skip-horizontal-whitespace port) + (values (handle str-handler fragment "" seed) + (make-token 'PARA 'para))) + (else + (loop (handle str-handler fragment + (if preserve-ws? "\n" " ") seed))))))))))))) + +; procedure+: assert-token TOKEN KIND NAME +; Make sure that TOKEN is of anticipated KIND and has anticipated NAME +(define (assert-token token kind name) + (or (and (token? token) + (eq? kind (token-kind token)) + (equal? name (token-head token))) + (parser-error #f "Expecting @end for " name ", got " token))) + +;;======================================================================== +;; Highest-level parsers: Texinfo to SXML + +;; These parsers are a set of syntactic forms to instantiate a SSAX +;; parser. The user tells what to do with the parsed character and +;; element data. These latter handlers determine if the parsing follows a +;; SAX or a DOM model. + +;; syntax: make-command-parser fdown fup str-handler + +;; Create a parser to parse and process one element, including its +;; character content or children elements. The parser is typically +;; applied to the root element of a document. + +;; fdown +;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED +;; +;; This procedure is to generate the seed to be passed to handlers +;; that process the content of the element. This is the function +;; identified as 'fdown' in the denotational semantics of the XML +;; parser given in the title comments to (sxml ssax). +;; +;; fup +;; procedure COMMAND ARGUMENTS PARENT-SEED SEED +;; +;; This procedure is called when parsing of COMMAND is finished. +;; The SEED is the result from the last content parser (or from +;; fdown if the element has the empty content). PARENT-SEED is the +;; same seed as was passed to fdown. The procedure is to generate a +;; seed that will be the result of the element parser. This is the +;; function identified as 'fup' in the denotational semantics of +;; the XML parser given in the title comments to (sxml ssax). +;; +;; str-handler +;; A STR-HANDLER, see read-verbatim-body +;; + +;; The generated parser is a +;; procedure COMMAND PORT SEED +;; +;; The procedure must be called *after* the command token has been read. + +(define (read-include-file-name port) + (let ((x (string-trim-both (read-eof-line port)))) + (if (string-null? x) + (error "no file listed") + x))) ;; fixme: should expand @value{} references + +(define (sxml->node-name sxml) + "Turn some sxml string into a valid node name." + (let loop ((in (string->list (sxml->string sxml))) (out '())) + (if (null? in) + (apply string (reverse out)) + (if (memq (car in) '(#\{ #\} #\@ #\,)) + (loop (cdr in) out) + (loop (cdr in) (cons (car in) out)))))) + +(define (index command arguments fdown fup parent-seed) + (case command + ((deftp defcv defivar deftypeivar defop deftypeop defmethod + deftypemethod defopt defvr defvar deftypevr deftypevar deffn + deftypefn defspec defmac defun deftypefun) + (let ((args `((name ,(string-append (symbol->string command) "-" + (cadr (assq 'name arguments))))))) + (fup 'anchor args parent-seed + (fdown 'anchor args 'INLINE-ARGS '())))) + ((cindex findex vindex kindex pindex tindex) + (let ((args `((name ,(string-append (symbol->string command) "-" + (sxml->node-name + (assq 'entry arguments))))))) + (fup 'anchor args parent-seed + (fdown 'anchor args 'INLINE-ARGS '())))) + (else parent-seed))) + +(define (make-command-parser fdown fup str-handler) + (lambda (command port seed) + (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) + (let*-values (((command arguments expected-content) + (complete-start-command command port))) + (let* ((parent-seed (index command arguments fdown fup parent-seed)) + (seed (fdown command arguments expected-content parent-seed)) + (eof-closes? (or (memq command '(texinfo para *fragment*)) + (eq? expected-content 'EOL-TEXT))) + (sig-ws? (or sig-ws? (space-significant? command))) + (up (lambda (s) (fup command arguments parent-seed s))) + (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) + (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) + + (define (port-for-content) + (if (eq? expected-content 'EOL-TEXT) + (call-with-input-string (read-text-line port) identity) + port)) + + (cond + ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX + EOL-TEXT-ARGS)) + ;; empty or finished by complete-start-command + (up seed)) + ((eq? command 'verbatim) + (up (read-verbatim-body port str-handler seed))) + (else + (let loop ((port (port-for-content)) + (expect-eof? eof-closes?) + (end-para identity) + (need-break? (and (not sig-ws?) + (memq expected-content + '(ENVIRON TABLE-ENVIRON + ENTRY ITEM FRAGMENT)))) + (seed seed)) + (cond + ((and need-break? (or sig-ws? (skip-whitespace port)) + (not (memq (peek-char port) '(#\@ #\}))) + (not (eof-object? (peek-char port)))) + ;; Even if we have an @, it might be inline -- check + ;; that later + (let ((seed (end-para seed))) + (loop port expect-eof? (make-end-para seed) #f + (new-para seed)))) + (else + (let*-values (((seed token) + (read-char-data + port expect-eof? sig-ws? str-handler seed))) + (cond + ((eof-object? token) + (case expect-eof? + ((include #f) (end-para seed)) + (else (up (end-para seed))))) + (else + (case (token-kind token) + ((STRING) + ;; this is only @-commands that escape + ;; characters: @}, @@, @{ -- new para if need-break + (let ((seed ((if need-break? end-para identity) seed))) + (loop port expect-eof? + (if need-break? (make-end-para seed) end-para) #f + (str-handler (token-head token) "" + ((if need-break? new-para identity) + seed))))) + ((END) + ;; The end will only have a name if it's for an + ;; environment + (cond + ((memq command '(item entry)) + (let ((spec (command-spec (token-head token)))) + (or (eq? (cadr spec) 'TABLE-ENVIRON) + (parser-error + port "@item not ended by @end table/enumerate/itemize" + token)))) + ((eq? expected-content 'ENVIRON) + (assert-token token 'END command))) + (up (end-para seed))) + ((ITEM) + (cond + ((memq command '(enumerate itemize)) + (up (visit 'item port sig-ws? (end-para seed)))) + ((eq? expected-content 'TABLE-ENVIRON) + (up (visit 'entry port sig-ws? (end-para seed)))) + ((memq command '(item entry)) + (visit command port sig-ws? (up (end-para seed)))) + (else + (parser-error + port "@item must be within a table environment" + command)))) + ((PARA) + ;; examine valid paragraphs? + (loop port expect-eof? end-para (not sig-ws?) seed)) + ((INCLUDE) + ;; Recurse for include files + (let ((seed (call-with-file-and-dir + (read-include-file-name port) + (lambda (port) + (loop port 'include end-para + need-break? seed))))) + (loop port expect-eof? end-para need-break? seed))) + ((START) ; Start of an @-command + (let* ((head (token-head token)) + (type (cadr (command-spec head))) + (inline? (inline-content? type)) + (seed ((if (and inline? (not need-break?)) + identity end-para) seed)) + (end-para (if inline? + (if need-break? (make-end-para seed) + end-para) + identity)) + (new-para (if (and inline? need-break?) + new-para identity))) + (loop port expect-eof? end-para (not inline?) + (visit head port sig-ws? (new-para seed))))) + (else + (parser-error port "Unknown token type" token)))))))))))))))) + +;; procedure: reverse-collect-str-drop-ws fragments +;; +;; Given the list of fragments (some of which are text strings), reverse +;; the list and concatenate adjacent text strings. We also drop +;; "unsignificant" whitespace, that is, whitespace in front, behind and +;; between elements. The whitespace that is included in character data +;; is not affected. +(define (reverse-collect-str-drop-ws fragments) + (cond + ((null? fragments) ; a shortcut + '()) + ((and (string? (car fragments)) ; another shortcut + (null? (cdr fragments)) ; remove single ws-only string + (string-whitespace? (car fragments))) + '()) + (else + (let loop ((fragments fragments) (result '()) (strs '()) + (all-whitespace? #t)) + (cond + ((null? fragments) + (if all-whitespace? + result ; remove leading ws + (cons (apply string-append strs) result))) + ((string? (car fragments)) + (loop (cdr fragments) result (cons (car fragments) strs) + (and all-whitespace? + (string-whitespace? (car fragments))))) + (else + (loop (cdr fragments) + (cons + (car fragments) + (cond + ((null? strs) result) + (all-whitespace? + (if (null? result) + result ; remove trailing whitespace + (cons " " result))); replace interstitial ws with + ; one space + (else + (cons (apply string-append strs) result)))) + '() #t))))))) + +(define (make-dom-parser) + (make-command-parser + (lambda (command args content seed) ; fdown + '()) + (lambda (command args parent-seed seed) ; fup + (let ((seed (reverse-collect-str-drop-ws seed))) + (acons command + (if (null? args) seed (acons '% args seed)) + parent-seed))) + (lambda (string1 string2 seed) ; str-handler + (if (string-null? string2) + (cons string1 seed) + (cons* string2 string1 seed))))) + +(define parse-environment-args + (let ((parser (make-dom-parser))) + ;; duplicate arguments->attlist to avoid unnecessary splitting + (lambda (command port) + (let ((args (cdar (parser '*ENVIRON-ARGS* port '()))) + (arg-names (cddr (command-spec command)))) + (cond + ((not arg-names) + (if (null? args) '() + (parser-error port "@-command doesn't take args" command))) + ((eq? arg-names #t) + (list (cons 'arguments args))) + (else + (let loop ((args args) (arg-names arg-names) (out '())) + (cond + ((null? arg-names) + (if (null? args) (reverse! out) + (parser-error port "@-command didn't expect more args" + command args))) + ((symbol? arg-names) + (reverse! (acons arg-names args out))) + ((null? args) + (parser-error port "@-command expects more args" + command arg-names)) + ((and (string? (car args)) (string-index (car args) #\space)) + => (lambda (i) + (let ((rest (substring/shared (car args) (1+ i)))) + (if (zero? i) + (loop (cons rest (cdr args)) arg-names out) + (loop (cons rest (cdr args)) (cdr arg-names) + (cons (list (car arg-names) + (substring (car args) 0 i)) + out)))))) + (else + (loop (cdr args) (cdr arg-names) + (if (and (pair? (car args)) (eq? (caar args) '*braces*)) + (acons (car arg-names) (cdar args) out) + (cons (list (car arg-names) (car args)) out)))))))))))) + +(define (parse-eol-text-args command port) + ;; perhaps parse-environment-args should be named more + ;; generically. + (parse-environment-args command port)) + +;; procedure: texi-fragment->stexi STRING +;; +;; A DOM parser for a texinfo fragment STRING. +;; +;; The procedure returns an SXML tree headed by the special tag, +;; *fragment*. + +(define (texi-fragment->stexi string-or-port) + "Parse the texinfo commands in @var{string-or-port}, and return the +resultant stexi tree. The head of the tree will be the special command, +@code{*fragment*}." + (define (parse port) + (postprocess (car ((make-dom-parser) '*fragment* port '())))) + (if (input-port? string-or-port) + (parse string-or-port) + (call-with-input-string string-or-port parse))) + +;; procedure: texi->stexi PORT +;; +;; This is an instance of a SSAX parser above that returns an SXML +;; representation of the texinfo document ready to be read at PORT. +;; +;; The procedure returns an SXML tree. The port points to the +;; first character after the @bye, or to the end of the file. + +(define (texi->stexi port) + "Read a full texinfo document from @var{port} and return the parsed +stexi tree. The parsing will start at the @code{@@settitle} and end at +@code{@@bye} or EOF." + (let ((parser (make-dom-parser))) + (take-until-settitle port) + (postprocess (car (parser 'texinfo port '()))))) + +(define (car-eq? x y) (and (pair? x) (eq? (car x) y))) +(define (make-contents tree) + (define (lp in out depth) + (cond + ((null? in) (values in (cons 'enumerate (reverse! out)))) + ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) + => (lambda (new-depth) + (let ((node-name (and (car-eq? (car in) 'node) + (cadr (assq 'name (cdadar in)))))) + (cond + ((< new-depth depth) + (values in (cons 'enumerate (reverse! out)))) + ((> new-depth depth) + (let ((out-cdr (if (null? out) '() (cdr out))) + (out-car (if (null? out) (list 'item) (car out)))) + (let*-values (((new-in new-out) (lp in '() (1+ depth)))) + (lp new-in + (cons (append out-car (list new-out)) out-cdr) + depth)))) + (else ;; same depth + (lp (cddr in) + (cons + `(item (para + ,@(if node-name + `((ref (% (node ,node-name)))) + (cdadr in)))) + out) + depth)))))) + (else (lp (cdr in) out depth)))) + (let*-values (((_ contents) (lp tree '() 1))) + `((chapheading "Table of Contents") ,contents))) + +(define (trim-whitespace str trim-left? trim-right?) + (let* ((left-space? (and (not trim-left?) + (string-prefix? " " str))) + (right-space? (and (not trim-right?) + (string-suffix? " " str))) + (tail (append! (string-tokenize str) + (if right-space? '("") '())))) + (string-join (if left-space? (cons "" tail) tail)))) + +(define (postprocess tree) + (define (loop in out state first? sig-ws?) + (cond + ((null? in) + (values (reverse! out) state)) + ((string? (car in)) + (loop (cdr in) + (cons (if sig-ws? (car in) + (trim-whitespace (car in) first? (null? (cdr in)))) + out) + state #f sig-ws?)) + ((pair? (car in)) + (case (caar in) + ((set) + (if (null? (cdar in)) (error "@set missing arguments" in)) + (if (string? (cadar in)) + (let ((i (string-index (cadar in) #\space))) + (if i + (loop (cdr in) out + (acons (substring (cadar in) 0 i) + (cons (substring (cadar in) (1+ i)) (cddar in)) + state) + #f sig-ws?) + (loop (cdr in) out (acons (cadar in) (cddar in) state) + #f sig-ws?))) + (error "expected a constant to define for @set" in))) + ((value) + (loop (fold-right cons (cdr in) + (or (and=> + (assoc (cadr (assq 'key (cdadar in))) state) cdr) + (error "unknown value" (cdadar in) state))) + out + state #f sig-ws?)) + ((copying) + (loop (cdr in) out (cons (car in) state) #f sig-ws?)) + ((insertcopying) + (loop (fold-right cons (cdr in) + (or (cdr (assoc 'copying state)) + (error "copying isn't set yet"))) + out + state #f sig-ws?)) + ((contents) + (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) + (else + (let*-values (((kid-out state) + (loop (car in) '() state #t + (or sig-ws? (space-significant? (caar in)))))) + (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) + (else ; a symbol + (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) + + (call-with-values + (lambda () (loop tree '() '() #t #f)) + (lambda (out state) out))) + +;; Replace % with texinfo-arguments. +(define (stexi->sxml tree) + "Transform the stexi tree @var{tree} into sxml. This involves +replacing the @code{%} element that keeps the texinfo arguments with an +element for each argument. + +FIXME: right now it just changes % to @code{texinfo-arguments} -- that +doesn't hang with the idea of making a dtd at some point" + (pre-post-order + tree + `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) + (*text* . ,(lambda (x t) t)) + (*default* . ,(lambda (x . t) (cons x t)))))) + +;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 +;;; texinfo.scm ends here diff --git a/module/texinfo/docbook.scm b/module/texinfo/docbook.scm new file mode 100644 index 000000000..f760e5bd6 --- /dev/null +++ b/module/texinfo/docbook.scm @@ -0,0 +1,233 @@ +;;;; (texinfo docbook) -- translating sdocbook into stexinfo +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2007, 2009 Andy Wingo <wingo at pobox dot com> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;; @c +;; This module exports procedures for transforming a limited subset of +;; the SXML representation of docbook into stexi. It is not complete by +;; any means. The intention is to gather a number of routines and +;; stylesheets so that external modules can parse specific subsets of +;; docbook, for example that set generated by certain tools. +;; +;;; Code: + +(define-module (texinfo docbook) + :use-module (sxml fold) + :export (*sdocbook->stexi-rules* + *sdocbook-block-commands* + sdocbook-flatten + filter-empty-elements + replace-titles)) + +(define (identity . args) + args) + +(define (identity-deattr tag . body) + `(,tag ,@(if (and (pair? body) (pair? (car body)) + (eq? (caar body) '@)) + (cdr body) + body))) + +(define (detag-one tag body) + body) + +(define tag-replacements + '((parameter var) + (replaceable var) + (type code) + (function code) + (literal samp) + (emphasis emph) + (simpara para) + (programlisting example) + (firstterm dfn) + (filename file) + (quote cite) + (application cite) + (symbol code) + (note cartouche) + (envar env))) + +(define ignore-list '()) + +(define (stringify exp) + (with-output-to-string (lambda () (write exp)))) + +(define *sdocbook->stexi-rules* + #; + "A stylesheet for use with SSAX's @code{pre-post-order}, which defines +a number of generic rules for transforming docbook into texinfo." + `((@ *preorder* . ,identity) + (% *preorder* . ,identity) + (para . ,identity-deattr) + (orderedlist ((listitem + . ,(lambda (tag . body) + `(item ,@body)))) + . ,(lambda (tag . body) + `(enumerate ,@body))) + (itemizedlist ((listitem + . ,(lambda (tag . body) + `(item ,@body)))) + . ,(lambda (tag . body) + `(itemize ,@body))) + (term . ,detag-one) + (informalexample . ,detag-one) + (section . ,identity) + (subsection . ,identity) + (subsubsection . ,identity) + (ulink . ,(lambda (tag attrs . body) + `(uref (% ,(assq 'url (cdr attrs)) + (title ,@body))))) + (*text* . ,detag-one) + (*default* . ,(lambda (tag . body) + (let ((subst (assq tag tag-replacements))) + (cond + (subst + (if (and (pair? body) (pair? (car body)) (eq? (caar body) '@)) + (begin + (warn "Ignoring" tag "attributes" (car body)) + (append (cdr subst) (cdr body))) + (append (cdr subst) body))) + ((memq tag ignore-list) #f) + (else + (warn "Don't know how to convert" tag "to stexi") + `(c (% (all ,(stringify (cons tag body)))))))))))) + +;; (variablelist +;; ((varlistentry +;; . ,(lambda (tag term . body) +;; `(entry (% (heading ,@(cdr term))) ,@body))) +;; (listitem +;; . ,(lambda (tag simpara) +;; simpara))) +;; . ,(lambda (tag attrs . body) +;; `(table (% (formatter (var))) ,@body))) + +(define *sdocbook-block-commands* + #; + "The set of sdocbook element tags that should not be nested inside +each other. @xref{texinfo docbook sdocbook-flatten,,sdocbook-flatten}, +for more information." + '(para programlisting informalexample indexterm variablelist + orderedlist refsect1 refsect2 refsect3 refsect4 title example + note itemizedlist)) + +(define (inline-command? command) + (not (memq command *sdocbook-block-commands*))) + +(define (sdocbook-flatten sdocbook) + "\"Flatten\" a fragment of sdocbook so that block elements do not nest +inside each other. + +Docbook is a nested format, where e.g. a @code{refsect2} normally +appears inside a @code{refsect1}. Logical divisions in the document are +represented via the tree topology; a @code{refsect2} element +@emph{contains} all of the elements in its section. + +On the contrary, texinfo is a flat format, in which sections are marked +off by standalone section headers like @code{@@chapter}, and block +elements do not nest inside each other. + +This function takes a nested sdocbook fragment @var{sdocbook} and +flattens all of the sections, such that e.g. +@example + (refsect1 (refsect2 (para \"Hello\"))) +@end example +becomes +@example + ((refsect1) (refsect2) (para \"Hello\")) +@end example + +Oftentimes (always?) sectioning elements have @code{<title>} as their +first element child; users interested in processing the @code{refsect*} +elements into proper sectioning elements like @code{chapter} might be +interested in @code{replace-titles} and @code{filter-empty-elements}. +@xref{texinfo docbook replace-titles,,replace-titles}, and @ref{texinfo +docbook filter-empty-elements,,filter-empty-elements}. + +Returns a nodeset, as described in @ref{sxml xpath}. That is to say, +this function returns an untagged list of stexi elements." + (define (fhere str accum block cont) + (values (cons str accum) + block + cont)) + (define (fdown node accum block cont) + (let ((command (car node)) + (attrs (and (pair? (cdr node)) (pair? (cadr node)) + (eq? (caadr node) '%) + (cadr node)))) + (values (if attrs (cddr node) (cdr node)) + '() + '() + (lambda (accum block) + (values + `(,command ,@(if attrs (list attrs) '()) + ,@(reverse accum)) + block))))) + (define (fup node paccum pblock pcont kaccum kblock kcont) + (call-with-values (lambda () (kcont kaccum kblock)) + (lambda (ret block) + (if (inline-command? (car ret)) + (values (cons ret paccum) (append kblock pblock) pcont) + (values paccum (append kblock (cons ret pblock)) pcont))))) + (call-with-values + (lambda () (foldts*-values fdown fup fhere sdocbook '() '() #f)) + (lambda (accum block cont) + (reverse block)))) + +(define (filter-empty-elements sdocbook) + "Filters out empty elements in an sdocbook nodeset. Mostly useful +after running @code{sdocbook-flatten}." + (reverse + (fold + (lambda (x rest) + (if (and (pair? x) (null? (cdr x))) + rest + (cons x rest))) + '() + sdocbook))) + +(define (replace-titles sdocbook-fragment) + "Iterate over the sdocbook nodeset @var{sdocbook-fragment}, +transforming contiguous @code{refsect} and @code{title} elements into +the appropriate texinfo sectioning command. Most useful after having run +@code{sdocbook-flatten}. + +For example: +@example + (replace-titles '((refsect1) (title \"Foo\") (para \"Bar.\"))) + @result{} '((chapter \"Foo\") (para \"Bar.\")) +@end example +" + (define sections '((refsect1 . chapter) + (refsect2 . section) + (refsect3 . subsection) + (refsect4 . subsubsection))) + (let lp ((in sdocbook-fragment) (out '())) + (cond + ((null? in) + (reverse out)) + ((and (pair? (car in)) (assq (caar in) sections)) + ;; pull out the title + => (lambda (pair) + (lp (cddr in) (cons `(,(cdr pair) ,@(cdadr in)) out)))) + (else + (lp (cdr in) (cons (car in) out)))))) diff --git a/module/texinfo/html.scm b/module/texinfo/html.scm new file mode 100644 index 000000000..f9faf6a36 --- /dev/null +++ b/module/texinfo/html.scm @@ -0,0 +1,259 @@ +;;;; (texinfo html) -- translating stexinfo into shtml +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;This module implements transformation from @code{stexi} to HTML. Note +;;that the output of @code{stexi->shtml} is actually SXML with the HTML +;;vocabulary. This means that the output can be further processed, and +;;that it must eventually be serialized by +;;@ref{sxml simple sxml->xml,sxml->xml}. +;; +;;References (i.e., the @code{@@ref} family of commands) are resolved by +;;a @dfn{ref-resolver}. +;;@xref{texinfo html add-ref-resolver!,add-ref-resolver!}, for more +;;information. +;; +;;; Code: + +;; TODO: nice ref resolving API, default CSS stylesheet (esp. to remove +;; margin-top on dd > p) + +(define-module (texinfo html) + :use-module (texinfo) + :use-module (sxml transform) + :use-module (srfi srfi-13) + :export (stexi->shtml add-ref-resolver! urlify)) + +;; The caller is responsible for carring the returned list. +(define (arg-ref key %-args) + (and=> (assq key (cdr %-args)) (lambda (x) (stexi->shtml (cdr x))))) +(define (arg-req key %-args) + (or (arg-ref key %-args) + (error "Missing argument:" key %-args))) +(define (car* x) (and x (car x))) + +(define (urlify str) + (string-downcase + (string-map + (lambda (c) + (case c + ((#\space #\/ #\:) #\-) + (else c))) + str))) + +(define ref-resolvers + (list + (lambda (node-name manual-name) ;; the default + (urlify (string-append (or manual-name "") "#" node-name))))) + +(define (add-ref-resolver! proc) + "Add @var{proc} to the head of the list of ref-resolvers. @var{proc} +will be expected to take the name of a node and the name of a manual and +return the URL of the referent, or @code{#f} to pass control to the next +ref-resolver in the list. + +The default ref-resolver will return the concatenation of the manual +name, @code{#}, and the node name." + (set! ref-resolvers (cons proc ref-resolvers))) + +(define (resolve-ref node manual) + (or (or-map (lambda (x) (x node manual)) ref-resolvers) + (error "Could not resolve reference" node manual))) + +(define (ref tag args) + (let* ((node (car (arg-req 'node args))) + (section (or (car* (arg-ref 'section args)) node)) + (manual (car* (arg-ref 'manual args))) + (target (resolve-ref node manual))) + `(span ,(and=> (assq tag '((xref "See ") (pxref "see "))) cdr) + (a (@ (href ,target)) ,section)))) + +(define (uref tag args) + (let ((url (car (arg-req 'url args)))) + `(a (@ (href ,url)) ,(or (car* (arg-ref 'title args)) url)))) + +;; @!*&%( Mozilla gets confused at an empty ("<a .. />") a tag. Put an +;; empty string here to placate the reptile. +(define (node tag args) + `(a (@ (name ,(urlify (car (arg-req 'name args))))) "")) + +(define (def tag args . body) + (define (code x) (and x (cons 'code x))) + (define (var x) (and x (cons 'var x))) + (define (b x) (and x (cons 'b x))) + (define (list/spaces . elts) + (let lp ((in elts) (out '())) + (cond ((null? in) (reverse! out)) + ((null? (car in)) (lp (cdr in) out)) + (else (lp (cdr in) + (cons (car in) + (if (null? out) out (cons " " out)))))))) + (define (left-td-contents) + (list/spaces (code (arg-ref 'data-type args)) + (b (list (code (arg-ref 'class args)))) ;; is this right? + (b (list (code (arg-ref 'name args)))) + (if (memq tag '(deftypeop deftypefn deftypefun)) + (code (arg-ref 'arguments args)) + (var (list (code (arg-ref 'arguments args))))))) + + (let* ((category (case tag + ((defun) "Function") + ((defspec) "Special Form") + ((defvar) "Variable") + (else (car (arg-req 'category args)))))) + `(div + (table + (@ (cellpadding "0") (cellspacing "0") (width "100%") (class "def")) + (tr (td ,@(left-td-contents)) + (td (div (@ (class "right")) "[" ,category "]")))) + (div (@ (class "description")) ,@body)))) + +(define (enumerate tag . elts) + (define (tonumber start) + (let ((c (string-ref start 0))) + (cond ((number? c) (string->number start)) + (else (1+ (- (char->integer c) + (char->integer (if (char-upper-case? c) #\A #\a)))))))) + `(ol ,@(if (and (pair? elts) (pair? (car elts)) (eq? (caar elts) '%)) + (cons `(@ (start ,@(tonumber (arg-req 'start (car elts))))) + ;; (type ,(type (arg-ref 'start (car elts))))) + (cdr elts)) + elts))) + +(define (table tag args . body) + (let ((formatter (caar (arg-req 'formatter args)))) + (cons 'dl + (map (lambda (x) + (cond ((and (pair? x) (eq? (car x) 'dt)) + (list (car x) (cons formatter (cdr x)))) + (else x))) + (apply append body))))) + +(define (entry tag args . body) + `((dt ,@(arg-req 'heading args)) + (dd ,@body))) + +(define tag-replacements + '((titlepage div (@ (class "titlepage"))) + (title h2 (@ (class "title"))) + (subtitle h3 (@ (class "subtitle"))) + (author h3 (@ (class "author"))) + (example pre) + (lisp pre) + (smallexample pre (@ (class "smaller"))) + (smalllisp pre (@ (class "smaller"))) + (cartouche div (@ (class "cartouche"))) + (verbatim pre (@ (class "verbatim"))) + (chapter h2) + (section h3) + (subsection h4) + (subsubsection h5) + (appendix h2) + (appendixsec h3) + (appendixsubsec h4) + (appendixsubsubsec h5) + (unnumbered h2) + (unnumberedsec h3) + (unnumberedsubsec h4) + (unnumberedsubsubsec h5) + (majorheading h2) + (chapheading h2) + (heading h3) + (subheading h4) + (subsubheading h5) + (quotation blockquote) + (itemize ul) + (item li) ;; itemx ? + (para p) + (*fragment* div) ;; should be ok + + (asis span) + (bold b) + (sample samp) + (samp samp) + (code code) + (kbd kbd) + (key code (@ (class "key"))) + (var var) + (env code (@ (class "env"))) + (file code (@ (class "file"))) + (command code (@ (class "command"))) + (option code (@ (class "option"))) + (url code (@ (class "url"))) + (dfn dfn) + (cite cite) + (acro acronym) + (email code (@ (class "email"))) + (emph em) + (strong strong) + (sc span (@ (class "small-caps"))))) + +(define ignore-list + '(page setfilename setchapternewpage iftex ifinfo ifplaintext ifxml sp vskip + menu ignore syncodeindex comment c dircategory direntry top shortcontents + cindex printindex)) +(define (ignored? tag) + (memq tag ignore-list)) + +(define rules + `((% *preorder* . ,(lambda args args)) ;; Keep these around... + (texinfo . ,(lambda (tag args . body) + (pre-post-order + `(html + (@ (xmlns "http://www.w3.org/1999/xhtml")) + (head (title ,(car (arg-req 'title args)))) + (body ,@body)) + `((% *preorder* . ,(lambda args #f)) ;; ... filter out. + (*text* . ,(lambda (tag x) x)) + (*default* . ,(lambda (tag . body) + (cons tag body))))))) + (copyright . ,(lambda args '(*ENTITY* "copy"))) + (result . ,(lambda args '(*ENTITY* "rArr"))) + (xref . ,ref) (ref . ,ref) (pxref . ,ref) + (uref . ,uref) + (node . ,node) (anchor . ,node) + (table . ,table) + (enumerate . ,enumerate) + (entry . ,entry) + + (deftp . ,def) (defcv . ,def) (defivar . ,def) (deftypeivar . ,def) + (defop . ,def) (deftypeop . ,def) (defmethod . ,def) + (deftypemethod . ,def) (defopt . ,def) (defvr . ,def) (defvar . ,def) + (deftypevr . ,def) (deftypevar . ,def) (deffn . ,def) + (deftypefn . ,def) (defmac . ,def) (defspec . ,def) (defun . ,def) + (deftypefun . ,def) + (ifnottex . ,(lambda (tag . body) body)) + (*text* . ,(lambda (tag x) x)) + (*default* . ,(lambda (tag . body) + (let ((subst (assq tag tag-replacements))) + (cond + (subst (append (cdr subst) body)) + ((memq tag ignore-list) #f) + (else + (warn "Don't know how to convert" tag "to HTML") + body))))))) + +(define (stexi->shtml tree) + "Transform the stexi @var{tree} into shtml, resolving references via +ref-resolvers. See the module commentary for more details." + (pre-post-order tree rules)) + +;;; arch-tag: ab05f3fe-9981-4a78-b64c-48efcd9983a6 diff --git a/module/texinfo/indexing.scm b/module/texinfo/indexing.scm new file mode 100644 index 000000000..bc3d7ab74 --- /dev/null +++ b/module/texinfo/indexing.scm @@ -0,0 +1,78 @@ +;;;; (texinfo indexing) -- indexing stexinfo +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;@c texinfo formatting +;;Given a piece of stexi, return an index of a specified variety. +;; +;;Note that currently, @code{stexi-extract-index} doesn't differentiate +;;between different kinds of index entries. That's a bug ;) +;;; Code: + +(define-module (texinfo indexing) + #:use-module (sxml simple) + #:use-module (srfi srfi-13) + #:export (stexi-extract-index)) + +(define (def-name def) + (cadr (assq 'name (cdadr def)))) + +(define defines + '(deftp defcv defivar deftypeivar defop deftypeop defmethod + deftypemethod defopt defvr defvar deftypevr deftypevar deffn + deftypefn defspec defmac defun deftypefun)) + +(define indices + '(cindex findex vindex kindex pindex tindex)) + +(define (stexi-extract-index tree manual-name kind) + "Given an stexi tree @var{tree}, index all of the entries of type +@var{kind}. @var{kind} can be one of the predefined texinfo indices +(@code{concept}, @code{variable}, @code{function}, @code{key}, +@code{program}, @code{type}) or one of the special symbols @code{auto} +or @code{all}. @code{auto} will scan the stext for a @code{(printindex)} +statement, and @code{all} will generate an index from all entries, +regardless of type. + +The returned index is a list of pairs, the @sc{car} of which is the +entry (a string) and the @sc{cdr} of which is a node name (a string)." + (let loop ((in tree) (entries '())) + (cond + ((null? in) + entries) + ((pair? (car in)) + (cond + ((and (pair? (cdr in)) (pair? (cadr in)) + (eq? (caar in) 'anchor) (memq (caadr in) defines)) + (loop (cddr in) (acons (cadr (assq 'name (cdr (cadadr in)))) + (cadr (assq 'name (cdadar in))) + entries))) + ((and (pair? (cdr in)) (pair? (cadr in)) + (eq? (caar in) 'anchor) (memq (caadr in) indices)) + (loop (cddr in) (acons (sxml->string (cadr in)) + (cadr (assq 'name (cdadar in))) + entries))) + (else + (loop (cdr in) (loop (car in) entries))))) + (else + (loop (cdr in) entries))))) + +;;; arch-tag: 216d29d3-1ed9-433f-9c19-0dc4d6b439b6 diff --git a/module/texinfo/plain-text.scm b/module/texinfo/plain-text.scm new file mode 100644 index 000000000..dfa4c9acc --- /dev/null +++ b/module/texinfo/plain-text.scm @@ -0,0 +1,319 @@ +;;;; (texinfo plain-text) -- rendering stexinfo as plain text +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;Transformation from stexi to plain-text. Strives to re-create the +;;output from @code{info}; comes pretty damn close. +;; +;;; Code: + +(define-module (texinfo plain-text) + #:use-module (texinfo) + #:use-module (texinfo string-utils) + #:use-module (sxml transform) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-13) + #:export (stexi->plain-text)) + +;; The return value is a string. +(define (arg-ref key %-args) + (and=> (and=> (assq key (cdr %-args)) cdr) + stexi->plain-text)) +(define (arg-req key %-args) + (or (arg-ref key %-args) + (error "Missing argument:" key %-args))) + +(define *indent* (make-fluid)) +(define *itemizer* (make-fluid)) + +(define (make-ticker str) + (lambda () str)) +(define (make-enumerator n) + (lambda () + (let ((last n)) + (set! n (1+ n)) + (format #f "~A. " last)))) + +(fluid-set! *indent* "") +;; Shouldn't be necessary to do this, but just in case. +(fluid-set! *itemizer* (make-ticker "* ")) + +(define-macro (with-indent n . body) + `(with-fluids ((*indent* (string-append (fluid-ref *indent*) + (make-string ,n #\space)))) + ,@body)) + +(define (make-indenter n proc) + (lambda args (with-indent n (apply proc args)))) + +(define (string-indent str) + (string-append (fluid-ref *indent*) str "\n")) + +(define-macro (with-itemizer itemizer . body) + `(with-fluids ((*itemizer* ,itemizer)) + ,@body)) + +(define (wrap* . strings) + (let ((indent (fluid-ref *indent*))) + (fill-string (string-concatenate strings) + #:line-width 72 #:initial-indent indent + #:subsequent-indent indent))) +(define (wrap . strings) + (string-append (apply wrap* strings) "\n\n")) +(define (wrap-heading . strings) + (string-append (apply wrap* strings) "\n")) + +(define (ref tag args) + (let* ((node (arg-req 'node args)) + (name (or (arg-ref 'name args) node)) + (manual (arg-ref 'manual args))) + (string-concatenate + (cons* + (or (and=> (assq tag '((xref "See ") (pxref "see "))) cadr) "") + name + (if manual `(" in manual " ,manual) '()))))) + +(define (uref tag args) + (let ((url (arg-req 'url args)) + (title (arg-ref 'title args))) + (if title + (string-append title " (" url ")") + (string-append "`" url "'")))) + +(define (def tag args . body) + (define (list/spaces . elts) + (let lp ((in elts) (out '())) + (cond ((null? in) (reverse! out)) + ((null? (car in)) (lp (cdr in) out)) + (else (lp (cdr in) + (cons (car in) + (if (null? out) out (cons " " out)))))))) + (define (first-line) + (string-join + (filter identity + (map (lambda (x) (arg-ref x args)) + '(data-type class name arguments))) + " ")) + + (let* ((category (case tag + ((defun) "Function") + ((defspec) "Special Form") + ((defvar) "Variable") + (else (arg-req 'category args))))) + (string-append + (wrap-heading (string-append " - " category ": " (first-line))) + (with-indent 5 (stexi->plain-text body))))) + +(define (enumerate tag . elts) + (define (tonumber start) + (let ((c (string-ref start 0))) + (cond ((number? c) (string->number start)) + (else (1+ (- (char->integer c) + (char->integer (if (char-upper-case? c) #\A #\a)))))))) + (let* ((args? (and (pair? elts) (pair? (car elts)) + (eq? (caar elts) '%))) + (start (and args? (arg-ref 'start (car elts))))) + (with-itemizer (make-enumerator (if start (tonumber start) 1)) + (with-indent 5 + (stexi->plain-text (if start (cdr elts) elts)))))) + +(define (itemize tag args . elts) + (with-itemizer (make-ticker "* ") + (with-indent 5 + (stexi->plain-text elts)))) + +(define (item tag . elts) + (let* ((ret (stexi->plain-text elts)) + (tick ((fluid-ref *itemizer*))) + (tick-pos (- (string-length (fluid-ref *indent*)) + (string-length tick)))) + (if (and (not (string-null? ret)) (not (negative? tick-pos))) + (string-copy! ret tick-pos tick)) + ret)) + +(define (table tag args . body) + (stexi->plain-text body)) + +(define (entry tag args . body) + (let ((heading (wrap-heading + (stexi->plain-text (arg-req 'heading args))))) + (string-append heading + (with-indent 5 (stexi->plain-text body))))) + +(define (make-underliner char) + (lambda (tag . body) + (let ((str (stexi->plain-text body))) + (string-append + "\n" + (string-indent str) + (string-indent (make-string (string-length str) char)) + "\n")))) + +(define chapter (make-underliner #\*)) +(define section (make-underliner #\=)) +(define subsection (make-underliner #\-)) +(define subsubsection (make-underliner #\.)) + +(define (example tag . body) + (let ((ret (stexi->plain-text body))) + (string-append + (string-concatenate + (with-indent 5 (map string-indent (string-split ret #\newline)))) + "\n"))) + +(define (verbatim tag . body) + (let ((ret (stexi->plain-text body))) + (string-append + (string-concatenate + (map string-indent (string-split ret #\newline))) + "\n"))) + +(define (fragment tag . body) + (string-concatenate (map-in-order stexi->plain-text body))) + +(define (para tag . body) + (wrap (stexi->plain-text body))) + +(define (make-surrounder str) + (lambda (tag . body) + (string-append str (stexi->plain-text body) str))) + +(define (code tag . body) + (string-append "`" (stexi->plain-text body) "'")) + +(define (key tag . body) + (string-append "<" (stexi->plain-text body) ">")) + +(define (var tag . body) + (string-upcase (stexi->plain-text body))) + +(define (passthrough tag . body) + (stexi->plain-text body)) + +(define (ignore . args) + "") + +(define (texinfo tag args . body) + (let ((title (chapter 'foo (arg-req 'title args)))) + (string-append title (stexi->plain-text body)))) + +(define ignore-list + '(page setfilename setchapternewpage iftex ifinfo ifplaintext ifxml sp vskip + menu ignore syncodeindex comment c % node anchor)) +(define (ignored? tag) + (memq tag ignore-list)) + +(define tag-handlers + `((title ,chapter) + (chapter ,chapter) + (section ,section) + (subsection ,subsection) + (subsubsection ,subsubsection) + (appendix ,chapter) + (appendixsec ,section) + (appendixsubsec ,subsection) + (appendixsubsubsec ,subsubsection) + (unnumbered ,chapter) + (unnumberedsec ,section) + (unnumberedsubsec ,subsection) + (unnumberedsubsubsec ,subsubsection) + (majorheading ,chapter) + (chapheading ,chapter) + (heading ,section) + (subheading ,subsection) + (subsubheading ,subsubsection) + + (strong ,(make-surrounder "*")) + (sample ,code) + (samp ,code) + (code ,code) + (kbd ,code) + (key ,key) + (var ,var) + (env ,code) + (file ,code) + (command ,code) + (option ,code) + (url ,code) + (dfn ,(make-surrounder "\"")) + (cite ,(make-surrounder "\"")) + (acro ,passthrough) + (email ,key) + (emph ,(make-surrounder "_")) + (sc ,var) + (copyright ,(lambda args "(C)")) + (result ,(lambda args "==>")) + (xref ,ref) + (ref ,ref) + (pxref ,ref) + (uref ,uref) + + (texinfo ,texinfo) + (quotation ,(make-indenter 5 para)) + (itemize ,itemize) + (enumerate ,enumerate) + (item ,item) + (table ,table) + (entry ,entry) + (example ,example) + (lisp ,example) + (smallexample ,example) + (smalllisp ,example) + (verbatim ,verbatim) + (*fragment* ,fragment) + + (deftp ,def) + (defcv ,def) + (defivar ,def) + (deftypeivar ,def) + (defop ,def) + (deftypeop ,def) + (defmethod ,def) + (deftypemethod ,def) + (defopt ,def) + (defvr ,def) + (defvar ,def) + (deftypevr ,def) + (deftypevar ,def) + (deffn ,def) + (deftypefn ,def) + (defmac ,def) + (defspec ,def) + (defun ,def) + (deftypefun ,def))) + +(define (stexi->plain-text tree) + "Transform @var{tree} into plain text. Returns a string." + (cond + ((null? tree) "") + ((string? tree) tree) + ((pair? tree) + (cond + ((symbol? (car tree)) + (let ((handler (and (not (ignored? (car tree))) + (or (and=> (assq (car tree) tag-handlers) cadr) + para)))) + (if handler (apply handler tree) ""))) + (else + (string-concatenate (map-in-order stexi->plain-text tree))))) + (else ""))) + +;;; arch-tag: f966c3f6-3b46-4790-bbf9-3ad27e4917c2 diff --git a/module/texinfo/reflection.scm b/module/texinfo/reflection.scm new file mode 100644 index 000000000..bda1830ea --- /dev/null +++ b/module/texinfo/reflection.scm @@ -0,0 +1,528 @@ +;;;; (texinfo reflection) -- documenting Scheme as stexinfo +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;Routines to generare @code{stexi} documentation for objects and +;;modules. +;; +;;Note that in this context, an @dfn{object} is just a value associated +;;with a location. It has nothing to do with GOOPS. +;; +;;; Code: + +(define-module (texinfo reflection) + #:use-module ((srfi srfi-1) #:select (append-map)) + #:use-module (oop goops) + #:use-module (texinfo) + #:use-module (texinfo plain-text) + #:use-module (srfi srfi-13) + #:use-module (ice-9 session) + #:use-module (ice-9 documentation) + #:use-module (ice-9 optargs) + #:use-module ((sxml transform) #:select (pre-post-order)) + #:export (module-stexi-documentation + script-stexi-documentation + object-stexi-documentation + package-stexi-standard-copying + package-stexi-standard-titlepage + package-stexi-generic-menu + package-stexi-standard-menu + package-stexi-extended-menu + package-stexi-standard-prologue + package-stexi-documentation)) + +;; List for sorting the definitions in a module +(define defs + '(deftp defcv defivar deftypeivar defop deftypeop defmethod + deftypemethod defopt defvr defvar deftypevr deftypevar deffn + deftypefn defmac defspec defun deftypefun)) + +(define (sort-defs ordering a b) + (define (def x) + ;; a and b are lists of the form ((anchor ...) (def* ...)...) + (cadr x)) + (define (name x) + (cadr (assq 'name (cdadr (def x))))) + (define (priority x) + (list-index defs (car (def x)))) + (define (order x) + (or (list-index ordering (string->symbol (name x))) + ;; if the def is not in the list, a big number + 1234567890)) + (define (compare-in-order proc eq? < . args) + (if (not (eq? (proc a) (proc b))) + (< (proc a) (proc b)) + (or (null? args) + (apply compare-in-order args)))) + (compare-in-order order = < + priority = < + name string=? string<=?)) + +(define (list*-join l infix restfix) + (let lp ((in l) (out '())) + (cond ((null? in) (reverse! out)) + ((symbol? in) (reverse! (cons* in restfix out))) + (else (lp (cdr in) (if (null? out) + (list (car in)) + (cons* (car in) infix out))))))) + +(define (process-args args) + (map (lambda (x) (if (symbol? x) (symbol->string x) x)) + (list*-join (or args '()) + " " " . "))) + +(define (get-proc-args proc) + (cond + ((procedure-property proc 'arglist) + => (lambda (arglist) + (let ((required-args (car arglist)) + (optional-args (cadr arglist)) + (keyword-args (caddr arglist)) + (rest-arg (car (cddddr arglist)))) + (process-args + (append + ;; start with the required args... + (map symbol->string required-args) + + ;; add any optional args if needed... + (map (lambda (a) + (if (list? a) + (format #f "[~a = ~s]" (car a) (cadr a)) + (format #f "[~a]" a))) + optional-args) + + ;; now the keyword args.. + (map (lambda (a) + (if (list? a) + (format #f "[#:~a = ~s]" (car a) (cadr a)) + (format #f "[#:~a]" a))) + keyword-args) + + ;; now the rest arg... + (if rest-arg + (list "." (symbol->string rest-arg)) + '())))))) + (else + (process-args (and=> (procedure-source proc) cadr))))) + +;; like the normal false-if-exception, but doesn't affect the-last-stack +(define-macro (false-if-exception exp) + `(catch #t + (lambda () + (with-fluids ((the-last-stack (fluid-ref the-last-stack))) + ,exp)) + (lambda args #f))) + +;; This is really nasty, I wish guile gave a better way to get this... +(define (get-macro-args macro) + (process-args + (case (macro-type macro) + ((syncase-macro) + (case (syncase-macro-type macro) + ((macro) + (get-proc-args (car (syncase-macro-binding macro)))) + (else #f))) + (else #f)))) + +(define many-space? (make-regexp "[[:space:]][[:space:]][[:space:]]")) +(define initial-space? (make-regexp "^[[:space:]]")) +(define (string->stexi str) + (or (and (or (not str) (string-null? str)) + '(*fragment*)) + (and (or (string-index str #\@) + (and (not (regexp-exec many-space? str)) + (not (regexp-exec initial-space? str)))) + (false-if-exception + (texi-fragment->stexi str))) + `(*fragment* (verbatim ,str)))) + +(define method-formals + (and (defined? 'method-formals) method-formals)) + +(define (method-stexi-arguments method) + (cond + (method-formals + (let lp ((formals (method-formals method)) + (specializers (method-specializers method)) + (out '())) + (define (arg-texinfo formal specializer) + `(" (" (var ,(symbol->string formal)) " " + (code ,(symbol->string (class-name specializer))) ")")) + (cond + ((null? formals) (reverse out)) + ((pair? formals) + (lp (cdr formals) (cdr specializers) + (append (reverse (arg-texinfo (car formals) (car specializers))) + out))) + (else + (append (reverse out) (arg-texinfo formals specializers) + (list "...")))))) + ((method-source method) + (let lp ((bindings (cadr (method-source method))) (out '())) + (define (arg-texinfo arg) + `(" (" (var ,(symbol->string (car arg))) " " + (code ,(symbol->string (cadr arg))) ")")) + (cond + ((null? bindings) + (reverse out)) + ((not (pair? (car bindings))) + (append (reverse out) (arg-texinfo bindings) (list "..."))) + (else + (lp (cdr bindings) + (append (reverse (arg-texinfo (car bindings))) out)))))) + (else (warn method) '()))) + +(define* (object-stexi-documentation object #:optional (name "[unknown]") + #:key (force #f)) + (if (symbol? name) + (set! name (symbol->string name))) + (let ((stexi ((lambda (x) + (cond ((string? x) (string->stexi x)) + ((and (pair? x) (eq? (car x) '*fragment*)) x) + (force `(*fragment*)) + (else #f))) + (object-documentation + (if (is-a? object <method>) + (method-procedure object) + object))))) + (define (make-def type args) + `(,type (% ,@args) ,@(cdr stexi))) + (cond + ((not stexi) #f) + ;; stexi is now a list, headed by *fragment*. + ((and (pair? (cdr stexi)) (pair? (cadr stexi)) + (memq (caadr stexi) defs)) + ;; it's already a deffoo. + stexi) + ((is-a? object <class>) + (make-def 'deftp `((name ,name) + (category "Class")))) + ((is-a? object <macro>) + (make-def 'defspec `((name ,name) + (arguments ,@(get-macro-args object))))) + ((is-a? object <procedure>) + (make-def 'defun `((name ,name) + (arguments ,@(get-proc-args object))))) + ((is-a? object <method>) + (make-def 'deffn `((category "Method") + (name ,name) + (arguments ,@(method-stexi-arguments object))))) + ((is-a? object <generic>) + `(*fragment* + ,(make-def 'deffn `((name ,name) + (category "Generic"))) + ,@(map + (lambda (method) + (object-stexi-documentation method name #:force force)) + (generic-function-methods object)))) + (else + (make-def 'defvar `((name ,name))))))) + +(define (module-name->node-name sym-name) + (string-join (map symbol->string sym-name) " ")) + +;; this copied from (ice-9 session); need to find a better way +(define (module-filename name) + (let* ((name (map symbol->string name)) + (reverse-name (reverse name)) + (leaf (car reverse-name)) + (dir-hint-module-name (reverse (cdr reverse-name))) + (dir-hint (apply string-append + (map (lambda (elt) + (string-append elt "/")) + dir-hint-module-name)))) + (%search-load-path (in-vicinity dir-hint leaf)))) + +(define (read-module name) + (let ((filename (module-filename name))) + (if filename + (let ((port (open-input-file filename))) + (let lp ((out '()) (form (read port))) + (if (eof-object? form) + (reverse out) + (lp (cons form out) (read port))))) + '()))) + +(define (module-export-list sym-name) + (define (module-form-export-list form) + (and (pair? form) + (eq? (car form) 'define-module) + (equal? (cadr form) sym-name) + (and=> (memq #:export (cddr form)) cadr))) + (let lp ((forms (read-module sym-name))) + (cond ((null? forms) '()) + ((module-form-export-list (car forms)) => identity) + (else (lp (cdr forms)))))) + +(define* (module-stexi-documentation sym-name + #:optional (docs-resolver + (lambda (name def) def))) + "Return documentation for the module named @var{sym-name}. The +documentation will be formatted as @code{stexi} + (@pxref{texinfo,texinfo})." + (let* ((commentary (and=> (module-commentary sym-name) + (lambda (x) (string-trim-both x #\newline)))) + (stexi (string->stexi commentary)) + (node-name (module-name->node-name sym-name)) + (name-str (with-output-to-string + (lambda () (display sym-name)))) + (module (resolve-interface sym-name)) + (export-list (module-export-list sym-name))) + (define (anchor-name sym) + (string-append node-name " " (symbol->string sym))) + (define (make-defs) + (sort! + (module-map + (lambda (sym var) + `((anchor (% (name ,(anchor-name sym)))) + ,@((lambda (x) + (if (eq? (car x) '*fragment*) + (cdr x) + (list x))) + (if (variable-bound? var) + (docs-resolver + sym + (object-stexi-documentation (variable-ref var) sym + #:force #t)) + (begin + (warn "variable unbound!" sym) + `(defvar (% (name ,(symbol->string sym))) + "[unbound!]")))))) + module) + (lambda (a b) (sort-defs export-list a b)))) + + `(texinfo (% (title ,name-str)) + (node (% (name ,node-name))) + (section "Overview") + ,@(cdr stexi) + (section "Usage") + ,@(apply append! (make-defs))))) + +(define (script-stexi-documentation scriptpath) + "Return documentation for given script. The documentation will be +taken from the script's commentary, and will be returned in the +@code{stexi} format (@pxref{texinfo,texinfo})." + (let ((commentary (file-commentary scriptpath))) + `(texinfo (% (title ,(basename scriptpath))) + (node (% (name ,(basename scriptpath)))) + ,@(if commentary + (cdr + (string->stexi + (string-trim-both commentary #\newline))) + '())))) + +(cond + ((defined? 'add-value-help-handler!) + (add-value-help-handler! + (lambda (name value) + (stexi->plain-text + (object-stexi-documentation value name #:force #t)))) + (add-name-help-handler! + (lambda (name) + (and (list? name) + (and-map symbol? name) + (stexi->plain-text (module-stexi-documentation name))))))) + +;; we could be dealing with an old (ice-9 session); fondle it to get +;; module-commentary +(define module-commentary (@@ (ice-9 session) module-commentary)) + +(define (package-stexi-standard-copying name version updated years + copyright-holder permissions) + "Create a standard texinfo @code{copying} section. + +@var{years} is a list of years (as integers) in which the modules +being documented were released. All other arguments are strings." + `(copying + (para "This manual is for " ,name + " (version " ,version ", updated " ,updated ")") + (para "Copyright " ,(string-join (map number->string years) ",") + " " ,copyright-holder) + (quotation + (para ,permissions)))) + +(define (package-stexi-standard-titlepage name version updated authors) + "Create a standard GNU title page. + +@var{authors} is a list of @code{(@var{name} . @var{email})} +pairs. All other arguments are strings. + +Here is an example of the usage of this procedure: + +@smallexample + (package-stexi-standard-titlepage + \"Foolib\" + \"3.2\" + \"26 September 2006\" + '((\"Alyssa P Hacker\" . \"alyssa@@example.com\")) + '(2004 2005 2006) + \"Free Software Foundation, Inc.\" + \"Standard GPL permissions blurb goes here\") +@end smallexample +" + `(;(setchapternewpage (% (all "odd"))) makes manuals too long + (titlepage + (title ,name) + (subtitle "version " ,version ", updated " ,updated) + ,@(map (lambda (pair) + `(author ,(car pair) + " (" (email ,(cdr pair)) ")")) + authors) + (page) + (vskip (% (all "0pt plus 1filll"))) + (insertcopying)))) + +(define (package-stexi-generic-menu name entries) + "Create a menu from a generic alist of entries, the car of which +should be the node name, and the cdr the description. As an exception, +an entry of @code{#f} will produce a separator." + (define (make-entry node description) + `("* " ,node "::" + ,(make-string (max (- 21 (string-length node)) 2) #\space) + ,@description "\n")) + `((ifnottex + (node (% (name "Top"))) + (top (% (title ,name))) + (insertcopying) + (menu + ,@(apply + append + (map + (lambda (entry) + (if entry + (make-entry (car entry) (cdr entry)) + '("\n"))) + entries)))) + (iftex + (shortcontents)))) + + +(define (package-stexi-standard-menu name modules module-descriptions + extra-entries) + "Create a standard top node and menu, suitable for processing +by makeinfo." + (package-stexi-generic-menu + name + (let ((module-entries (map cons + (map module-name->node-name modules) + module-descriptions)) + (separate-sections (lambda (x) (if (null? x) x (cons #f x))))) + `(,@module-entries + ,@(separate-sections extra-entries))))) + +(define (package-stexi-extended-menu name module-pairs script-pairs + extra-entries) + "Create an \"extended\" menu, like the standard menu but with a +section for scripts." + (package-stexi-generic-menu + name + (let ((module-entries (map cons + (map module-name->node-name + (map car module-pairs)) + (map cdr module-pairs))) + (script-entries (map cons + (map basename (map car script-pairs)) + (map cdr script-pairs))) + (separate-sections (lambda (x) (if (null? x) x (cons #f x))))) + `(,@module-entries + ,@(separate-sections script-entries) + ,@(separate-sections extra-entries))))) + +(define (package-stexi-standard-prologue name filename category + description copying titlepage + menu) + "Create a standard prologue, suitable for later serialization +to texinfo and .info creation with makeinfo. + +Returns a list of stexinfo forms suitable for passing to +@code{package-stexi-documentation} as the prologue. @xref{texinfo +reflection package-stexi-documentation}, @ref{texinfo reflection +package-stexi-standard-titlepage,package-stexi-standard-titlepage}, +@ref{texinfo reflection +package-stexi-standard-copying,package-stexi-standard-copying}, +and @ref{texinfo reflection +package-stexi-standard-menu,package-stexi-standard-menu}." + `(,copying + (dircategory (% (category ,category))) + (direntry + "* " ,name ": (" ,filename "). " ,description ".") + ,@titlepage + ,@menu)) + +(define (stexi->chapter stexi) + (pre-post-order + stexi + `((texinfo . ,(lambda (tag attrs node . body) + `(,node + (chapter ,@(assq-ref (cdr attrs) 'title)) + ,@body))) + (*text* . ,(lambda (tag text) text)) + (*default* . ,(lambda args args))))) + +(define* (package-stexi-documentation modules name filename + prologue epilogue + #:key + (module-stexi-documentation-args + '()) + (scripts '())) + "Create stexi documentation for a @dfn{package}, where a +package is a set of modules that is released together. + +@var{modules} is expected to be a list of module names, where a +module name is a list of symbols. The stexi that is returned will +be titled @var{name} and a texinfo filename of @var{filename}. + +@var{prologue} and @var{epilogue} are lists of stexi forms that +will be spliced into the output document before and after the +generated modules documentation, respectively. +@xref{texinfo reflection package-stexi-standard-prologue}, to +create a conventional GNU texinfo prologue. + +@var{module-stexi-documentation-args} is an optional argument that, if +given, will be added to the argument list when +@code{module-texi-documentation} is called. For example, it might be +useful to define a @code{#:docs-resolver} argument." + (define (verify-modules-list l) + (define (all pred l) + (and (pred (car l)) + (or (null? (cdr l)) (all pred (cdr l))))) + (false-if-exception + (all (lambda (x) (all symbol? x)) modules))) + (if (not (verify-modules-list modules)) + (error "expected modules to be a list of a list of symbols" + modules)) + + `(texinfo + (% (title ,name) + (filename ,filename)) + ,@prologue + ,@(append-map (lambda (mod) + (stexi->chapter + (apply module-stexi-documentation + mod module-stexi-documentation-args))) + modules) + ,@(append-map (lambda (script) + (stexi->chapter + (script-stexi-documentation script))) + scripts) + ,@epilogue)) + +;;; arch-tag: bbe2bc03-e16d-4a9e-87b9-55225dc9836c diff --git a/module/texinfo/serialize.scm b/module/texinfo/serialize.scm new file mode 100644 index 000000000..6a32d2346 --- /dev/null +++ b/module/texinfo/serialize.scm @@ -0,0 +1,263 @@ +;;;; (texinfo serialize) -- rendering stexinfo as texinfo +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2003,2004,2009 Andy Wingo <wingo at pobox dot com> +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; +;;Serialization of @code{stexi} to plain texinfo. +;; +;;; Code: + +(define-module (texinfo serialize) + #:use-module (texinfo) + #:use-module (texinfo string-utils) + #:use-module (sxml transform) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-13) + #:export (stexi->texi)) + +(define (list-intersperse src-l elem) + (if (null? src-l) src-l + (let loop ((l (cdr src-l)) (dest (cons (car src-l) '()))) + (if (null? l) (reverse dest) + (loop (cdr l) (cons (car l) (cons elem dest))))))) + +;; converts improper lists to proper lists. +(define (filter* pred l) + (let lp ((in l) (out '())) + (cond ((null? in) + (reverse! out)) + ((pair? in) + (lp (cdr in) (if (pred (car in)) (cons (car in) out) out))) + (else + (lp '() (if (pred in) (cons in out) out)))))) + +;; (list* 'a '(b c) 'd '(e f g)) => '(a b c d e f g) +(define (list* . args) + (let* ((args (reverse args)) + (tail (car args))) + (let lp ((in (cdr args)) (out tail)) + (cond ((null? in) out) + ((pair? (car in)) (lp (cdr in) (append (car in) out))) + ((null? (car in)) (lp (cdr in) out)) + (else (lp (cdr in) (cons (car in) out))))))) + +;; Why? Well, because syntax-case defines `include', and carps about its +;; wrong usage below... +(eval-when (eval load compile) + (define (include exp lp command type formals args accum) + (list* "\n" + (list-intersperse + args + " ") + " " command "@" accum))) + +(define (empty-command exp lp command type formals args accum) + (list* " " command "@" accum)) + +(define (inline-text exp lp command type formals args accum) + (if (not (string=? command "*braces*")) ;; fixme :( + (list* "}" + (append-map (lambda (x) (lp x '())) (reverse (cdr exp))) + "{" command "@" accum) + (list* "@}" + (append-map (lambda (x) (lp x '())) (reverse (cdr exp))) + "@{" accum))) + +(define (inline-args exp lp command type formals args accum) + (list* "}" + (if (not args) "" + (list-intersperse + (map + (lambda (x) + (cond ((not x) "") + ((pair? x) + (if (pair? (cdr x)) + (warn "Strange inline-args!" args)) + (car x)) + (else (error "Invalid inline-args" args)))) + (drop-while not + (map (lambda (x) (assq-ref args x)) + (reverse formals)))) + ",")) + "{" command "@" accum)) + +(define (serialize-text-args lp formals args) + (apply + append + (list-intersperse + (map (lambda (arg) (append-map (lambda (x) (lp x '())) arg)) + (map + reverse + (drop-while + not (map (lambda (x) (assq-ref args x)) + (reverse formals))))) + '(" ")))) + +(define (eol-text-args exp lp command type formals args accum) + (list* "\n" + (serialize-text-args lp formals args) + " " command "@" accum)) + +(define (eol-text exp lp command type formals args accum) + (list* "\n" + (append-map (lambda (x) (lp x '())) + (reverse (if args (cddr exp) (cdr exp)))) + " " command "@" accum)) + +(define (eol-args exp lp command type formals args accum) + (list* "\n" + (list-intersperse + (apply append + (drop-while not + (map (lambda (x) (assq-ref args x)) + (reverse formals)))) + ", ") + " " command "@" accum)) + +(define (environ exp lp command type formals args accum) + (case (car exp) + ((texinfo) + (list* "@bye\n" + (append-map (lambda (x) (lp x '())) (reverse (cddr exp))) + "\n@c %**end of header\n\n" + (reverse (assq-ref args 'title)) "@settitle " + (or (and=> (assq-ref args 'filename) + (lambda (filename) + (cons "\n" (reverse (cons "@setfilename " filename))))) + "") + "\\input texinfo @c -*-texinfo-*-\n@c %**start of header\n" + accum)) + (else + (list* "\n\n" command "@end " + (let ((body (append-map (lambda (x) (lp x '())) + (reverse (if args (cddr exp) (cdr exp)))))) + (if (or (null? body) + (eqv? (string-ref (car body) + (1- (string-length (car body)))) + #\newline)) + body + (cons "\n" body))) + "\n" + (serialize-text-args lp formals args) + " " command "@" accum)))) + +(define (table-environ exp lp command type formals args accum) + (list* "\n\n" command "@end " + (append-map (lambda (x) (lp x '())) + (reverse (if args (cddr exp) (cdr exp)))) + "\n" + (let* ((arg (if args (cadar args) ""))) ;; zero or one args + (if (pair? arg) + (list (symbol->string (car arg)) "@") + arg)) + " " command "@" accum)) + +(define (wrap strings) + (fill-string (string-concatenate strings) + #:line-width 72)) + +(define (paragraph exp lp command type formals args accum) + (list* "\n\n" + (wrap + (reverse + (append-map (lambda (x) (lp x '())) (reverse (cdr exp))))) + accum)) + +(define (item exp lp command type formals args accum) + (list* (append-map (lambda (x) (lp x '())) (reverse (cdr exp))) + "@item\n" + accum)) + +(define (entry exp lp command type formals args accum) + (list* (append-map (lambda (x) (lp x '())) (reverse (cddr exp))) + "\n" + (append-map (lambda (x) (lp x '())) (reverse (cdar args))) + "@item " + accum)) + +(define (fragment exp lp command type formals args accum) + (list* "\n@c %end of fragment\n" + (append-map (lambda (x) (lp x '())) (reverse (cdr exp))) + "\n@c %start of fragment\n\n" + accum)) + +(define serializers + `((EMPTY-COMMAND . ,empty-command) + (INLINE-TEXT . ,inline-text) + (INLINE-ARGS . ,inline-args) + (EOL-TEXT . ,eol-text) + (EOL-TEXT-ARGS . ,eol-text-args) + (INDEX . ,eol-text-args) + (EOL-ARGS . ,eol-args) + (ENVIRON . ,environ) + (TABLE-ENVIRON . ,table-environ) + (ENTRY . ,entry) + (ITEM . ,item) + (PARAGRAPH . ,paragraph) + (FRAGMENT . ,fragment) + (#f . ,include))) ; support writing include statements + +(define (serialize exp lp command type formals args accum) + ((or (assq-ref serializers type) + (error "Unknown command type" exp type)) + exp lp command type formals args accum)) + +(define escaped-chars '(#\} #\{ #\@)) +(define (escape str) + "Escapes any illegal texinfo characters (currently @{, @}, and @@)." + (let loop ((in (string->list str)) (out '())) + (if (null? in) + (apply string (reverse out)) + (if (memq (car in) escaped-chars) + (loop (cdr in) (cons* (car in) #\@ out)) + (loop (cdr in) (cons (car in) out)))))) + +(define (stexi->texi tree) + "Serialize the stexi @var{tree} into plain texinfo." + (string-concatenate-reverse + (let lp ((in tree) (out '())) + (cond + ((or (not in) (null? in)) out) + ((string? in) (cons (escape in) out)) + ((pair? in) + (let ((command-spec (assq (car in) texi-command-specs))) + (if (not command-spec) + (begin + (warn "Unknown stexi command, not rendering" in) + out) + (serialize in + lp + (symbol->string (car in)) + (cadr command-spec) + (filter* symbol? (cddr command-spec)) + (cond + ((and (pair? (cdr in)) (pair? (cadr in)) + (eq? (caadr in) '%)) + (cdadr in)) + ((not (cadr command-spec)) + ;; include + (cdr in)) + (else + #f)) + out)))) + (else + (error "Invalid stexi" in)))))) + +;;; arch-tag: d3fa16ea-0bf7-4ec5-ab9f-3f08490f77f5 diff --git a/module/texinfo/string-utils.scm b/module/texinfo/string-utils.scm new file mode 100644 index 000000000..eff914369 --- /dev/null +++ b/module/texinfo/string-utils.scm @@ -0,0 +1,400 @@ +;;;; (texinfo string-utils) -- text filling and wrapping +;;;; +;;;; Copyright (C) 2009 Free Software Foundation, Inc. +;;;; Copyright (C) 2003 Richard Todd +;;;; +;;;; This library is free software; you can redistribute it and/or +;;;; modify it under the terms of the GNU Lesser General Public +;;;; License as published by the Free Software Foundation; either +;;;; version 3 of the License, or (at your option) any later version. +;;;; +;;;; This library is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;;; Lesser General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU Lesser General Public +;;;; License along with this library; if not, write to the Free Software +;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;;;; + +;;; Commentary: +;; Module @samp{(texinfo string-utils)} provides various string-related +;; functions useful to Guile's texinfo support. +;;; Code: + +(define-module (texinfo string-utils) + #:use-module (srfi srfi-13) + #:use-module (srfi srfi-14) + #:use-module (oop goops) + #:export (escape-special-chars + transform-string + expand-tabs + center-string + left-justify-string + right-justify-string + collapse-repeated-chars + make-text-wrapper + fill-string + string->wrapped-lines)) + +(define* (transform-string str match? replace #:optional (start #f) (end #f)) +"Uses @var{match?} against each character in @var{str}, and performs a +replacement on each character for which matches are found. + +@var{match?} may either be a function, a character, a string, or +@code{#t}. If @var{match?} is a function, then it takes a single +character as input, and should return @samp{#t} for matches. +@var{match?} is a character, it is compared to each string character +using @code{char=?}. If @var{match?} is a string, then any character +in that string will be considered a match. @code{#t} will cause +every character to be a match. + +If @var{replace} is a function, it is called with the matched +character as an argument, and the returned value is sent to the output +string via @samp{display}. If @var{replace} is anything else, it is +sent through the output string via @samp{display}. + +Note that te replacement for the matched characters does not need to +be a single character. That is what differentiates this function from +@samp{string-map}, and what makes it useful for applications such as +converting @samp{#\\&} to @samp{\"&\"} in web page text. Some other +functions in this module are just wrappers around common uses of +@samp{transform-string}. Transformations not possible with this +function should probably be done with regular expressions. + +If @var{start} and @var{end} are given, they control which portion +of the string undergoes transformation. The entire input string +is still output, though. So, if @var{start} is @samp{5}, then the +first five characters of @var{str} will still appear in the returned +string. + +@lisp +; these two are equivalent... + (transform-string str #\\space #\\-) ; change all spaces to -'s + (transform-string str (lambda (c) (char=? #\\space c)) #\\-) +@end lisp" + ;; I had implemented this with string-fold, but it was + ;; slower... + (let* ((os (open-output-string)) + (matcher (cond ((char? match?) + (lambda (c) (char=? match? c))) + ((procedure? match?) + match?) + ((string? match?) + (lambda (c) (string-index match? c))) + ((boolean? match?) + (lambda (c) match?)) + (else (throw 'bad-type "expected #t, char, string, or procedure")))) + (replacer (if (procedure? replace) + (lambda (c) (display (replace c) os)) + (lambda (c) (display replace os))))) + + ;; put the first part in, un-transformed if they asked for it... + (if (and start (<= start (string-length str))) + (display (substring str 0 start) os)) + + ;; process the portion they want processed.... + (string-for-each + (lambda (c) + (if (matcher c) + ;; we have a match! replace the char as directed... + (replacer c) + + ;; not a match, just insert the character itself... + (write-char c os))) + str + (or start 0) + (or end (string-length str))) + + ;; if there was any at the end, tack it on... + (if (and end (< end (string-length str))) + (display (substring str end) os)) + + (get-output-string os))) + +(define* (expand-tabs str #:optional (tab-size 8)) +"Returns a copy of @var{str} with all tabs expanded to spaces. @var{tab-size} defaults to 8. + +Assuming tab size of 8, this is equivalent to: @lisp + (transform-string str #\\tab \" \") +@end lisp" + (transform-string str + #\tab + (make-string tab-size #\space))) + +(define (escape-special-chars str special-chars escape-char) +"Returns a copy of @var{str} with all given special characters preceded +by the given @var{escape-char}. + +@var{special-chars} can either be a single character, or a string consisting +of all the special characters. + +@lisp +;; make a string regexp-safe... + (escape-special-chars \"***(Example String)***\" + \"[]()/*.\" + #\\\\) +=> \"\\\\*\\\\*\\\\*\\\\(Example String\\\\)\\\\*\\\\*\\\\*\" + +;; also can escape a singe char... + (escape-special-chars \"richardt@@vzavenue.net\" + #\\@@ + #\\@@) +=> \"richardt@@@@vzavenue.net\" +@end lisp" + (transform-string str + (if (char? special-chars) + ;; if they gave us a char, use char=? + (lambda (c) (char=? c special-chars)) + + ;; if they gave us a string, see if our character is in it + (lambda (c) (string-index special-chars c))) + + ;; replace matches with the character preceded by the escape character + (lambda (c) (string escape-char c)))) + +(define* (center-string str #:optional (width 80) (chr #\space) (rchr #f)) +"Returns a copy of @var{str} centered in a field of @var{width} +characters. Any needed padding is done by character @var{chr}, which +defaults to @samp{#\\space}. If @var{rchr} is provided, then the +padding to the right will use it instead. See the examples below. +left and @var{rchr} on the right. The default @var{width} is 80. The +default @var{lchr} and @var{rchr} is @samp{#\\space}. The string is +never truncated. +@lisp + (center-string \"Richard Todd\" 24) +=> \" Richard Todd \" + + (center-string \" Richard Todd \" 24 #\\=) +=> \"===== Richard Todd =====\" + + (center-string \" Richard Todd \" 24 #\\< #\\>) +=> \"<<<<< Richard Todd >>>>>\" +@end lisp" + (let* ((len (string-length str)) + (lpad (make-string (max (quotient (- width len) 2) 0) chr)) + ;; right-char == char unless it has been provided by the user + (right-chr (or rchr chr)) + (rpad (if (char=? right-chr chr) + lpad + (make-string (max (quotient (- width len) 2) 0) right-chr)))) + (if (>= len width) + str + (string-append lpad str rpad (if (odd? (- width len)) (string right-chr) ""))))) + +(define* (left-justify-string str #:optional (width 80) (chr #\space)) +"@code{left-justify-string str [width chr]}. +Returns a copy of @var{str} padded with @var{chr} such that it is left +justified in a field of @var{width} characters. The default +@var{width} is 80. Unlike @samp{string-pad} from srfi-13, the string +is never truncated." + (let* ((len (string-length str)) + (pad (make-string (max (- width len) 0) chr))) + (if (>= len width) + str + (string-append str pad)))) + +(define* (right-justify-string str #:optional (width 80) (chr #\space)) +"Returns a copy of @var{str} padded with @var{chr} such that it is +right justified in a field of @var{width} characters. The default +@var{width} is 80. The default @var{chr} is @samp{#\\space}. Unlike +@samp{string-pad} from srfi-13, the string is never truncated." + (let* ((len (string-length str)) + (pad (make-string (max (- width len) 0) chr))) + (if (>= len width) + str + (string-append pad str)))) + + (define* (collapse-repeated-chars str #:optional (chr #\space) (num 1)) +"Returns a copy of @var{str} with all repeated instances of +@var{chr} collapsed down to at most @var{num} instances. +The default value for @var{chr} is @samp{#\\space}, and +the default value for @var{num} is 1. + +@lisp + (collapse-repeated-chars \"H e l l o\") +=> \"H e l l o\" + (collapse-repeated-chars \"H--e--l--l--o\" #\\-) +=> \"H-e-l-l-o\" + (collapse-repeated-chars \"H-e--l---l----o\" #\\- 2) +=> \"H-e--l--l--o\" +@end lisp" + ;; define repeat-locator as a stateful match? function which remembers + ;; the last character it had seen. + (let ((repeat-locator + ;; initialize prev-chr to something other than what we're seeking... + (let ((prev-chr (if (char=? chr #\space) #\A #\space)) + (match-count 0)) + (lambda (c) + (if (and (char=? c prev-chr) + (char=? prev-chr chr)) + ;; found enough duplicates if the match-count is high enough + (begin + (set! match-count (+ 1 match-count)) + (>= match-count num)) + + ;; did not find a duplicate + (begin (set! match-count 0) + (set! prev-chr c) + #f)))))) + + ;; transform the string with our stateful matcher... + ;; deleting matches... + (transform-string str repeat-locator ""))) + +;; split a text string into segments that have the form... +;; <ws non-ws> <ws non-ws> etc.. +(define (split-by-single-words str) + (let ((non-wschars (char-set-complement char-set:whitespace))) + (let loop ((ans '()) + (index 0)) + (let ((next-non-ws (string-index str non-wschars index))) + (if next-non-ws + ;; found non-ws...look for ws following... + (let ((next-ws (string-index str char-set:whitespace next-non-ws))) + (if next-ws + ;; found the ws following... + (loop (cons (substring str index next-ws) ans) + next-ws) + ;; did not find ws...must be the end... + (reverse (cons (substring str index) ans)))) + ;; did not find non-ws... only ws at end of the string... + (reverse ans)))))) + +(define* (make-text-wrapper #:key + (line-width 80) + (expand-tabs? #t) + (tab-width 8) + (collapse-whitespace? #t) + (subsequent-indent "") + (initial-indent "") + (break-long-words? #t)) + "Returns a procedure that will split a string into lines according to the +given parameters. + +@table @code +@item #:line-width +This is the target length used when deciding where to wrap lines. +Default is 80. + +@item #:expand-tabs? +Boolean describing whether tabs in the input should be expanded. Default +is #t. + +@item #:tab-width +If tabs are expanded, this will be the number of spaces to which they +expand. Default is 8. + +@item #:collapse-whitespace? +Boolean describing whether the whitespace inside the existing text +should be removed or not. Default is #t. + +If text is already well-formatted, and is just being wrapped to fit in a +different width, then set this to @samp{#f}. This way, many common text +conventions (such as two spaces between sentences) can be preserved if +in the original text. If the input text spacing cannot be trusted, then +leave this setting at the default, and all repeated whitespace will be +collapsed down to a single space. + +@item #:initial-indent +Defines a string that will be put in front of the first line of wrapped +text. Default is the empty string, ``''. + +@item #:subsequent-indent +Defines a string that will be put in front of all lines of wrapped +text, except the first one. Default is the empty string, ``''. + +@item #:break-long-words? +If a single word is too big to fit on a line, this setting tells the +wrapper what to do. Defaults to #t, which will break up long words. +When set to #f, the line will be allowed, even though it is longer +than the defined @code{#:line-width}. +@end table + +The return value is a procedure of one argument, the input string, which +returns a list of strings, where each element of the list is one line." + (lambda (str) + ;; replace newlines with spaces + (set! str (transform-string str (lambda (c) (char=? c #\nl)) #\space)) + + ;; expand tabs if they wanted us to... + (if expand-tabs? + (set! str (expand-tabs str tab-width))) + + ;; collapse whitespace if they wanted us to... + (if collapse-whitespace? + (set! str (collapse-repeated-chars str))) + + ;; drop any whitespace from the front... + (set! str (string-trim str)) + + ;; now start breaking the text into lines... + (let loop ((ans '()) + (words (split-by-single-words str)) + (line initial-indent) + (count 0)) + (if (null? words) + ;; out of words? ...done! + (reverse (if (> count 0) + (cons line ans) + ans)) + + ;; not out of words...keep going... + (let ((length-left (- line-width + (string-length line))) + (next-word (if (= count 0) + (string-trim (car words)) + (car words)))) + (cond + ;; does the next entry fit? + ((<= (string-length next-word) + length-left) + (loop ans + (cdr words) + (string-append line next-word) + (+ count 1))) + + ;; ok, it didn't fit...is there already at least one word on the line? + ((> count 0) + ;; try to use it for the next line, then... + (loop (cons line ans) + words + subsequent-indent + 0)) + + ;; ok, it didn't fit...and it's the first word. + ;; were we told to break up long words? + (break-long-words? + ;; break the like at the limit, since the user wants us to... + (loop (cons (string-append line (substring next-word 0 length-left)) + ans) + (cons (substring next-word length-left) + (cdr words)) + subsequent-indent + 0)) + + ;; well, then is it the first word and we *shouldn't* break long words, then... + (else + (loop (cons (string-append line next-word) + ans) + (cdr words) + subsequent-indent + 0)))))))) + +(define (string->wrapped-lines str . kwargs) + "@code{string->wrapped-lines str keywds ...}. Wraps the text given in +string @var{str} according to the parameters provided in @var{keywds}, +or the default setting if they are not given. Returns a list of strings +representing the formatted lines. Valid keyword arguments are discussed +in @code{make-text-wrapper}." + ((apply make-text-wrapper kwargs) str)) + +(define (fill-string str . kwargs) + "Wraps the text given in string @var{str} according to the parameters +provided in @var{keywds}, or the default setting if they are not +given. Returns a single string with the wrapped text. Valid keyword +arguments are discussed in @code{make-text-wrapper}." + (string-join (apply string->wrapped-lines str kwargs) + "\n" + 'infix)) |