summaryrefslogtreecommitdiff
path: root/gnu/xml/util/XMLWriter.java
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/xml/util/XMLWriter.java')
-rw-r--r--gnu/xml/util/XMLWriter.java1927
1 files changed, 1927 insertions, 0 deletions
diff --git a/gnu/xml/util/XMLWriter.java b/gnu/xml/util/XMLWriter.java
new file mode 100644
index 000000000..feb9e4f0a
--- /dev/null
+++ b/gnu/xml/util/XMLWriter.java
@@ -0,0 +1,1927 @@
+/* XMLWriter.java --
+ Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING. If not, write to the
+Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+02111-1307 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library. Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module. An independent module is a module which is not derived from
+or based on this library. If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so. If you do not wish to do so, delete this
+exception statement from your version. */
+
+package gnu.xml.util;
+
+import java.io.BufferedWriter;
+import java.io.CharConversionException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Stack;
+
+import org.xml.sax.*;
+import org.xml.sax.ext.*;
+import org.xml.sax.helpers.*;
+
+
+/**
+ * This class is a SAX handler which writes all its input as a well formed
+ * XML or XHTML document. If driven using SAX2 events, this output may
+ * include a recreated document type declaration, subject to limitations
+ * of SAX (no internal subset exposed) or DOM (the important declarations,
+ * with their documentation, are discarded).
+ *
+ * <p> By default, text is generated "as-is", but some optional modes
+ * are supported. Pretty-printing is supported, to make life easier
+ * for people reading the output. XHTML (1.0) output has can be made
+ * particularly pretty; all the built-in character entities are known.
+ * Canonical XML can also be generated, assuming the input is properly
+ * formed.
+ *
+ * <hr>
+ *
+ * <p> Some of the methods on this class are intended for applications to
+ * use directly, rather than as pure SAX2 event callbacks. Some of those
+ * methods access the JavaBeans properties (used to tweak output formats,
+ * for example canonicalization and pretty printing). Subclasses
+ * are expected to add new behaviors, not to modify current behavior, so
+ * many such methods are final.</p>
+ *
+ * <p> The <em>write*()</em> methods may be slightly simpler for some
+ * applications to use than direct callbacks. For example, they support
+ * a simple policy for encoding data items as the content of a single element.
+ *
+ * <p> To reuse an XMLWriter you must provide it with a new Writer, since
+ * this handler closes the writer it was given as part of its endDocument()
+ * handling. (XML documents have an end of input, and the way to encode
+ * that on a stream is to close it.) </p>
+ *
+ * <hr>
+ *
+ * <p> Note that any relative URIs in the source document, as found in
+ * entity and notation declarations, ought to have been fully resolved by
+ * the parser providing events to this handler. This means that the
+ * output text should only have fully resolved URIs, which may not be
+ * the desired behavior in cases where later binding is desired. </p>
+ *
+ * <p> <em>Note that due to SAX2 defaults, you may need to manually
+ * ensure that the input events are XML-conformant with respect to namespace
+ * prefixes and declarations. {@link gnu.xml.pipeline.NSFilter} is
+ * one solution to this problem, in the context of processing pipelines.</em>
+ * Something as simple as connecting this handler to a parser might not
+ * generate the correct output. Another workaround is to ensure that the
+ * <em>namespace-prefixes</em> feature is always set to true, if you're
+ * hooking this directly up to some XMLReader implementation.
+ *
+ * @see gnu.xml.pipeline.TextConsumer
+ *
+ * @author David Brownell
+ */
+public class XMLWriter
+ implements ContentHandler, LexicalHandler, DTDHandler, DeclHandler
+{
+ // text prints/escapes differently depending on context
+ // CTX_ENTITY ... entity literal value
+ // CTX_ATTRIBUTE ... attribute literal value
+ // CTX_CONTENT ... content of an element
+ // CTX_UNPARSED ... CDATA, comment, PI, names, etc
+ // CTX_NAME ... name or nmtoken, no escapes possible
+ private static final int CTX_ENTITY = 1;
+ private static final int CTX_ATTRIBUTE = 2;
+ private static final int CTX_CONTENT = 3;
+ private static final int CTX_UNPARSED = 4;
+ private static final int CTX_NAME = 5;
+
+// FIXME: names (element, attribute, PI, notation, etc) are not
+// currently written out with range checks (escapeChars).
+// In non-XHTML, some names can't be directly written; panic!
+
+ private static String sysEOL;
+
+ static {
+ try {
+ sysEOL = System.getProperty ("line.separator", "\n");
+
+ // don't use the system's EOL if it's illegal XML.
+ if (!isLineEnd (sysEOL))
+ sysEOL = "\n";
+
+ } catch (SecurityException e) {
+ sysEOL = "\n";
+ }
+ }
+
+ private static boolean isLineEnd (String eol)
+ {
+ return "\n".equals (eol)
+ || "\r".equals (eol)
+ || "\r\n".equals (eol);
+ }
+
+ private Writer out;
+ private boolean inCDATA;
+ private int elementNestLevel;
+ private String eol = sysEOL;
+
+ private short dangerMask;
+ private StringBuffer stringBuf;
+ private Locator locator;
+ private ErrorHandler errHandler;
+
+ private boolean expandingEntities = false;
+ private int entityNestLevel;
+ private boolean xhtml;
+ private boolean startedDoctype;
+ private String encoding;
+
+ private boolean canonical;
+ private boolean inDoctype;
+ private boolean inEpilogue;
+
+ // pretty printing controls
+ private boolean prettyPrinting;
+ private int column;
+ private boolean noWrap;
+ private Stack space = new Stack ();
+
+ // this is not a hard'n'fast rule -- longer lines are OK,
+ // but are to be avoided. Here, prettyprinting is more to
+ // show structure "cleanly" than to be precise about it.
+ // better to have ragged layout than one line 24Kb long.
+ private static final int lineLength = 75;
+
+
+ /**
+ * Constructs this handler with System.out used to write SAX events
+ * using the UTF-8 encoding. Avoid using this except when you know
+ * it's safe to close System.out at the end of the document.
+ */
+ public XMLWriter () throws IOException
+ { this (System.out); }
+
+ /**
+ * Constructs a handler which writes all input to the output stream
+ * in the UTF-8 encoding, and closes it when endDocument is called.
+ * (Yes it's annoying that this throws an exception -- but there's
+ * really no way around it, since it's barely possible a JDK may
+ * exist somewhere that doesn't know how to emit UTF-8.)
+ */
+ public XMLWriter (OutputStream out) throws IOException
+ {
+ this (new OutputStreamWriter (out, "UTF8"));
+ }
+
+ /**
+ * Constructs a handler which writes all input to the writer, and then
+ * closes the writer when the document ends. If an XML declaration is
+ * written onto the output, and this class can determine the name of
+ * the character encoding for this writer, that encoding name will be
+ * included in the XML declaration.
+ *
+ * <P> See the description of the constructor which takes an encoding
+ * name for imporant information about selection of encodings.
+ *
+ * @param writer XML text is written to this writer.
+ */
+ public XMLWriter (Writer writer)
+ {
+ this (writer, null);
+ }
+
+ /**
+ * Constructs a handler which writes all input to the writer, and then
+ * closes the writer when the document ends. If an XML declaration is
+ * written onto the output, this class will use the specified encoding
+ * name in that declaration. If no encoding name is specified, no
+ * encoding name will be declared unless this class can otherwise
+ * determine the name of the character encoding for this writer.
+ *
+ * <P> At this time, only the UTF-8 ("UTF8") and UTF-16 ("Unicode")
+ * output encodings are fully lossless with respect to XML data. If you
+ * use any other encoding you risk having your data be silently mangled
+ * on output, as the standard Java character encoding subsystem silently
+ * maps non-encodable characters to a question mark ("?") and will not
+ * report such errors to applications.
+ *
+ * <p> For a few other encodings the risk can be reduced. If the writer is
+ * a java.io.OutputStreamWriter, and uses either the ISO-8859-1 ("8859_1",
+ * "ISO8859_1", etc) or US-ASCII ("ASCII") encodings, content which
+ * can't be encoded in those encodings will be written safely. Where
+ * relevant, the XHTML entity names will be used; otherwise, numeric
+ * character references will be emitted.
+ *
+ * <P> However, there remain a number of cases where substituting such
+ * entity or character references is not an option. Such references are
+ * not usable within a DTD, comment, PI, or CDATA section. Neither may
+ * they be used when element, attribute, entity, or notation names have
+ * the problematic characters.
+ *
+ * @param writer XML text is written to this writer.
+ * @param encoding if non-null, and an XML declaration is written,
+ * this is the name that will be used for the character encoding.
+ */
+ public XMLWriter (Writer writer, String encoding)
+ {
+ setWriter (writer, encoding);
+ }
+
+ private void setEncoding (String encoding)
+ {
+ if (encoding == null && out instanceof OutputStreamWriter)
+ encoding = ((OutputStreamWriter)out).getEncoding ();
+
+ if (encoding != null) {
+ encoding = encoding.toUpperCase ();
+
+ // Use official encoding names where we know them,
+ // avoiding the Java-only names. When using common
+ // encodings where we can easily tell if characters
+ // are out of range, we'll escape out-of-range
+ // characters using character refs for safety.
+
+ // I _think_ these are all the main synonyms for these!
+ if ("UTF8".equals (encoding)) {
+ encoding = "UTF-8";
+ } else if ("US-ASCII".equals (encoding)
+ || "ASCII".equals (encoding)) {
+ dangerMask = (short) 0xff80;
+ encoding = "US-ASCII";
+ } else if ("ISO-8859-1".equals (encoding)
+ || "8859_1".equals (encoding)
+ || "ISO8859_1".equals (encoding)) {
+ dangerMask = (short) 0xff00;
+ encoding = "ISO-8859-1";
+ } else if ("UNICODE".equals (encoding)
+ || "UNICODE-BIG".equals (encoding)
+ || "UNICODE-LITTLE".equals (encoding)) {
+ encoding = "UTF-16";
+
+ // TODO: UTF-16BE, UTF-16LE ... no BOM; what
+ // release of JDK supports those Unicode names?
+ }
+
+ if (dangerMask != 0)
+ stringBuf = new StringBuffer ();
+ }
+
+ this.encoding = encoding;
+ }
+
+
+ /**
+ * Resets the handler to write a new text document.
+ *
+ * @param writer XML text is written to this writer.
+ * @param encoding if non-null, and an XML declaration is written,
+ * this is the name that will be used for the character encoding.
+ *
+ * @exception IllegalStateException if the current
+ * document hasn't yet ended (with {@link #endDocument})
+ */
+ final public void setWriter (Writer writer, String encoding)
+ {
+ if (out != null)
+ throw new IllegalStateException (
+ "can't change stream in mid course");
+ out = writer;
+ if (out != null)
+ setEncoding (encoding);
+ if (!(out instanceof BufferedWriter))
+ out = new BufferedWriter (out);
+ space.push ("default");
+ }
+
+ /**
+ * Assigns the line ending style to be used on output.
+ * @param eolString null to use the system default; else
+ * "\n", "\r", or "\r\n".
+ */
+ final public void setEOL (String eolString)
+ {
+ if (eolString == null)
+ eol = sysEOL;
+ else if (!isLineEnd (eolString))
+ eol = eolString;
+ else
+ throw new IllegalArgumentException (eolString);
+ }
+
+ /**
+ * Assigns the error handler to be used to present most fatal
+ * errors.
+ */
+ public void setErrorHandler (ErrorHandler handler)
+ {
+ errHandler = handler;
+ }
+
+ /**
+ * Used internally and by subclasses, this encapsulates the logic
+ * involved in reporting fatal errors. It uses locator information
+ * for good diagnostics, if available, and gives the application's
+ * ErrorHandler the opportunity to handle the error before throwing
+ * an exception.
+ */
+ protected void fatal (String message, Exception e)
+ throws SAXException
+ {
+ SAXParseException x;
+
+ if (locator == null)
+ x = new SAXParseException (message, null, null, -1, -1, e);
+ else
+ x = new SAXParseException (message, locator, e);
+ if (errHandler != null)
+ errHandler.fatalError (x);
+ throw x;
+ }
+
+
+ // JavaBeans properties
+
+ /**
+ * Controls whether the output should attempt to follow the "transitional"
+ * XHTML rules so that it meets the "HTML Compatibility Guidelines"
+ * appendix in the XHTML specification. A "transitional" Document Type
+ * Declaration (DTD) is placed near the beginning of the output document,
+ * instead of whatever DTD would otherwise have been placed there, and
+ * XHTML empty elements are printed specially. When writing text in
+ * US-ASCII or ISO-8859-1 encodings, the predefined XHTML internal
+ * entity names are used (in preference to character references) when
+ * writing content characters which can't be expressed in those encodings.
+ *
+ * <p> When this option is enabled, it is the caller's responsibility
+ * to ensure that the input is otherwise valid as XHTML. Things to
+ * be careful of in all cases, as described in the appendix referenced
+ * above, include: <ul>
+ *
+ * <li> Element and attribute names must be in lower case, both
+ * in the document and in any CSS style sheet.
+ * <li> All XML constructs must be valid as defined by the XHTML
+ * "transitional" DTD (including all familiar constructs,
+ * even deprecated ones).
+ * <li> The root element must be "html".
+ * <li> Elements that must be empty (such as <em>&lt;br&gt;</em>
+ * must have no content.
+ * <li> Use both <em>lang</em> and <em>xml:lang</em> attributes
+ * when specifying language.
+ * <li> Similarly, use both <em>id</em> and <em>name</em> attributes
+ * when defining elements that may be referred to through
+ * URI fragment identifiers ... and make sure that the
+ * value is a legal NMTOKEN, since not all such HTML 4.0
+ * identifiers are valid in XML.
+ * <li> Be careful with character encodings; make sure you provide
+ * a <em>&lt;meta http-equiv="Content-type"
+ * content="text/xml;charset=..." /&gt;</em> element in
+ * the HTML "head" element, naming the same encoding
+ * used to create this handler. Also, if that encoding
+ * is anything other than US-ASCII, make sure that if
+ * the document is given a MIME content type, it has
+ * a <em>charset=...</em> attribute with that encoding.
+ * </ul>
+ *
+ * <p> Additionally, some of the oldest browsers have additional
+ * quirks, to address with guidelines such as: <ul>
+ *
+ * <li> Processing instructions may be rendered, so avoid them.
+ * (Similarly for an XML declaration.)
+ * <li> Embedded style sheets and scripts should not contain XML
+ * markup delimiters: &amp;, &lt;, and ]]&gt; are trouble.
+ * <li> Attribute values should not have line breaks or multiple
+ * consecutive white space characters.
+ * <li> Use no more than one of the deprecated (transitional)
+ * <em>&lt;isindex&gt;</em> elements.
+ * <li> Some boolean attributes (such as <em>compact, checked,
+ * disabled, readonly, selected,</em> and more) confuse
+ * some browsers, since they only understand minimized
+ * versions which are illegal in XML.
+ * </ul>
+ *
+ * <p> Also, some characteristics of the resulting output may be
+ * a function of whether the document is later given a MIME
+ * content type of <em>text/html</em> rather than one indicating
+ * XML (<em>application/xml</em> or <em>text/xml</em>). Worse,
+ * some browsers ignore MIME content types and prefer to rely URI
+ * name suffixes -- so an "index.xml" could always be XML, never
+ * XHTML, no matter its MIME type.
+ */
+ final public void setXhtml (boolean value)
+ {
+ if (locator != null)
+ throw new IllegalStateException ("started parsing");
+ xhtml = value;
+ if (xhtml)
+ canonical = false;
+ }
+
+ /**
+ * Returns true if the output attempts to echo the input following
+ * "transitional" XHTML rules and matching the "HTML Compatibility
+ * Guidelines" so that an HTML version 3 browser can read the output
+ * as HTML; returns false (the default) othewise.
+ */
+ final public boolean isXhtml ()
+ {
+ return xhtml;
+ }
+
+ /**
+ * Controls whether the output text contains references to
+ * entities (the default), or instead contains the expanded
+ * values of those entities.
+ */
+ final public void setExpandingEntities (boolean value)
+ {
+ if (locator != null)
+ throw new IllegalStateException ("started parsing");
+ expandingEntities = value;
+ if (!expandingEntities)
+ canonical = false;
+ }
+
+ /**
+ * Returns true if the output will have no entity references;
+ * returns false (the default) otherwise.
+ */
+ final public boolean isExpandingEntities ()
+ {
+ return expandingEntities;
+ }
+
+ /**
+ * Controls pretty-printing, which by default is not enabled
+ * (and currently is most useful for XHTML output).
+ * Pretty printing enables structural indentation, sorting of attributes
+ * by name, line wrapping, and potentially other mechanisms for making
+ * output more or less readable.
+ *
+ * <p> At this writing, structural indentation and line wrapping are
+ * enabled when pretty printing is enabled and the <em>xml:space</em>
+ * attribute has the value <em>default</em> (its other legal value is
+ * <em>preserve</em>, as defined in the XML specification). The three
+ * XHTML element types which use another value are recognized by their
+ * names (namespaces are ignored).
+ *
+ * <p> Also, for the record, the "pretty" aspect of printing here
+ * is more to provide basic structure on outputs that would otherwise
+ * risk being a single long line of text. For now, expect the
+ * structure to be ragged ... unless you'd like to submit a patch
+ * to make this be more strictly formatted!
+ *
+ * @exception IllegalStateException thrown if this method is invoked
+ * after output has begun.
+ */
+ final public void setPrettyPrinting (boolean value)
+ {
+ if (locator != null)
+ throw new IllegalStateException ("started parsing");
+ prettyPrinting = value;
+ if (prettyPrinting)
+ canonical = false;
+ }
+
+ /**
+ * Returns value of flag controlling pretty printing.
+ */
+ final public boolean isPrettyPrinting ()
+ {
+ return prettyPrinting;
+ }
+
+
+ /**
+ * Sets the output style to be canonicalized. Input events must
+ * meet requirements that are slightly more stringent than the
+ * basic well-formedness ones, and include: <ul>
+ *
+ * <li> Namespace prefixes must not have been changed from those
+ * in the original document. (This may only be ensured by setting
+ * the SAX2 XMLReader <em>namespace-prefixes</em> feature flag;
+ * by default, it is cleared.)
+ *
+ * <li> Redundant namespace declaration attributes have been
+ * removed. (If an ancestor element defines a namespace prefix
+ * and that declaration hasn't been overriden, an element must
+ * not redeclare it.)
+ *
+ * <li> If comments are not to be included in the canonical output,
+ * they must first be removed from the input event stream; this
+ * <em>Canonical XML with comments</em> by default.
+ *
+ * <li> If the input character encoding was not UCS-based, the
+ * character data must have been normalized using Unicode
+ * Normalization Form C. (UTF-8 and UTF-16 are UCS-based.)
+ *
+ * <li> Attribute values must have been normalized, as is done
+ * by any conformant XML processor which processes all external
+ * parameter entities.
+ *
+ * <li> Similarly, attribute value defaulting has been performed.
+ *
+ * </ul>
+ *
+ * <p> Note that fragments of XML documents, as specified by an XPath
+ * node set, may be canonicalized. In such cases, elements may need
+ * some fixup (for <em>xml:*</em> attributes and application-specific
+ * context).
+ *
+ * @exception IllegalArgumentException if the output encoding
+ * is anything other than UTF-8.
+ */
+ final public void setCanonical (boolean value)
+ {
+ if (value && !"UTF-8".equals (encoding))
+ throw new IllegalArgumentException ("encoding != UTF-8");
+ canonical = value;
+ if (canonical) {
+ prettyPrinting = xhtml = false;
+ expandingEntities = true;
+ eol = "\n";
+ }
+ }
+
+
+ /**
+ * Returns value of flag controlling canonical output.
+ */
+ final public boolean isCanonical ()
+ {
+ return canonical;
+ }
+
+
+ /**
+ * Flushes the output stream. When this handler is used in long lived
+ * pipelines, it can be important to flush buffered state, for example
+ * so that it can reach the disk as part of a state checkpoint.
+ */
+ final public void flush ()
+ throws IOException
+ {
+ if (out != null)
+ out.flush ();
+ }
+
+
+ // convenience routines
+
+// FIXME: probably want a subclass that holds a lot of these...
+// and maybe more!
+
+ /**
+ * Writes the string as if characters() had been called on the contents
+ * of the string. This is particularly useful when applications act as
+ * producers and write data directly to event consumers.
+ */
+ final public void write (String data)
+ throws SAXException
+ {
+ char buf [] = data.toCharArray ();
+ characters (buf, 0, buf.length);
+ }
+
+
+ /**
+ * Writes an element that has content consisting of a single string.
+ * @see #writeEmptyElement
+ * @see #startElement
+ */
+ public void writeElement (
+ String uri,
+ String localName,
+ String qName,
+ Attributes atts,
+ String content
+ ) throws SAXException
+ {
+ if (content == null || content.length () == 0) {
+ writeEmptyElement (uri, localName, qName, atts);
+ return;
+ }
+ startElement (uri, localName, qName, atts);
+ char chars [] = content.toCharArray ();
+ characters (chars, 0, chars.length);
+ endElement (uri, localName, qName);
+ }
+
+
+ /**
+ * Writes an element that has content consisting of a single integer,
+ * encoded as a decimal string.
+ * @see #writeEmptyElement
+ * @see #startElement
+ */
+ public void writeElement (
+ String uri,
+ String localName,
+ String qName,
+ Attributes atts,
+ int content
+ ) throws SAXException
+ {
+ writeElement (uri, localName, qName, atts, Integer.toString (content));
+ }
+
+
+ // SAX1 ContentHandler
+ /** <b>SAX1</b>: provides parser status information */
+ final public void setDocumentLocator (Locator l)
+ {
+ locator = l;
+ }
+
+
+ // URL for dtd that validates against all normal HTML constructs
+ private static final String xhtmlFullDTD =
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
+
+
+ /**
+ * <b>SAX1</b>: indicates the beginning of a document parse.
+ * If you're writing (well formed) fragments of XML, neither
+ * this nor endDocument should be called.
+ */
+ // NOT final
+ public void startDocument ()
+ throws SAXException
+ {
+ try {
+ if (out == null)
+ throw new IllegalStateException (
+ "null Writer given to XMLWriter");
+
+ // Not all parsers provide the locator we want; this also
+ // flags whether events are being sent to this object yet.
+ // We could only have this one call if we only printed whole
+ // documents ... but we also print fragments, so most of the
+ // callbacks here replicate this test.
+
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ // Unless the data is in US-ASCII or we're canonicalizing, write
+ // the XML declaration if we know the encoding. US-ASCII won't
+ // normally get mangled by web server confusion about the
+ // character encodings used. Plus, it's an easy way to
+ // ensure we can write ASCII that's unlikely to confuse
+ // elderly HTML parsers.
+
+ if (!canonical
+ && dangerMask != (short) 0xff80
+ && encoding != null) {
+ rawWrite ("<?xml version='1.0'");
+ rawWrite (" encoding='" + encoding + "'");
+ rawWrite ("?>");
+ newline ();
+ }
+
+ if (xhtml) {
+
+ rawWrite ("<!DOCTYPE html PUBLIC");
+ newline ();
+ rawWrite (" '-//W3C//DTD XHTML 1.0 Transitional//EN'");
+ newline ();
+ rawWrite (" '");
+ // NOTE: URL (above) matches the REC
+ rawWrite (xhtmlFullDTD);
+ rawWrite ("'>");
+ newline ();
+ newline ();
+
+ // fake the rest of the handler into ignoring
+ // everything until the root element, so any
+ // XHTML DTD comments, PIs, etc are ignored
+ startedDoctype = true;
+ }
+
+ entityNestLevel = 0;
+
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /**
+ * <b>SAX1</b>: indicates the completion of a parse.
+ * Note that all complete SAX event streams make this call, even
+ * if an error is reported during a parse.
+ */
+ // NOT final
+ public void endDocument ()
+ throws SAXException
+ {
+ try {
+ if (!canonical) {
+ newline ();
+ newline ();
+ }
+ out.close ();
+ out = null;
+ locator = null;
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ // XHTML elements declared as EMPTY print differently
+ final private static boolean isEmptyElementTag (String tag)
+ {
+ switch (tag.charAt (0)) {
+ case 'a': return "area".equals (tag);
+ case 'b': return "base".equals (tag)
+ || "basefont".equals (tag)
+ || "br".equals (tag);
+ case 'c': return "col".equals (tag);
+ case 'f': return "frame".equals (tag);
+ case 'h': return "hr".equals (tag);
+ case 'i': return "img".equals (tag)
+ || "input".equals (tag)
+ || "isindex".equals (tag);
+ case 'l': return "link".equals (tag);
+ case 'm': return "meta".equals (tag);
+ case 'p': return "param".equals (tag);
+ }
+ return false;
+ }
+
+ private static boolean indentBefore (String tag)
+ {
+ // basically indent before block content
+ // and within structure like tables, lists
+ switch (tag.charAt (0)) {
+ case 'a': return "applet".equals (tag);
+ case 'b': return "body".equals (tag)
+ || "blockquote".equals (tag);
+ case 'c': return "center".equals (tag);
+ case 'f': return "frame".equals (tag)
+ || "frameset".equals (tag);
+ case 'h': return "head".equals (tag);
+ case 'm': return "meta".equals (tag);
+ case 'o': return "object".equals (tag);
+ case 'p': return "param".equals (tag)
+ || "pre".equals (tag);
+ case 's': return "style".equals (tag);
+ case 't': return "title".equals (tag)
+ || "td".equals (tag)
+ || "th".equals (tag);
+ }
+ // ... but not inline elements like "em", "b", "font"
+ return false;
+ }
+
+ private static boolean spaceBefore (String tag)
+ {
+ // blank line AND INDENT before certain structural content
+ switch (tag.charAt (0)) {
+ case 'h': return "h1".equals (tag)
+ || "h2".equals (tag)
+ || "h3".equals (tag)
+ || "h4".equals (tag)
+ || "h5".equals (tag)
+ || "h6".equals (tag)
+ || "hr".equals (tag);
+ case 'l': return "li".equals (tag);
+ case 'o': return "ol".equals (tag);
+ case 'p': return "p".equals (tag);
+ case 't': return "table".equals (tag)
+ || "tr".equals (tag);
+ case 'u': return "ul".equals (tag);
+ }
+ return false;
+ }
+
+ // XHTML DTDs say these three have xml:space="preserve"
+ private static boolean spacePreserve (String tag)
+ {
+ return "pre".equals (tag)
+ || "style".equals (tag)
+ || "script".equals (tag);
+ }
+
+ /**
+ * <b>SAX2</b>: ignored.
+ */
+ final public void startPrefixMapping (String prefix, String uri)
+ {}
+
+ /**
+ * <b>SAX2</b>: ignored.
+ */
+ final public void endPrefixMapping (String prefix)
+ {}
+
+ private void writeStartTag (
+ String name,
+ Attributes atts,
+ boolean isEmpty
+ ) throws SAXException, IOException
+ {
+ rawWrite ('<');
+ rawWrite (name);
+
+ // write out attributes ... sorting is particularly useful
+ // with output that's been heavily defaulted.
+ if (atts != null && atts.getLength () != 0) {
+
+ // Set up to write, with optional sorting
+ int indices [] = new int [atts.getLength ()];
+
+ for (int i= 0; i < indices.length; i++)
+ indices [i] = i;
+
+ // optionally sort
+
+// FIXME: canon xml demands xmlns nodes go first,
+// and sorting by URI first (empty first) then localname
+// it should maybe use a different sort
+
+ if (canonical || prettyPrinting) {
+
+ // insertion sort by attribute name
+ for (int i = 1; i < indices.length; i++) {
+ int n = indices [i], j;
+ String s = atts.getQName (n);
+
+ for (j = i - 1; j >= 0; j--) {
+ if (s.compareTo (atts.getQName (indices [j]))
+ >= 0)
+ break;
+ indices [j + 1] = indices [j];
+ }
+ indices [j + 1] = n;
+ }
+ }
+
+ // write, sorted or no
+ for (int i= 0; i < indices.length; i++) {
+ String s = atts.getQName (indices [i]);
+
+ if (s == null || "".equals (s))
+ throw new IllegalArgumentException ("no XML name");
+ rawWrite (" ");
+ rawWrite (s);
+ rawWrite ("=");
+ writeQuotedValue (atts.getValue (indices [i]),
+ CTX_ATTRIBUTE);
+ }
+ }
+ if (isEmpty)
+ rawWrite (" /");
+ rawWrite ('>');
+ }
+
+ /**
+ * <b>SAX2</b>: indicates the start of an element.
+ * When XHTML is in use, avoid attribute values with
+ * line breaks or multiple whitespace characters, since
+ * not all user agents handle them correctly.
+ */
+ final public void startElement (
+ String uri,
+ String localName,
+ String qName,
+ Attributes atts
+ ) throws SAXException
+ {
+ startedDoctype = false;
+
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ if (qName == null || "".equals (qName))
+ throw new IllegalArgumentException ("no XML name");
+
+ try {
+ if (entityNestLevel != 0)
+ return;
+ if (prettyPrinting) {
+ String whitespace = null;
+
+ if (xhtml && spacePreserve (qName))
+ whitespace = "preserve";
+ else if (atts != null)
+ whitespace = atts.getValue ("xml:space");
+ if (whitespace == null)
+ whitespace = (String) space.peek ();
+ space.push (whitespace);
+
+ if ("default".equals (whitespace)) {
+ if (xhtml) {
+ if (spaceBefore (qName)) {
+ newline ();
+ doIndent ();
+ } else if (indentBefore (qName))
+ doIndent ();
+ // else it's inlined, modulo line length
+ // FIXME: incrementing element nest level
+ // for inlined elements causes ugliness
+ } else
+ doIndent ();
+ }
+ }
+ elementNestLevel++;
+ writeStartTag (qName, atts, xhtml && isEmptyElementTag (qName));
+
+ if (xhtml) {
+// FIXME: if this is an XHTML "pre" element, turn
+// off automatic wrapping.
+ }
+
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /**
+ * Writes an empty element.
+ * @see #startElement
+ */
+ public void writeEmptyElement (
+ String uri,
+ String localName,
+ String qName,
+ Attributes atts
+ ) throws SAXException
+ {
+ if (canonical) {
+ startElement (uri, localName, qName, atts);
+ endElement (uri, localName, qName);
+ } else {
+ try {
+ writeStartTag (qName, atts, true);
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+ }
+
+
+ /** <b>SAX2</b>: indicates the end of an element */
+ final public void endElement (String uri, String localName, String qName)
+ throws SAXException
+ {
+ if (qName == null || "".equals (qName))
+ throw new IllegalArgumentException ("no XML name");
+
+ try {
+ elementNestLevel--;
+ if (entityNestLevel != 0)
+ return;
+ if (xhtml && isEmptyElementTag (qName))
+ return;
+ rawWrite ("</");
+ rawWrite (qName);
+ rawWrite ('>');
+
+ if (prettyPrinting) {
+ if (!space.empty ())
+ space.pop ();
+ else
+ fatal ("stack discipline", null);
+ }
+ if (elementNestLevel == 0)
+ inEpilogue = true;
+
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX1</b>: reports content characters */
+ final public void characters (char ch [], int start, int length)
+ throws SAXException
+ {
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ try {
+ if (entityNestLevel != 0)
+ return;
+ if (inCDATA) {
+ escapeChars (ch, start, length, CTX_UNPARSED);
+ } else {
+ escapeChars (ch, start, length, CTX_CONTENT);
+ }
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX1</b>: reports ignorable whitespace */
+ final public void ignorableWhitespace (char ch [], int start, int length)
+ throws SAXException
+ {
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ try {
+ if (entityNestLevel != 0)
+ return;
+ // don't forget to map NL to CRLF, CR, etc
+ escapeChars (ch, start, length, CTX_CONTENT);
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /**
+ * <b>SAX1</b>: reports a PI.
+ * This doesn't check for illegal target names, such as "xml" or "XML",
+ * or namespace-incompatible ones like "big:dog"; the caller is
+ * responsible for ensuring those names are legal.
+ */
+ final public void processingInstruction (String target, String data)
+ throws SAXException
+ {
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ // don't print internal subset for XHTML
+ if (xhtml && startedDoctype)
+ return;
+
+ // ancient HTML browsers might render these ... their loss.
+ // to prevent: "if (xhtml) return;".
+
+ try {
+ if (entityNestLevel != 0)
+ return;
+ if (canonical && inEpilogue)
+ newline ();
+ rawWrite ("<?");
+ rawWrite (target);
+ rawWrite (' ');
+ escapeChars (data.toCharArray (), -1, -1, CTX_UNPARSED);
+ rawWrite ("?>");
+ if (elementNestLevel == 0 && !(canonical && inEpilogue))
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX1</b>: indicates a non-expanded entity reference */
+ public void skippedEntity (String name)
+ throws SAXException
+ {
+ try {
+ rawWrite ("&");
+ rawWrite (name);
+ rawWrite (";");
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ // SAX2 LexicalHandler
+
+ /** <b>SAX2</b>: called before parsing CDATA characters */
+ final public void startCDATA ()
+ throws SAXException
+ {
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ if (canonical)
+ return;
+
+ try {
+ inCDATA = true;
+ if (entityNestLevel == 0)
+ rawWrite ("<![CDATA[");
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX2</b>: called after parsing CDATA characters */
+ final public void endCDATA ()
+ throws SAXException
+ {
+ if (canonical)
+ return;
+
+ try {
+ inCDATA = false;
+ if (entityNestLevel == 0)
+ rawWrite ("]]>");
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /**
+ * <b>SAX2</b>: called when the doctype is partially parsed
+ * Note that this, like other doctype related calls, is ignored
+ * when XHTML is in use.
+ */
+ final public void startDTD (String name, String publicId, String systemId)
+ throws SAXException
+ {
+ if (locator == null)
+ locator = new LocatorImpl ();
+ if (xhtml)
+ return;
+ try {
+ inDoctype = startedDoctype = true;
+ if (canonical)
+ return;
+ rawWrite ("<!DOCTYPE ");
+ rawWrite (name);
+ rawWrite (' ');
+
+ if (!expandingEntities) {
+ if (publicId != null)
+ rawWrite ("PUBLIC '" + publicId + "' '" + systemId + "' ");
+ else if (systemId != null)
+ rawWrite ("SYSTEM '" + systemId + "' ");
+ }
+
+ rawWrite ('[');
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX2</b>: called after the doctype is parsed */
+ final public void endDTD ()
+ throws SAXException
+ {
+ inDoctype = false;
+ if (canonical || xhtml)
+ return;
+ try {
+ rawWrite ("]>");
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /**
+ * <b>SAX2</b>: called before parsing a general entity in content
+ */
+ final public void startEntity (String name)
+ throws SAXException
+ {
+ try {
+ boolean writeEOL = true;
+
+ // Predefined XHTML entities (for characters) will get
+ // mapped back later.
+ if (xhtml || expandingEntities)
+ return;
+
+ entityNestLevel++;
+ if (name.equals ("[dtd]"))
+ return;
+ if (entityNestLevel != 1)
+ return;
+ if (!name.startsWith ("%")) {
+ writeEOL = false;
+ rawWrite ('&');
+ }
+ rawWrite (name);
+ rawWrite (';');
+ if (writeEOL)
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /**
+ * <b>SAX2</b>: called after parsing a general entity in content
+ */
+ final public void endEntity (String name)
+ throws SAXException
+ {
+ if (xhtml || expandingEntities)
+ return;
+ entityNestLevel--;
+ }
+
+ /**
+ * <b>SAX2</b>: called when comments are parsed.
+ * When XHTML is used, the old HTML tradition of using comments
+ * to for inline CSS, or for JavaScript code is discouraged.
+ * This is because XML processors are encouraged to discard, on
+ * the grounds that comments are for users (and perhaps text
+ * editors) not programs. Instead, use external scripts
+ */
+ final public void comment (char ch [], int start, int length)
+ throws SAXException
+ {
+ if (locator == null)
+ locator = new LocatorImpl ();
+
+ // don't print internal subset for XHTML
+ if (xhtml && startedDoctype)
+ return;
+ // don't print comment in doctype for canon xml
+ if (canonical && inDoctype)
+ return;
+
+ try {
+ boolean indent;
+
+ if (prettyPrinting && space.empty ())
+ fatal ("stack discipline", null);
+ indent = prettyPrinting && "default".equals (space.peek ());
+ if (entityNestLevel != 0)
+ return;
+ if (indent)
+ doIndent ();
+ if (canonical && inEpilogue)
+ newline ();
+ rawWrite ("<!--");
+ escapeChars (ch, start, length, CTX_UNPARSED);
+ rawWrite ("-->");
+ if (indent)
+ doIndent ();
+ if (elementNestLevel == 0 && !(canonical && inEpilogue))
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ // SAX1 DTDHandler
+
+ /** <b>SAX1</b>: called on notation declarations */
+ final public void notationDecl (String name,
+ String publicId, String systemId)
+ throws SAXException
+ {
+ if (xhtml)
+ return;
+ try {
+ // At this time, only SAX2 callbacks start these.
+ if (!startedDoctype)
+ return;
+
+ if (entityNestLevel != 0)
+ return;
+ rawWrite ("<!NOTATION " + name + " ");
+ if (publicId != null)
+ rawWrite ("PUBLIC \"" + publicId + '"');
+ else
+ rawWrite ("SYSTEM ");
+ if (systemId != null)
+ rawWrite ('"' + systemId + '"');
+ rawWrite (">");
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX1</b>: called on unparsed entity declarations */
+ final public void unparsedEntityDecl (String name,
+ String publicId, String systemId,
+ String notationName)
+ throws SAXException
+ {
+ if (xhtml)
+ return;
+ try {
+ // At this time, only SAX2 callbacks start these.
+ if (!startedDoctype) {
+ // FIXME: write to temporary buffer, and make the start
+ // of the root element write these declarations.
+ return;
+ }
+
+ if (entityNestLevel != 0)
+ return;
+ rawWrite ("<!ENTITY " + name + " ");
+ if (publicId != null)
+ rawWrite ("PUBLIC \"" + publicId + '"');
+ else
+ rawWrite ("SYSTEM ");
+ rawWrite ('"' + systemId + '"');
+ rawWrite (" NDATA " + notationName + ">");
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ // SAX2 DeclHandler
+
+ /** <b>SAX2</b>: called on attribute declarations */
+ final public void attributeDecl (String eName, String aName,
+ String type, String mode, String value)
+ throws SAXException
+ {
+ if (xhtml)
+ return;
+ try {
+ // At this time, only SAX2 callbacks start these.
+ if (!startedDoctype)
+ return;
+ if (entityNestLevel != 0)
+ return;
+ rawWrite ("<!ATTLIST " + eName + ' ' + aName + ' ');
+ rawWrite (type);
+ rawWrite (' ');
+ if (mode != null)
+ rawWrite (mode + ' ');
+ if (value != null)
+ writeQuotedValue (value, CTX_ATTRIBUTE);
+ rawWrite ('>');
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX2</b>: called on element declarations */
+ final public void elementDecl (String name, String model)
+ throws SAXException
+ {
+ if (xhtml)
+ return;
+ try {
+ // At this time, only SAX2 callbacks start these.
+ if (!startedDoctype)
+ return;
+ if (entityNestLevel != 0)
+ return;
+ rawWrite ("<!ELEMENT " + name + ' ' + model + '>');
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX2</b>: called on external entity declarations */
+ final public void externalEntityDecl (
+ String name,
+ String publicId,
+ String systemId)
+ throws SAXException
+ {
+ if (xhtml)
+ return;
+ try {
+ // At this time, only SAX2 callbacks start these.
+ if (!startedDoctype)
+ return;
+ if (entityNestLevel != 0)
+ return;
+ rawWrite ("<!ENTITY ");
+ if (name.startsWith ("%")) {
+ rawWrite ("% ");
+ rawWrite (name.substring (1));
+ } else
+ rawWrite (name);
+ if (publicId != null)
+ rawWrite (" PUBLIC \"" + publicId + '"');
+ else
+ rawWrite (" SYSTEM ");
+ rawWrite ('"' + systemId + "\">");
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ /** <b>SAX2</b>: called on internal entity declarations */
+ final public void internalEntityDecl (String name, String value)
+ throws SAXException
+ {
+ if (xhtml)
+ return;
+ try {
+ // At this time, only SAX2 callbacks start these.
+ if (!startedDoctype)
+ return;
+ if (entityNestLevel != 0)
+ return;
+ rawWrite ("<!ENTITY ");
+ if (name.startsWith ("%")) {
+ rawWrite ("% ");
+ rawWrite (name.substring (1));
+ } else
+ rawWrite (name);
+ rawWrite (' ');
+ writeQuotedValue (value, CTX_ENTITY);
+ rawWrite ('>');
+ newline ();
+ } catch (IOException e) {
+ fatal ("can't write", e);
+ }
+ }
+
+ private void writeQuotedValue (String value, int code)
+ throws SAXException, IOException
+ {
+ char buf [] = value.toCharArray ();
+ int off = 0, len = buf.length;
+
+ // we can't add line breaks to attribute/entity/... values
+ noWrap = true;
+ rawWrite ('"');
+ escapeChars (buf, off, len, code);
+ rawWrite ('"');
+ noWrap = false;
+ }
+
+ // From "HTMLlat1x.ent" ... names of entities for ISO-8859-1
+ // (Latin/1) characters, all codes: 160-255 (0xA0-0xFF).
+ // Codes 128-159 have no assigned values.
+ private static final String HTMLlat1x [] = {
+ // 160
+ "nbsp", "iexcl", "cent", "pound", "curren",
+ "yen", "brvbar", "sect", "uml", "copy",
+
+ // 170
+ "ordf", "laquo", "not", "shy", "reg",
+ "macr", "deg", "plusmn", "sup2", "sup3",
+
+ // 180
+ "acute", "micro", "para", "middot", "cedil",
+ "sup1", "ordm", "raquo", "frac14", "frac12",
+
+ // 190
+ "frac34", "iquest", "Agrave", "Aacute", "Acirc",
+ "Atilde", "Auml", "Aring", "AElig", "Ccedil",
+
+ // 200
+ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave",
+ "Iacute", "Icirc", "Iuml", "ETH", "Ntilde",
+
+ // 210
+ "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml",
+ "times", "Oslash", "Ugrave", "Uacute", "Ucirc",
+
+ // 220
+ "Uuml", "Yacute", "THORN", "szlig", "agrave",
+ "aacute", "acirc", "atilde", "auml", "aring",
+
+ // 230
+ "aelig", "ccedil", "egrave", "eacute", "ecirc",
+ "euml", "igrave", "iacute", "icirc", "iuml",
+
+ // 240
+ "eth", "ntilde", "ograve", "oacute", "ocirc",
+ "otilde", "ouml", "divide", "oslash", "ugrave",
+
+ // 250
+ "uacute", "ucirc", "uuml", "yacute", "thorn",
+ "yuml"
+ };
+
+ // From "HTMLsymbolx.ent" ... some of the symbols that
+ // we can conveniently handle. Entities for the Greek.
+ // alphabet (upper and lower cases) are compact.
+ private static final String HTMLsymbolx_GR [] = {
+ // 913
+ "Alpha", "Beta", "Gamma", "Delta", "Epsilon",
+ "Zeta", "Eta", "Theta", "Iota", "Kappa",
+
+ // 923
+ "Lambda", "Mu", "Nu", "Xi", "Omicron",
+ "Pi", "Rho", null, "Sigma", "Tau",
+
+ // 933
+ "Upsilon", "Phi", "Chi", "Psi", "Omega"
+ };
+
+ private static final String HTMLsymbolx_gr [] = {
+ // 945
+ "alpha", "beta", "gamma", "delta", "epsilon",
+ "zeta", "eta", "theta", "iota", "kappa",
+
+ // 955
+ "lambda", "mu", "nu", "xi", "omicron",
+ "pi", "rho", "sigmaf", "sigma", "tau",
+
+ // 965
+ "upsilon", "phi", "chi", "psi", "omega"
+ };
+
+
+ // General routine to write text and substitute predefined
+ // entities (XML, and a special case for XHTML) as needed.
+ private void escapeChars (char buf [], int off, int len, int code)
+ throws SAXException, IOException
+ {
+ int first = 0;
+
+ if (off < 0) {
+ off = 0;
+ len = buf.length;
+ }
+ for (int i = 0; i < len; i++) {
+ String esc;
+ char c = buf [off + i];
+
+ switch (c) {
+ // Note that CTX_ATTRIBUTE isn't explicitly tested here;
+ // all syntax delimiters are escaped in CTX_ATTRIBUTE,
+ // otherwise it's similar to CTX_CONTENT
+
+ // ampersand flags entity references; entity replacement
+ // text has unexpanded references, other text doesn't.
+ case '&':
+ if (code == CTX_ENTITY || code == CTX_UNPARSED)
+ continue;
+ esc = "amp";
+ break;
+
+ // attributes and text may NOT have literal '<', but
+ // entities may have markup constructs
+ case '<':
+ if (code == CTX_ENTITY || code == CTX_UNPARSED)
+ continue;
+ esc = "lt";
+ break;
+
+ // as above re markup constructs; but otherwise
+ // except when canonicalizing, this is for consistency
+ case '>':
+ if (code == CTX_ENTITY || code == CTX_UNPARSED)
+ continue;
+ esc = "gt";
+ break;
+ case '\'':
+ if (code == CTX_CONTENT || code == CTX_UNPARSED)
+ continue;
+ if (canonical)
+ continue;
+ esc = "apos";
+ break;
+
+ // needed when printing quoted attribute/entity values
+ case '"':
+ if (code == CTX_CONTENT || code == CTX_UNPARSED)
+ continue;
+ esc = "quot";
+ break;
+
+ // make line ends work per host OS convention
+ case '\n':
+ esc = eol;
+ break;
+
+ //
+ // No other characters NEED special treatment ... except
+ // for encoding-specific issues, like whether the character
+ // can really be represented in that encoding.
+ //
+ default:
+ //
+ // There are characters we can never write safely; getting
+ // them is an error.
+ //
+ // (a) They're never legal in XML ... detected by range
+ // checks, and (eventually) by remerging surrogate
+ // pairs on output. (Easy error for apps to prevent.)
+ //
+ // (b) This encoding can't represent them, and we
+ // can't make reference substitution (e.g. inside
+ // CDATA sections, names, PI data, etc). (Hard for
+ // apps to prevent, except by using UTF-8 or UTF-16
+ // as their output encoding.)
+ //
+ // We know a very little bit about what characters
+ // the US-ASCII and ISO-8859-1 encodings support. For
+ // other encodings we can't detect the second type of
+ // error at all. (Never an issue for UTF-8 or UTF-16.)
+ //
+
+// FIXME: CR in CDATA is an error; in text, turn to a char ref
+
+// FIXME: CR/LF/TAB in attributes should become char refs
+
+ if ((c > 0xfffd)
+ || ((c < 0x0020) && !((c == 0x0009)
+ || (c == 0x000A) || (c == 0x000D)))
+ || (((c & dangerMask) != 0)
+ && (code == CTX_UNPARSED))) {
+
+ // if case (b) in CDATA, we might end the section,
+ // write a reference, then restart ... possible
+ // in one DOM L3 draft.
+
+ throw new CharConversionException (
+ "Illegal or non-writable character: U+"
+ + Integer.toHexString (c));
+ }
+
+ //
+ // If the output encoding represents the character
+ // directly, let it do so! Else we'll escape it.
+ //
+ if ((c & dangerMask) == 0)
+ continue;
+ esc = null;
+
+ // Avoid numeric refs where symbolic ones exist, as
+ // symbolic ones make more sense to humans reading!
+ if (xhtml) {
+ // all the HTMLlat1x.ent entities
+ // (all the "ISO-8859-1" characters)
+ if (c >= 160 && c <= 255)
+ esc = HTMLlat1x [c - 160];
+
+ // not quite half the HTMLsymbolx.ent entities
+ else if (c >= 913 && c <= 937)
+ esc = HTMLsymbolx_GR [c - 913];
+ else if (c >= 945 && c <= 969)
+ esc = HTMLsymbolx_gr [c - 945];
+
+ else switch (c) {
+ // all of the HTMLspecialx.ent entities
+ case 338: esc = "OElig"; break;
+ case 339: esc = "oelig"; break;
+ case 352: esc = "Scaron"; break;
+ case 353: esc = "scaron"; break;
+ case 376: esc = "Yuml"; break;
+ case 710: esc = "circ"; break;
+ case 732: esc = "tilde"; break;
+ case 8194: esc = "ensp"; break;
+ case 8195: esc = "emsp"; break;
+ case 8201: esc = "thinsp"; break;
+ case 8204: esc = "zwnj"; break;
+ case 8205: esc = "zwj"; break;
+ case 8206: esc = "lrm"; break;
+ case 8207: esc = "rlm"; break;
+ case 8211: esc = "ndash"; break;
+ case 8212: esc = "mdash"; break;
+ case 8216: esc = "lsquo"; break;
+ case 8217: esc = "rsquo"; break;
+ case 8218: esc = "sbquo"; break;
+ case 8220: esc = "ldquo"; break;
+ case 8221: esc = "rdquo"; break;
+ case 8222: esc = "bdquo"; break;
+ case 8224: esc = "dagger"; break;
+ case 8225: esc = "Dagger"; break;
+ case 8240: esc = "permil"; break;
+ case 8249: esc = "lsaquo"; break;
+ case 8250: esc = "rsaquo"; break;
+ case 8364: esc = "euro"; break;
+
+ // the other HTMLsymbox.ent entities
+ case 402: esc = "fnof"; break;
+ case 977: esc = "thetasym"; break;
+ case 978: esc = "upsih"; break;
+ case 982: esc = "piv"; break;
+ case 8226: esc = "bull"; break;
+ case 8230: esc = "hellip"; break;
+ case 8242: esc = "prime"; break;
+ case 8243: esc = "Prime"; break;
+ case 8254: esc = "oline"; break;
+ case 8260: esc = "frasl"; break;
+ case 8472: esc = "weierp"; break;
+ case 8465: esc = "image"; break;
+ case 8476: esc = "real"; break;
+ case 8482: esc = "trade"; break;
+ case 8501: esc = "alefsym"; break;
+ case 8592: esc = "larr"; break;
+ case 8593: esc = "uarr"; break;
+ case 8594: esc = "rarr"; break;
+ case 8595: esc = "darr"; break;
+ case 8596: esc = "harr"; break;
+ case 8629: esc = "crarr"; break;
+ case 8656: esc = "lArr"; break;
+ case 8657: esc = "uArr"; break;
+ case 8658: esc = "rArr"; break;
+ case 8659: esc = "dArr"; break;
+ case 8660: esc = "hArr"; break;
+ case 8704: esc = "forall"; break;
+ case 8706: esc = "part"; break;
+ case 8707: esc = "exist"; break;
+ case 8709: esc = "empty"; break;
+ case 8711: esc = "nabla"; break;
+ case 8712: esc = "isin"; break;
+ case 8713: esc = "notin"; break;
+ case 8715: esc = "ni"; break;
+ case 8719: esc = "prod"; break;
+ case 8721: esc = "sum"; break;
+ case 8722: esc = "minus"; break;
+ case 8727: esc = "lowast"; break;
+ case 8730: esc = "radic"; break;
+ case 8733: esc = "prop"; break;
+ case 8734: esc = "infin"; break;
+ case 8736: esc = "ang"; break;
+ case 8743: esc = "and"; break;
+ case 8744: esc = "or"; break;
+ case 8745: esc = "cap"; break;
+ case 8746: esc = "cup"; break;
+ case 8747: esc = "int"; break;
+ case 8756: esc = "there4"; break;
+ case 8764: esc = "sim"; break;
+ case 8773: esc = "cong"; break;
+ case 8776: esc = "asymp"; break;
+ case 8800: esc = "ne"; break;
+ case 8801: esc = "equiv"; break;
+ case 8804: esc = "le"; break;
+ case 8805: esc = "ge"; break;
+ case 8834: esc = "sub"; break;
+ case 8835: esc = "sup"; break;
+ case 8836: esc = "nsub"; break;
+ case 8838: esc = "sube"; break;
+ case 8839: esc = "supe"; break;
+ case 8853: esc = "oplus"; break;
+ case 8855: esc = "otimes"; break;
+ case 8869: esc = "perp"; break;
+ case 8901: esc = "sdot"; break;
+ case 8968: esc = "lceil"; break;
+ case 8969: esc = "rceil"; break;
+ case 8970: esc = "lfloor"; break;
+ case 8971: esc = "rfloor"; break;
+ case 9001: esc = "lang"; break;
+ case 9002: esc = "rang"; break;
+ case 9674: esc = "loz"; break;
+ case 9824: esc = "spades"; break;
+ case 9827: esc = "clubs"; break;
+ case 9829: esc = "hearts"; break;
+ case 9830: esc = "diams"; break;
+ }
+ }
+
+ // else escape with numeric char refs
+ if (esc == null) {
+ stringBuf.setLength (0);
+ stringBuf.append ("#x");
+ stringBuf.append (Integer.toHexString (c).toUpperCase ());
+ esc = stringBuf.toString ();
+
+ // FIXME: We don't write surrogate pairs correctly.
+ // They should work as one ref per character, since
+ // each pair is one character. For reading back into
+ // Unicode, it matters beginning in Unicode 3.1 ...
+ }
+ break;
+ }
+ if (i != first)
+ rawWrite (buf, off + first, i - first);
+ first = i + 1;
+ if (esc == eol)
+ newline ();
+ else {
+ rawWrite ('&');
+ rawWrite (esc);
+ rawWrite (';');
+ }
+ }
+ if (first < len)
+ rawWrite (buf, off + first, len - first);
+ }
+
+
+
+ private void newline ()
+ throws SAXException, IOException
+ {
+ out.write (eol);
+ column = 0;
+ }
+
+ private void doIndent ()
+ throws SAXException, IOException
+ {
+ int space = elementNestLevel * 2;
+
+ newline ();
+ column = space;
+ // track tabs only at line starts
+ while (space > 8) {
+ out.write ("\t");
+ space -= 8;
+ }
+ while (space > 0) {
+ out.write (" ");
+ space -= 2;
+ }
+ }
+
+ private void rawWrite (char c)
+ throws IOException
+ {
+ out.write (c);
+ column++;
+ }
+
+ private void rawWrite (String s)
+ throws SAXException, IOException
+ {
+ if (prettyPrinting && "default".equals (space.peek ())) {
+ char data [] = s.toCharArray ();
+ rawWrite (data, 0, data.length);
+ } else {
+ out.write (s);
+ column += s.length ();
+ }
+ }
+
+ // NOTE: if xhtml, the REC gives some rules about whitespace
+ // which we could follow ... notably, many places where conformant
+ // agents "must" consolidate/normalize whitespace. Line ends can
+ // be removed there, etc. This may not be the right place to do
+ // such mappings though.
+
+ // Line buffering may help clarify algorithms and improve results.
+
+ // It's likely xml:space needs more attention.
+
+ private void rawWrite (char buf [], int offset, int length)
+ throws SAXException, IOException
+ {
+ boolean wrap;
+
+ if (prettyPrinting && space.empty ())
+ fatal ("stack discipline", null);
+
+ wrap = prettyPrinting && "default".equals (space.peek ());
+ if (!wrap) {
+ out.write (buf, offset, length);
+ column += length;
+ return;
+ }
+
+ // we're pretty printing and want to fill lines out only
+ // to the desired line length.
+ while (length > 0) {
+ int target = lineLength - column;
+ boolean wrote = false;
+
+ // Do we even have a problem?
+ if (target > length || noWrap) {
+ out.write (buf, offset, length);
+ column += length;
+ return;
+ }
+
+ // break the line at a space character, trying to fill
+ // as much of the line as possible.
+ char c;
+
+ for (int i = target - 1; i >= 0; i--) {
+ if ((c = buf [offset + i]) == ' ' || c == '\t') {
+ i++;
+ out.write (buf, offset, i);
+ doIndent ();
+ offset += i;
+ length -= i;
+ wrote = true;
+ break;
+ }
+ }
+ if (wrote)
+ continue;
+
+ // no space character permitting break before target
+ // line length is filled. So, take the next one.
+ if (target < 0)
+ target = 0;
+ for (int i = target; i < length; i++)
+ if ((c = buf [offset + i]) == ' ' || c == '\t') {
+ i++;
+ out.write (buf, offset, i);
+ doIndent ();
+ offset += i;
+ length -= i;
+ wrote = true;
+ break;
+ }
+ if (wrote)
+ continue;
+
+ // no such luck.
+ out.write (buf, offset, length);
+ column += length;
+ break;
+ }
+ }
+}