From f6ed570eb2249385700b8496b7e944b78c3d2d34 Mon Sep 17 00:00:00 2001 From: nanbor Date: Mon, 3 Dec 2001 03:01:27 +0000 Subject: *** empty log message *** --- ACEXML/docs/TODO.txt | 7 +- ACEXML/examples/SAXPrint/Print_Handler.cpp | 32 ++- ACEXML/examples/SAXPrint/svc.conf.xml | 14 +- ACEXML/parser/parser/Parser.cpp | 332 ++++++++++++++++++++++++++--- ACEXML/parser/parser/Parser.h | 80 ++++++- 5 files changed, 413 insertions(+), 52 deletions(-) diff --git a/ACEXML/docs/TODO.txt b/ACEXML/docs/TODO.txt index 5fa955a8985..2f0227f3884 100644 --- a/ACEXML/docs/TODO.txt +++ b/ACEXML/docs/TODO.txt @@ -17,11 +17,10 @@
  • Add Schema parsing ability. It is not clear to me, however, how to specify which schema a document is associated with. -
  • Add document locator implementation so we can report in which - line/column an error occurs. -
  • Add support for resolving external entities, such as a - schema/namespace definition located on the web. + schema/namespace definition located on the web. There should be + a factory object to create various kinds of @ref CharStream + based on the URN or PEReference being parsed.
  • A char stream should be able to differentiate the file encoding and perform the correct transcoding automatically. diff --git a/ACEXML/examples/SAXPrint/Print_Handler.cpp b/ACEXML/examples/SAXPrint/Print_Handler.cpp index d1248a7f8ab..82a2a59a415 100644 --- a/ACEXML/examples/SAXPrint/Print_Handler.cpp +++ b/ACEXML/examples/SAXPrint/Print_Handler.cpp @@ -147,24 +147,38 @@ ACEXML_Print_Handler::startPrefixMapping (const ACEXML_Char *, // *** Methods inherit from ACEXML_DTDHandler. void -ACEXML_Print_Handler::notationDecl (const ACEXML_Char *, - const ACEXML_Char *, - const ACEXML_Char *, +ACEXML_Print_Handler::notationDecl (const ACEXML_Char *name, + const ACEXML_Char *publicID, + const ACEXML_Char *systemID, ACEXML_Env &) // ACE_THROW_SPEC ((ACEXML_SAXException)) { - // No-op. + cout << "* Event notationDecl: (" << name << ") "; + + if (publicID == 0) + cout << "SYSTEM " << systemID << endl; + else if (systemID == 0) + cout << "PUBLIC " << publicID << endl; + else + cout << "PUBLIC " << publicID << " " << systemID << endl; } void -ACEXML_Print_Handler::unparsedEntityDecl (const ACEXML_Char *, - const ACEXML_Char *, - const ACEXML_Char *, - const ACEXML_Char *, +ACEXML_Print_Handler::unparsedEntityDecl (const ACEXML_Char *name, + const ACEXML_Char *publicID, + const ACEXML_Char *systemID, + const ACEXML_Char *notationName, ACEXML_Env &) // ACE_THROW_SPEC ((ACEXML_SAXException)) { - // No-op. + cout << "* Unparsed Entity: " << name; + + if (publicID == 0) + cout << " SYSTEM " << systemID; + else + cout << " PUBLIC " << publicID << " " << systemID; + + cout << " NDATA " << notationName << endl; } // Methods inherit from ACEXML_EnitityResolver. 
diff --git a/ACEXML/examples/SAXPrint/svc.conf.xml b/ACEXML/examples/SAXPrint/svc.conf.xml index 356dbb5e654..863ce22f35e 100644 --- a/ACEXML/examples/SAXPrint/svc.conf.xml +++ b/ACEXML/examples/SAXPrint/svc.conf.xml @@ -1,5 +1,15 @@ - + + + + + + + + ]> + -d @@ -32,7 +42,7 @@ - + diff --git a/ACEXML/parser/parser/Parser.cpp b/ACEXML/parser/parser/Parser.cpp index 25b14e5344d..346b113527c 100644 --- a/ACEXML/parser/parser/Parser.cpp +++ b/ACEXML/parser/parser/Parser.cpp @@ -30,7 +30,11 @@ ACEXML_Parser::ACEXML_Parser (void) : dtd_handler_ (0), entity_resolver_ (0), content_handler_ (0), - error_handler_ (0) + error_handler_ (0), + instream_ (0), + doctype_ (0), + dtd_system_ (0), + dtd_public_ (0) { } @@ -115,6 +119,7 @@ ACEXML_Parser::parse (ACEXML_InputSource *input, this->parse_xml_prolog (xmlenv); ACEXML_CHECK; + // @@ Should startDocument come before or after parsing the DTD definition? this->content_handler_->startDocument (xmlenv); ACEXML_CHECK; @@ -169,7 +174,8 @@ ACEXML_Parser::parse (ACEXML_InputSource *input, } } - this->parse_element (xmlenv); + // Now parse root element. + this->parse_element (1, xmlenv); ACEXML_CHECK; this->content_handler_->endDocument (xmlenv); @@ -215,6 +221,19 @@ ACEXML_Parser::skip_whitespace (ACEXML_Char **whitespace) return ch; } +int +ACEXML_Parser::skip_whitespace_count (ACEXML_Char *peeky) +{ + int wscount = 0; + ACEXML_Char dummy; + ACEXML_Char &forward = (peeky == 0 ? 
dummy : *peeky); + + for (;this->is_whitespace ((forward = this->peek ())); ++wscount) + this->get (); + + return wscount; +} + void ACEXML_Parser::parse_xml_prolog (ACEXML_Env &xmlenv) // ACE_THROW_SPEC ((ACEXML_SAXException)) @@ -454,7 +473,7 @@ ACEXML_Parser::parse_doctypedecl (ACEXML_Env &xmlenv) this->get () != 'P' || this->get () != 'E') { - xmlenv.exception (new ACEXML_SAXParseException ("Expecting 'DOCTYPE'")); + xmlenv.exception (new ACEXML_SAXParseException ("Expecting keyword 'DOCTYPE'")); return -1; } @@ -465,26 +484,36 @@ ACEXML_Parser::parse_doctypedecl (ACEXML_Env &xmlenv) return -1; } - // ACEXML_Char *doctype = - this->read_name (nextch); + this->doctype_ = this->read_name (nextch); - nextch = this->skip_whitespace (0); + this->skip_whitespace_count (&nextch); + + if (nextch == 'S' || nextch == 'P') // ExternalID defined + { + this->parse_external_id_and_ref (this->dtd_public_, + this->dtd_system_, + xmlenv); + if (xmlenv.exception () != 0) + return -1; + else if (this->dtd_public_ == 0) + cout << "ACEXML Parser got external DTD id: SYSTEM " << this->dtd_system_ + << endl; + else + cout << "==> ACEXML Parser got DTD external id: PUBLIC " + << this->dtd_public_ + << " " + << this->dtd_system_ + << endl; + } + nextch = this->skip_whitespace (0); switch (nextch) { case '[': // Internal DTD definitionl if (this->parse_internal_dtd (xmlenv) < 0) return -1; // Error in markupdecl break; - case 'S': // SYSTEM - case 'P': // PUBLIC - { - // Error: We don't handle either system or public ID yet. - xmlenv.exception (new ACEXML_SAXNotSupportedException ()); - return -1; - } - break; - case '>': // No DTD definition + case '>': // End of DTD definition // this is an XML document without a dectypedecl. return 0; case '0': @@ -494,13 +523,8 @@ ACEXML_Parser::parse_doctypedecl (ACEXML_Env &xmlenv) break; } - nextch = this->skip_whitespace (0); - - switch (nextch) + if (this->skip_whitespace (0) != '>') { - case '>': - return 0; // all is fine now. 
- default: xmlenv.exception (new ACEXML_SAXParseException ("Internal error")); return -1; } @@ -508,7 +532,7 @@ ACEXML_Parser::parse_doctypedecl (ACEXML_Env &xmlenv) } void -ACEXML_Parser::parse_element (ACEXML_Env &xmlenv) +ACEXML_Parser::parse_element (int is_root, ACEXML_Env &xmlenv) // ACE_THROW_SPEC ((ACEXML_SAXException)) { // Parse STag. @@ -521,6 +545,14 @@ ACEXML_Parser::parse_element (ACEXML_Env &xmlenv) return; } + if (is_root && + this->doctype_ != 0 && + ACE_OS_String::strcmp (startname, this->doctype_) != 0) + { + xmlenv.exception (new ACEXML_SAXParseException ("Root element missing.")); + return; + } + const ACEXML_Char *endname = 0; ACEXML_AttributesImpl attributes; ACEXML_Char ch; @@ -715,7 +747,7 @@ ACEXML_Parser::parse_element (ACEXML_Env &xmlenv) return; default: // a new nested element? - this->parse_element (xmlenv); + this->parse_element (0, xmlenv); ACEXML_CHECK; break; } @@ -1076,7 +1108,7 @@ ACEXML_Parser::get_quoted_string (ACEXML_Char *&str) int ACEXML_Parser::parse_internal_dtd (ACEXML_Env &xmlenv) { - ACEXML_Char nextch = this->get (); + ACEXML_Char nextch = this->skip_whitespace (0); do { switch (nextch) @@ -1106,7 +1138,7 @@ ACEXML_Parser::parse_internal_dtd (ACEXML_Env &xmlenv) break; default: - xmlenv.exception (new ACEXML_SAXParseException ("Invalid decl spec")); + xmlenv.exception (new ACEXML_SAXParseException ("Invalid keyword in decl spec")); return -1; } break; @@ -1152,13 +1184,15 @@ ACEXML_Parser::parse_internal_dtd (ACEXML_Env &xmlenv) } break; - case '%': // DeclSep. + case '%': // DeclSep. Define new PEreference... break; case ']': // End of internal definitions. - return 0; + return 0; // Not applicable when parsing external DTD spec. - case 0: + case 0: // This may not be an error if we decide + // to generalize this function to handle both + // internal and external DTD definitions. 
xmlenv.exception (new ACEXML_SAXParseException ("Unexpected EOF")); return -1; @@ -1189,8 +1223,134 @@ ACEXML_Parser::parse_element_decl (ACEXML_Env &xmlenv) int ACEXML_Parser::parse_entity_decl (ACEXML_Env &xmlenv) { - xmlenv.exception (new ACEXML_SAXNotSupportedException ()); - return -1; + ACEXML_Char nextch; + + if (this->get () != 'N' || + this->get () != 'T' || + this->get () != 'I' || + this->get () != 'T' || + this->get () != 'Y' || + this->skip_whitespace_count (&nextch) == 0) + { + xmlenv.exception (new ACEXML_SAXParseException ("Expecting keyword `ENTITY'")); + return -1; + } + + int is_GEDecl = 1; + if (nextch == '%') // This is a PEDecl. + { + is_GEDecl = 0; + this->get (); // consume the '%' + if (this->skip_whitespace_count (&nextch) == 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Can't use a reference when defining entity name")); + return -1; + } + } + + ACEXML_Char *entity_name = this->read_name (); + if (entity_name == 0) + { + xmlenv.exception (new ACEXML_SAXParseException ("Error reading ENTITY name.")); + return -1; + } + + this->skip_whitespace_count (&nextch); + + if (nextch == '\'' || nextch == '"') + { + ACEXML_Char *entity_value = 0; + + if (this->get_quoted_string (entity_value) != 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Error reading ENTITY value.")); + return -1; + } + + if (is_GEDecl) + { + if (this->entities_.add_entity (entity_name, entity_value) != 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Error storing entity definition (duplicate definition?)")); + return -1; + } + } + else + { + // @@ need to implement PEdecl lookup mechanism + xmlenv.exception (new ACEXML_SAXNotSupportedException ()); + return -1; + } + } + else + { + ACEXML_Char *systemid, *publicid; + + this->parse_external_id_and_ref (publicid, systemid, xmlenv); + if (xmlenv.exception () != 0) + return -1; + + if (systemid == 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Invalid ExternalID definition (system 
ID missing.)")); + return -1; + } + + this->skip_whitespace_count (&nextch); + if (nextch == 'N') // NDATA section followed + { + if (is_GEDecl == 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Unexpecting keyword NDATA in PEDecl.")); + return -1; + } + + if (this->get () != 'N' || + this->get () != 'D' || + this->get () != 'A' || + this->get () != 'T' || + this->get () != 'A' || + this->skip_whitespace_count (&nextch) == 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Expecting keyword NDATA.")); + return -1; + } + + ACEXML_Char *ndata = this->read_name (); + this->dtd_handler_->unparsedEntityDecl (entity_name, + publicid, + systemid, + ndata, + xmlenv); + if (xmlenv.exception () != 0) + return -1; + } + else + { + // @@ Need to support external CharStream sources + cout << "ENTITY: (" << entity_name << ") "; + + if (publicid == 0) + cout << "SYSTEM " << systemid << endl; + else + cout << "PUBLIC " << publicid << " " << systemid << endl; + } + } + + // End of ENTITY definition + if (this->skip_whitespace (0) != '>') + { + xmlenv.exception (new ACEXML_SAXParseException + ("Expecting end of ENTITY definition.")); + return -1; + } + return 0; } int @@ -1203,6 +1363,114 @@ ACEXML_Parser::parse_attlist_decl (ACEXML_Env &xmlenv) int ACEXML_Parser::parse_notation_decl (ACEXML_Env &xmlenv) { - xmlenv.exception (new ACEXML_SAXNotSupportedException ()); - return -1; + if (this->get () != 'N' || + this->get () != 'O' || + this->get () != 'T' || + this->get () != 'A' || + this->get () != 'T' || + this->get () != 'I' || + this->get () != 'O' || + this->get () != 'N' || + this->skip_whitespace_count () == 0) + { + xmlenv.exception (new ACEXML_SAXParseException ("Expecting keyword `NOTATION'")); + return -1; + } + + ACEXML_Char *notation = this->read_name (); + if (notation == 0) + { + xmlenv.exception (new ACEXML_SAXParseException ("Invalid notation name.")); + return -1; + } + + this->skip_whitespace_count (); + ACEXML_Char *systemid, *publicid; + + 
this->parse_external_id_and_ref (publicid, systemid, xmlenv); + if (xmlenv.exception () != 0) + return -1; + + if (this->get () != '>') + { + xmlenv.exception (new ACEXML_SAXParseException + ("Expecting NOTATION closing '>'.")); + return -1; + } + + this->dtd_handler_->notationDecl (notation, + publicid, + systemid, + xmlenv); + if (xmlenv.exception () != 0) + return -1; + + return 0; +} + +int +ACEXML_Parser::parse_external_id_and_ref (ACEXML_Char *&publicId, + ACEXML_Char *&systemId, + ACEXML_Env &xmlenv) +{ + publicId = systemId = 0; + ACEXML_Char nextch = this->get (); + + switch (nextch) + { + case 'S': // External SYSTEM id. + if (this->get () != 'Y' || + this->get () != 'S' || + this->get () != 'T' || + this->get () != 'E' || + this->get () != 'M' || + this->skip_whitespace_count () == 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Expecting keyword 'SYSTEM'")); + return -1; + } + if (this->get_quoted_string (systemId) != 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Error while parsing SYSTEM literal for SYSTEM id.")); + return -1; + } + break; + case 'P': // External PUBLIC id or previously defined PUBLIC id. + if (this->get () != 'U' || + this->get () != 'B' || + this->get () != 'L' || + this->get () != 'I' || + this->get () != 'C' || + this->skip_whitespace_count () == 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Expecting keyword 'PUBLIC'")); + return -1; + } + if (this->get_quoted_string (publicId) != 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Error while parsing public literal for PUBLIC id.")); + return -1; + } + + this->skip_whitespace_count (&nextch); + if (nextch == '\'' || nextch == '"') // not end of NOTATION yet. 
+ { + if (this->get_quoted_string (systemId) != 0) + { + xmlenv.exception (new ACEXML_SAXParseException + ("Error while parsing system literal for PUBLIC id.")); + return -1; + } + } + break; + default: + xmlenv.exception (new ACEXML_SAXParseException + ("Expecting either keyword `SYSTEM' or `PUBLIC'.")); + return -1; + } + return 0; } diff --git a/ACEXML/parser/parser/Parser.h b/ACEXML/parser/parser/Parser.h index f3a0a85deda..14f17ace3ba 100644 --- a/ACEXML/parser/parser/Parser.h +++ b/ACEXML/parser/parser/Parser.h @@ -159,7 +159,9 @@ public: // *** Helper functions for parsing XML /** - * Skip any whitespaces encounter. + * Skip any whitespaces encounter until the first non-whitespace + * character is encountered and consumed from the current input + * CharStream. * * @param whitespace Return a pointer to the string of skipped * whitespace after proper conversion. Null if there's no @@ -168,9 +170,27 @@ public: * @retval The first none-white space characters (which will be * consumed from the CharStream.) If no whitespace is found, it * will return 0. + * + * @sa skip_whitespace_count */ ACEXML_Char skip_whitespace (ACEXML_Char **whitespace); + /** + * Skip any whitespaces encounter until the first non-whitespace + * character. The first non-whitespace character is not consumed. + * This method does peek into the input CharStream and therefore + * is more expensive than @ref skip_whitespace. + * + * @param peek If non-null, @a peek points to a ACEXML_Char where + * skip_whitespace_count store the first non-whitespace + * character it sees (character is not removed from the stream.) + * + * @retval The number of whitespace characters consumed. + * + * @sa skip_whitespace + */ + int skip_whitespace_count (ACEXML_Char *peek = 0); + /** * Check if a character @a c is a whitespace. * @@ -202,8 +222,8 @@ public: /** * Get a quoted string. Quoted strings are used to specify * attribute values and this routine will replace character and - * entity references on-the-fly. 
Parameter entitys is not allowed - * (or replaced) in this function. + * entity references on-the-fly. Parameter entities are not allowed + * (or replaced) in this function. (But regular entities are.) * * @param str returns the un-quoted string. * @@ -248,8 +268,17 @@ public: /** * Parse an XML element. The first character encountered should * be the first character of the element "Name". + * + * @param is_root If not 0, then we are expecting to see the "root" + * element now, and the next element's name need to match the name + * defined in DOCTYPE definition, i.e., @a this->doctype_. + * + * @todo Instead of simply checking for the root element based on the + * argument @a is_root, we should instead either pass in some sort + * of validator or allow the function to return the element name so it + * can be used in a validator. */ - void parse_element (ACEXML_Env &xmlenv) + void parse_element (int is_root, ACEXML_Env &xmlenv) // ACE_THROW_SPEC ((ACEXML_SAXException)) ; @@ -297,12 +326,16 @@ public: * Parse an "ELEMENT" decl. The first character this method * expects is always the 'L' (the second char) in the word * "ELEMENT". + * + * @retval 0 on success, -1 otherwise. */ int parse_element_decl (ACEXML_Env &xmlenv); /** * Parse an "ENTITY" decl. The first character this method expects * is always the 'N' (the second char) in the word "ENTITY". + * + * @retval 0 on success, -1 otherwise. */ int parse_entity_decl (ACEXML_Env &xmlenv); @@ -310,6 +343,8 @@ public: * Parse an "ATTLIST" decl. Thse first character this method * expects is always the 'A' (the first char) in the word * "ATTLIST". + * + * @retval 0 on success, -1 otherwise. */ int parse_attlist_decl (ACEXML_Env &xmlenv); @@ -317,9 +352,35 @@ public: *Parse a "NOTATION" decl. The first character this method * expects is always the 'N' (the first char) in the word * "NOTATION". + * + * @retval 0 on success, -1 otherwise. 
*/ int parse_notation_decl (ACEXML_Env &xmlenv); + /** + * Parse an ExternalID or a reference to PUBLIC ExternalID. + * Possible cases are in the forms of: + * + * SYSTEM 'quoted string representing system resource' + * PUBLIC 'quoted name of public ID' 'quoted resource' + * PUBLIC 'quoted name we are referring to' + * + * + * The first character this function sees must be either 'S' or 'P'. + * When the function finishes parsing, the input stream points + * at the first non-whitespace character. + * + * @param publicID returns the unquoted publicID read. If none + * is available, it will be reset to 0. + * @param systemID returns the unquoted systemID read. If none + * is available, it will be reset to 0. + * + * @retval 0 on success, -1 otherwise. + */ + int parse_external_id_and_ref (ACEXML_Char *&publicId, + ACEXML_Char *&systemId, + ACEXML_Env &xmlenv); + protected: /// Get a character. ACEXML_Char get (void); @@ -353,9 +414,18 @@ private: ACEXML_ErrorHandler *error_handler_; /// @@ Feature and properties management structure here. - // Current input char stream. + /// Current input char stream. ACEXML_CharStream *instream_; + /// My doctype, if any. + ACEXML_Char *doctype_; + + /// External DTD System Literal, if any. + ACEXML_Char *dtd_system_; + + /// External DTD Public Literal, if any. + ACEXML_Char *dtd_public_; + ACE_Obstack_T obstack_; ACEXML_NamespaceSupport xml_namespace_; -- cgit v1.2.1