summaryrefslogtreecommitdiff
path: root/lib/rexml
diff options
context:
space:
mode:
authorkou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2014-02-23 09:01:32 +0000
committerkou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2014-02-23 09:01:32 +0000
commitba5ed845b30c81fbf92c052b83f54198cd272bbd (patch)
tree5a7fc9f29a6b89560d71fd0d8649e8ba76ae9a52 /lib/rexml
parent44a9509f2ff2be85b97ade9806857e0948c29a1b (diff)
downloadruby-ba5ed845b30c81fbf92c052b83f54198cd272bbd.tar.gz
* lib/rexml/xmltokens.rb: Add the missing valid non-ASCII characters
to the element name characters. Now, REXML name tokens exactly match "[5] Name" in the XML spec and "[4] NCName" in the Namespaces in XML spec. See the comments in the code for details. [Bug #9539] [ruby-core:60901] Reported by Mario Barcala. Thanks!!! * test/rexml/xpath/test_node.rb: Add tests for the above case. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@45153 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/rexml')
-rw-r--r--lib/rexml/xmltokens.rb76
1 files changed, 71 insertions, 5 deletions
diff --git a/lib/rexml/xmltokens.rb b/lib/rexml/xmltokens.rb
index 7dc4e8b2ba..4d4dd27f2d 100644
--- a/lib/rexml/xmltokens.rb
+++ b/lib/rexml/xmltokens.rb
@@ -2,12 +2,78 @@ module REXML
# Defines a number of tokens used for parsing XML. Not for general
# consumption.
module XMLTokens
- NCNAME_STR= '[\w:][\-\w.]*'
- NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
+ # From http://www.w3.org/TR/REC-xml/#sec-common-syn
+ #
+ # [4] NameStartChar ::=
+ # ":" |
+ # [A-Z] |
+ # "_" |
+ # [a-z] |
+ # [#xC0-#xD6] |
+ # [#xD8-#xF6] |
+ # [#xF8-#x2FF] |
+ # [#x370-#x37D] |
+ # [#x37F-#x1FFF] |
+ # [#x200C-#x200D] |
+ # [#x2070-#x218F] |
+ # [#x2C00-#x2FEF] |
+ # [#x3001-#xD7FF] |
+ # [#xF900-#xFDCF] |
+ # [#xFDF0-#xFFFD] |
+ # [#x10000-#xEFFFF]
+ name_start_chars = [
+ ":",
+ "A-Z",
+ "_",
+ "a-z",
+ "\\u00C0-\\u00D6",
+ "\\u00D8-\\u00F6",
+ "\\u00F8-\\u02FF",
+ "\\u0370-\\u037D",
+ "\\u037F-\\u1FFF",
+ "\\u200C-\\u200D",
+ "\\u2070-\\u218F",
+ "\\u2C00-\\u2FEF",
+ "\\u3001-\\uD7FF",
+ "\\uF900-\\uFDCF",
+ "\\uFDF0-\\uFFFD",
+ "\\u{10000}-\\u{EFFFF}",
+ ]
+ # From http://www.w3.org/TR/REC-xml/#sec-common-syn
+ #
+ # [4a] NameChar ::=
+ # NameStartChar |
+ # "-" |
+ # "." |
+ # [0-9] |
+ # #xB7 |
+ # [#x0300-#x036F] |
+ # [#x203F-#x2040]
+ name_chars = name_start_chars + [
+ "\\-",
+ "\\.",
+ "0-9",
+ "\\u00B7",
+ "\\u0300-\\u036F",
+ "\\u203F-\\u2040",
+ ]
+ NAME_START_CHAR = "[#{name_start_chars.join('')}]"
+ NAME_CHAR = "[#{name_chars.join('')}]"
+ NAMECHAR = NAME_CHAR # deprecated. Use NAME_CHAR instead.
- NAMECHAR = '[\-\w\.:]'
- NAME = "([\\w:]#{NAMECHAR}*)"
- NMTOKEN = "(?:#{NAMECHAR})+"
+ # From http://www.w3.org/TR/xml-names11/#NT-NCName
+ #
+ # [6] NCNameStartChar ::= NameStartChar - ':'
+ ncname_start_chars = name_start_chars - [":"]
+ # From http://www.w3.org/TR/xml-names11/#NT-NCName
+ #
+ # [5] NCNameChar ::= NameChar - ':'
+ ncname_chars = name_chars - [":"]
+ NCNAME_STR = "[#{ncname_start_chars.join('')}][#{ncname_chars.join('')}]*"
+ NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
+
+ NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)"
+ NMTOKEN = "(?:#{NAME_CHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"