diff options
author | kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-02-23 09:01:32 +0000 |
---|---|---|
committer | kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-02-23 09:01:32 +0000 |
commit | ba5ed845b30c81fbf92c052b83f54198cd272bbd (patch) | |
tree | 5a7fc9f29a6b89560d71fd0d8649e8ba76ae9a52 /lib/rexml | |
parent | 44a9509f2ff2be85b97ade9806857e0948c29a1b (diff) | |
download | ruby-ba5ed845b30c81fbf92c052b83f54198cd272bbd.tar.gz |
* lib/rexml/xmltokens.rb: Add missing non ASCII valid characters
to element name characters. Now, REXML name tokens exactly
match "[5] Name" in the XML spec and "[4] NCName" in the
Namespaces in XML spec. See comment about the details.
[Bug #9539] [ruby-core:60901]
Reported by Mario Barcala. Thanks!!!
* test/rexml/xpath/test_node.rb: Add tests for the above case.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@45153 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/rexml')
-rw-r--r-- | lib/rexml/xmltokens.rb | 76 |
1 files changed, 71 insertions, 5 deletions
diff --git a/lib/rexml/xmltokens.rb b/lib/rexml/xmltokens.rb index 7dc4e8b2ba..4d4dd27f2d 100644 --- a/lib/rexml/xmltokens.rb +++ b/lib/rexml/xmltokens.rb @@ -2,12 +2,78 @@ module REXML # Defines a number of tokens used for parsing XML. Not for general # consumption. module XMLTokens - NCNAME_STR= '[\w:][\-\w.]*' - NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + # From http://www.w3.org/TR/REC-xml/#sec-common-syn + # + # [4] NameStartChar ::= + # ":" | + # [A-Z] | + # "_" | + # [a-z] | + # [#xC0-#xD6] | + # [#xD8-#xF6] | + # [#xF8-#x2FF] | + # [#x370-#x37D] | + # [#x37F-#x1FFF] | + # [#x200C-#x200D] | + # [#x2070-#x218F] | + # [#x2C00-#x2FEF] | + # [#x3001-#xD7FF] | + # [#xF900-#xFDCF] | + # [#xFDF0-#xFFFD] | + # [#x10000-#xEFFFF] + name_start_chars = [ + ":", + "A-Z", + "_", + "a-z", + "\\u00C0-\\u00D6", + "\\u00D8-\\u00F6", + "\\u00F8-\\u02FF", + "\\u0370-\\u037D", + "\\u037F-\\u1FFF", + "\\u200C-\\u200D", + "\\u2070-\\u218F", + "\\u2C00-\\u2FEF", + "\\u3001-\\uD7FF", + "\\uF900-\\uFDCF", + "\\uFDF0-\\uFFFD", + "\\u{10000}-\\u{EFFFF}", + ] + # From http://www.w3.org/TR/REC-xml/#sec-common-syn + # + # [4a] NameChar ::= + # NameStartChar | + # "-" | + # "." | + # [0-9] | + # #xB7 | + # [#x0300-#x036F] | + # [#x203F-#x2040] + name_chars = name_start_chars + [ + "\\-", + "\\.", + "0-9", + "\\u00B7", + "\\u0300-\\u036F", + "\\u203F-\\u2040", + ] + NAME_START_CHAR = "[#{name_start_chars.join('')}]" + NAME_CHAR = "[#{name_chars.join('')}]" + NAMECHAR = NAME_CHAR # deprecated. Use NAME_CHAR instead. - NAMECHAR = '[\-\w\.:]' - NAME = "([\\w:]#{NAMECHAR}*)" - NMTOKEN = "(?:#{NAMECHAR})+" + # From http://www.w3.org/TR/xml-names11/#NT-NCName + # + # [6] NCNameStartChar ::= NameStartChar - ':' + ncname_start_chars = name_start_chars - [":"] + # From http://www.w3.org/TR/xml-names11/#NT-NCName + # + # [5] NCNameChar ::= NameChar - ':' + ncname_chars = name_chars - [":"] + NCNAME_STR = "[#{ncname_start_chars.join('')}][#{ncname_chars.join('')}]*" + NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + + NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)" + NMTOKEN = "(?:#{NAME_CHAR})+" NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" |