summaryrefslogtreecommitdiff
path: root/lisp
diff options
context:
space:
mode:
authorChong Yidong <cyd@gnu.org>2012-05-09 16:33:48 +0800
committerChong Yidong <cyd@gnu.org>2012-05-09 16:33:48 +0800
commitce7b18ec41c5102f4af27ec22cf873a75f510630 (patch)
tree7608fccb83f4887f7e95f925b2ee36cd4f78be7d /lisp
parent66b03a53a3218479b93d64857a08b6d4cb5a0f7f (diff)
downloademacs-ce7b18ec41c5102f4af27ec22cf873a75f510630.tar.gz
Improve RFC 3986 conformance of url package.
Fix 2012-04-10 change to url.el. * url-http.el (url-http-create-request): Ignore obsolete attributes slot of url-object. * url-parse.el: Improve RFC 3986 conformance. (url-generic-parse-url): Do not populate the ATTRIBUTES slot, since this is not reliable for general RFC 3986 URIs. Keep the whole path and query inside the FILENAME slot. Improve docstring. (url-recreate-url-attributes): Mark as obsolete. (url-recreate-url): Handle missing scheme and userinfo. * url-util.el (url-encode-url): New function for URL quoting. (url-encoding-table, url-host-allowed-chars) (url-path-allowed-chars): New constants. (url--allowed-chars): New helper function. (url-hexify-string): Use them. * url-vars.el (url-nonrelative-link): Make the regexp stricter. * url.el (url-retrieve-internal): Use url-encode-url. Fixes: debbugs:7017
Diffstat (limited to 'lisp')
-rw-r--r--lisp/url/ChangeLog22
-rw-r--r--lisp/url/url-http.el3
-rw-r--r--lisp/url/url-parse.el208
-rw-r--r--lisp/url/url-util.el135
-rw-r--r--lisp/url/url-vars.el6
-rw-r--r--lisp/url/url.el10
6 files changed, 259 insertions, 125 deletions
diff --git a/lisp/url/ChangeLog b/lisp/url/ChangeLog
index 7ce3489cfcc..3980b22d4c1 100644
--- a/lisp/url/ChangeLog
+++ b/lisp/url/ChangeLog
@@ -1,3 +1,25 @@
+2012-05-09 Chong Yidong <cyd@gnu.org>
+
+ * url-util.el (url-encode-url): New function for URL quoting.
+ (url-encoding-table, url-host-allowed-chars)
+ (url-path-allowed-chars): New constants.
+ (url--allowed-chars): New helper function.
+ (url-hexify-string): Use them.
+
+ * url-parse.el: Improve RFC 3986 conformance.
+ (url-generic-parse-url): Do not populate the ATTRIBUTES slot,
+ since this is not reliable for general RFC 3986 URIs. Keep the
+ whole path and query inside the FILENAME slot. Improve docstring.
+ (url-recreate-url-attributes): Mark as obsolete.
+ (url-recreate-url): Handle missing scheme and userinfo.
+
+ * url-http.el (url-http-create-request): Ignore obsolete
+ attributes slot of url-object.
+
+ * url-vars.el (url-nonrelative-link): Make the regexp stricter.
+
+ * url.el (url-retrieve-internal): Use url-encode-url (Bug#7017).
+
2012-04-26 Stefan Monnier <monnier@iro.umontreal.ca>
* url.el (url-retrieve-synchronously): Replace lexical-let by
diff --git a/lisp/url/url-http.el b/lisp/url/url-http.el
index a4726489814..ff026da2d21 100644
--- a/lisp/url/url-http.el
+++ b/lisp/url/url-http.el
@@ -223,8 +223,7 @@ request.")
(let ((url-basic-auth-storage
'url-http-proxy-basic-auth-storage))
(url-get-authentication url-http-target-url nil 'any nil))))
- (real-fname (concat (url-filename url-http-target-url)
- (url-recreate-url-attributes url-http-target-url)))
+ (real-fname (url-filename url-http-target-url))
(host (url-host url-http-target-url))
(auth (if (cdr-safe (assoc "Authorization" url-http-extra-headers))
nil
diff --git a/lisp/url/url-parse.el b/lisp/url/url-parse.el
index b91c85c0c3d..40183a4f533 100644
--- a/lisp/url/url-parse.el
+++ b/lisp/url/url-parse.el
@@ -48,21 +48,31 @@
;;;###autoload
(defun url-recreate-url (urlobj)
"Recreate a URL string from the parsed URLOBJ."
- (concat (url-type urlobj) ":" (if (url-host urlobj) "//" "")
- (if (url-user urlobj)
- (concat (url-user urlobj)
- (if (url-password urlobj)
- (concat ":" (url-password urlobj)))
- "@"))
- (url-host urlobj)
- (if (and (url-port urlobj)
- (not (equal (url-port urlobj)
- (url-scheme-get-property (url-type urlobj) 'default-port))))
- (format ":%d" (url-port urlobj)))
- (or (url-filename urlobj) "/")
- (url-recreate-url-attributes urlobj)
- (if (url-target urlobj)
- (concat "#" (url-target urlobj)))))
+ (let ((type (url-type urlobj))
+ (user (url-user urlobj))
+ (pass (url-password urlobj))
+ (host (url-host urlobj))
+ (port (url-portspec urlobj))
+ (file (url-filename urlobj))
+ (frag (url-target urlobj)))
+ (concat (if type (concat type ":"))
+ (if (url-fullness urlobj) "//")
+ (if (or user pass)
+ (concat user
+ (if pass (concat ":" pass))
+ "@"))
+ host
+ ;; RFC 3986: "omit the port component and its : delimiter
+ ;; if port is empty or if its value would be the same as
+ ;; that of the scheme's default."
+ (and port
+ (or (null type)
+ (not (equal port
+ (url-scheme-get-property type
+ 'default-port))))
+ (format ":%d" (url-port urlobj)))
+ (or file "/")
+ (if frag (concat "#" frag)))))
(defun url-recreate-url-attributes (urlobj)
"Recreate the attributes of an URL string from the parsed URLOBJ."
@@ -73,107 +83,129 @@
(concat (car x) "=" (cdr x))
(car x)))
(url-attributes urlobj) ";"))))
+(make-obsolete 'url-recreate-url-attributes nil "24.2")
;;;###autoload
(defun url-generic-parse-url (url)
"Return an URL-struct of the parts of URL.
The CL-style struct contains the following fields:
-TYPE USER PASSWORD HOST PORTSPEC FILENAME TARGET ATTRIBUTES FULLNESS."
- ;; See RFC 3986.
- (cond
- ((null url)
- (url-parse-make-urlobj))
- ((or (not (string-match url-nonrelative-link url))
- (= ?/ (string-to-char url)))
- ;; This isn't correct, as a relative URL can be a fragment link
- ;; (e.g. "#foo") and many other things (see section 4.2).
- ;; However, let's not fix something that isn't broken, especially
- ;; when close to a release.
- (url-parse-make-urlobj nil nil nil nil nil url))
- (t
+
+TYPE is the URI scheme (string or nil).
+USER is the user name (string or nil).
+PASSWORD is the password (string [deprecated] or nil).
+HOST is the host (a registered name, IP literal in square
+ brackets, or IPv4 address in dotted-decimal form).
+PORTSPEC is the specified port (a number), or nil.
+FILENAME is the path AND the query component of the URI.
+TARGET is the fragment identifier component (used to refer to a
+ subordinate resource, e.g. a part of a webpage).
+ATTRIBUTES is nil; this slot originally stored the attribute and
+ value alists for IMAP URIs, but this feature was removed
+ since it conflicts with RFC 3986.
+FULLNESS is non-nil iff the authority component of the URI is
+ present.
+
+The parser follows RFC 3986, except that it also tries to handle
+URIs that are not fully specified (e.g. lacking TYPE), and it
+does not check for or perform %-encoding.
+
+Here is an example. The URL
+
+ foo://bob:pass@example.com:42/a/b/c.dtb?type=animal&name=narwhal#nose
+
+parses to
+
+ TYPE = \"foo\"
+ USER = \"bob\"
+ PASSWORD = \"pass\"
+ HOST = \"example.com\"
+ PORTSPEC = 42
+ FILENAME = \"/a/b/c.dtb?type=animal&name=narwhal\"
+ TARGET = \"nose\"
+ ATTRIBUTES = nil
+ FULLNESS = t"
+ (if (null url)
+ (url-parse-make-urlobj)
(with-temp-buffer
;; Don't let those temp-buffer modifications accidentally
;; deactivate the mark of the current-buffer.
(let ((deactivate-mark nil))
(set-syntax-table url-parse-syntax-table)
- (let ((save-pos nil)
- (prot nil)
- (user nil)
- (pass nil)
- (host nil)
- (port nil)
- (file nil)
- (refs nil)
- (attr nil)
- (full nil)
+ (erase-buffer)
+ (insert url)
+ (goto-char (point-min))
+ (let ((save-pos (point))
+ scheme user pass host port file fragment full
(inhibit-read-only t))
- (erase-buffer)
- (insert url)
- (goto-char (point-min))
- (setq save-pos (point))
;; 3.1. Scheme
- (unless (looking-at "//")
- (skip-chars-forward "a-zA-Z+.\\-")
- (downcase-region save-pos (point))
- (setq prot (buffer-substring save-pos (point)))
- (skip-chars-forward ":")
- (setq save-pos (point)))
+ ;; This is nil for a URI that is not fully specified.
+ (when (looking-at "\\([a-zA-Z][-a-zA-Z0-9+.]*\\):")
+ (goto-char (match-end 0))
+ (setq save-pos (point))
+ (setq scheme (downcase (match-string 1))))
;; 3.2. Authority
(when (looking-at "//")
(setq full t)
(forward-char 2)
(setq save-pos (point))
- (skip-chars-forward "^/")
+ (skip-chars-forward "^/?#")
(setq host (buffer-substring save-pos (point)))
+ ;; 3.2.1 User Information
(if (string-match "^\\([^@]+\\)@" host)
(setq user (match-string 1 host)
- host (substring host (match-end 0) nil)))
- (if (and user (string-match "\\([^:]+\\):\\(.*\\)" user))
+ host (substring host (match-end 0))))
+ (if (and user (string-match "\\`\\([^:]*\\):\\(.*\\)" user))
(setq pass (match-string 2 user)
user (match-string 1 user)))
- ;; This gives wrong results for IPv6 literal addresses.
- (if (string-match ":\\([0-9+]+\\)" host)
- (setq port (string-to-number (match-string 1 host))
- host (substring host 0 (match-beginning 0))))
- (if (string-match ":$" host)
- (setq host (substring host 0 (match-beginning 0))))
- (setq host (downcase host)
- save-pos (point)))
-
- (if (not port)
- (setq port (url-scheme-get-property prot 'default-port)))
-
- ;; 3.3. Path
- ;; Gross hack to preserve ';' in data URLs
+ (cond
+ ;; IPv6 literal address.
+ ((string-match "^\\(\\[[^]]+\\]\\)\\(?::\\([0-9]*\\)\\)?$" host)
+ (setq port (match-string 2 host)
+ host (match-string 1 host)))
+ ;; Registered name or IPv4 address.
+ ((string-match ":\\([0-9]*\\)$" host)
+ (setq port (match-string 1 host)
+ host (substring host 0 (match-beginning 0)))))
+ (cond ((equal port "")
+ (setq port nil))
+ (port
+ (setq port (string-to-number port))))
+ (setq host (downcase host)))
+
+ (and (null port)
+ scheme
+ (setq port (url-scheme-get-property scheme 'default-port)))
+
+ ;; Now point is on the / ? or # which terminates the
+ ;; authority, or at the end of the URI, or (if there is no
+ ;; authority) at the beginning of the absolute path.
+
(setq save-pos (point))
+ (if (string= "data" scheme)
+ ;; For the "data" URI scheme, all the rest is the FILE.
+ (setq file (buffer-substring save-pos (point-max)))
+ ;; For hysterical raisins, our data structure returns the
+ ;; path and query components together in one slot.
+ ;; 3.3. Path
+ (skip-chars-forward "^?#")
+ ;; 3.4. Query
+ (when (looking-at "?")
+ (skip-chars-forward "^#"))
+ (setq file (buffer-substring save-pos (point)))
+ ;; 3.5 Fragment
+ (when (looking-at "#")
+ (let ((opoint (point)))
+ (forward-char 1)
+ (unless (eobp)
+ (setq fragment (buffer-substring (point) (point-max))))
+ (delete-region opoint (point-max)))))
- ;; 3.4. Query
- (if (string= "data" prot)
- (goto-char (point-max))
- ;; Now check for references
- (skip-chars-forward "^#")
- (if (eobp)
- nil
- (delete-region
- (point)
- (progn
- (skip-chars-forward "#")
- (setq refs (buffer-substring (point) (point-max)))
- (point-max))))
- (goto-char save-pos)
- (skip-chars-forward "^;")
- (unless (eobp)
- (setq attr (url-parse-args (buffer-substring (point) (point-max))
- t)
- attr (nreverse attr))))
-
- (setq file (buffer-substring save-pos (point)))
(if (and host (string-match "%[0-9][0-9]" host))
(setq host (url-unhex-string host)))
- (url-parse-make-urlobj
- prot user pass host port file refs attr full)))))))
+ (url-parse-make-urlobj scheme user pass host port file
+ fragment nil full))))))
(defmacro url-bit-for-url (method lookfor url)
`(let* ((urlobj (url-generic-parse-url url))
diff --git a/lisp/url/url-util.el b/lisp/url/url-util.el
index d12bd5447fa..c8016ef6cdb 100644
--- a/lisp/url/url-util.el
+++ b/lisp/url/url-util.el
@@ -333,40 +333,117 @@ forbidden in URL encoding."
(concat tmp str)))
(defconst url-unreserved-chars
- '(
- ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
+ '(?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z
?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
- ?- ?_ ?. ?! ?~ ?* ?' ?\( ?\))
- "A list of characters that are _NOT_ reserved in the URL spec.
-This is taken from RFC 2396.")
+ ?- ?_ ?. ?~)
+ "List of characters that are unreserved in the URL spec.
+This is taken from RFC 3986 (section 2.3).")
+
+(defconst url-encoding-table
+ (let ((vec (make-vector 256 nil)))
+ (dotimes (byte 256)
+ (aset vec byte (format "%%%02x" byte)))
+ vec)
+ "Vector translating bytes to URI-encoded %-sequences.")
+
+(defun url--allowed-chars (char-list)
+ "Return an \"allowed character\" mask (a 256-slot vector).
+The Nth element is non-nil if character N is in CHAR-LIST. The
+result can be passed as the second arg to `url-hexify-string'."
+ (let ((vec (make-vector 256 nil)))
+ (dolist (byte char-list)
+ (ignore-errors (aset vec byte t)))
+ vec))
;;;###autoload
-(defun url-hexify-string (string)
- "Return a new string that is STRING URI-encoded.
-First, STRING is converted to utf-8, if necessary. Then, for each
-character in the utf-8 string, those found in `url-unreserved-chars'
-are left as-is, all others are represented as a three-character
-string: \"%\" followed by two lowercase hex digits."
- ;; To go faster and avoid a lot of consing, we could do:
- ;;
- ;; (defconst url-hexify-table
- ;; (let ((map (make-vector 256 nil)))
- ;; (dotimes (byte 256) (aset map byte
- ;; (if (memq byte url-unreserved-chars)
- ;; (char-to-string byte)
- ;; (format "%%%02x" byte))))
- ;; map))
- ;;
- ;; (mapconcat (curry 'aref url-hexify-table) ...)
+(defun url-hexify-string (string &optional allowed-chars)
+ "URI-encode STRING and return the result.
+If STRING is multibyte, it is first converted to a utf-8 byte
+string. Each byte corresponding to an allowed character is left
+as-is, while all other bytes are converted to a three-character
+string: \"%\" followed by two lowercase hex digits.
+
+The allowed characters are specified by ALLOWED-CHARS. If this
+argument is nil, the list `url-unreserved-chars' determines the
+allowed characters. Otherwise, ALLOWED-CHARS should be a vector
+whose Nth element is non-nil if character N is allowed."
+ (unless allowed-chars
+ (setq allowed-chars (url--allowed-chars url-unreserved-chars)))
(mapconcat (lambda (byte)
- (if (memq byte url-unreserved-chars)
- (char-to-string byte)
- (format "%%%02x" byte)))
- (if (multibyte-string-p string)
- (encode-coding-string string 'utf-8)
- string)
- ""))
+ (if (aref allowed-chars byte)
+ (char-to-string byte)
+ (aref url-encoding-table byte)))
+ (if (multibyte-string-p string)
+ (encode-coding-string string 'utf-8)
+ string)
+ ""))
+
+(defconst url-host-allowed-chars
+ ;; Allow % to avoid re-encoding %-encoded sequences.
+ (url--allowed-chars (append '(?% ?! ?$ ?& ?' ?\( ?\) ?* ?+ ?, ?\; ?=)
+ url-unreserved-chars))
+ "Allowed-character byte mask for the host segment of a URI.
+These characters are specified in RFC 3986, Appendix A.")
+
+(defconst url-path-allowed-chars
+ (let ((vec (copy-sequence url-host-allowed-chars)))
+ (aset vec ?/ t)
+ (aset vec ?: t)
+ (aset vec ?@ t)
+ vec)
+ "Allowed-character byte mask for the path segment of a URI.
+These characters are specified in RFC 3986, Appendix A.")
+
+(defconst url-query-allowed-chars
+ (let ((vec (copy-sequence url-path-allowed-chars)))
+ (aset vec ?? t)
+ vec)
+ "Allowed-character byte mask for the query segment of a URI.
+These characters are specified in RFC 3986, Appendix A.")
+
+;;;###autoload
+(defun url-encode-url (url)
+ "Return a properly URI-encoded version of URL.
+This function also performs URI normalization, e.g. converting
+the scheme to lowercase if it is uppercase. Apart from
+normalization, if URL is already URI-encoded, this function
+should return it unchanged."
+ (if (multibyte-string-p url)
+ (setq url (encode-coding-string url 'utf-8)))
+ (let* ((obj (url-generic-parse-url url))
+ (user (url-user obj))
+ (pass (url-password obj))
+ (host (url-host obj))
+ (file (url-filename obj))
+ (frag (url-target obj))
+ path query)
+ (if user
+ (setf (url-user obj) (url-hexify-string user)))
+ (if pass
+ (setf (url-password obj) (url-hexify-string pass)))
+ (when host
+ ;; No special encoding for IPv6 literals.
+ (unless (string-match "\\`\\[.*\\]\\'" host)
+ (setf (url-host obj)
+ (url-hexify-string host url-host-allowed-chars))))
+ ;; Split FILENAME slot into its PATH and QUERY components, and
+ ;; encode them separately. The PATH component can contain
+ ;; unreserved characters, %-encodings, and /:@!$&'()*+,;=
+ (when file
+ (if (string-match "\\?" file)
+ (setq path (substring file 0 (match-beginning 0))
+ query (substring file (match-end 0)))
+ (setq path file))
+ (setq path (url-hexify-string path url-path-allowed-chars))
+ (if query
+ (setq query (url-hexify-string query url-query-allowed-chars)))
+ (setf (url-filename obj)
+ (if query (concat path "?" query) path)))
+ (if frag
+ (setf (url-target obj)
+ (url-hexify-string frag url-query-allowed-chars)))
+ (url-recreate-url obj)))
;;;###autoload
(defun url-file-extension (fname &optional x)
diff --git a/lisp/url/url-vars.el b/lisp/url/url-vars.el
index ff18049e97b..0d71910849f 100644
--- a/lisp/url/url-vars.el
+++ b/lisp/url/url-vars.el
@@ -304,8 +304,12 @@ undefined."
:type '(choice (const :tag "None" :value nil) string)
:group 'url)
+;; From RFC3986: Scheme names consist of a sequence of characters
+;; beginning with a letter and followed by any combination of letters,
+;; digits, plus ("+"), period ("."), or hyphen ("-").
+
(defvar url-nonrelative-link
- "\\`\\([-a-zA-Z0-9+.]+:\\)"
+ "\\`\\([a-zA-Z][-a-zA-Z0-9+.]*:\\)"
"A regular expression that will match an absolute URL.")
(defcustom url-max-redirections 30
diff --git a/lisp/url/url.el b/lisp/url/url.el
index 7884882c6e7..6d276273c2d 100644
--- a/lisp/url/url.el
+++ b/lisp/url/url.el
@@ -125,7 +125,9 @@ variable in the original buffer as a forwarding pointer.")
;;;###autoload
(defun url-retrieve (url callback &optional cbargs silent inhibit-cookies)
"Retrieve URL asynchronously and call CALLBACK with CBARGS when finished.
-URL is either a string or a parsed URL.
+URL is either a string or a parsed URL. If it is a string
+containing characters that are not valid in a URI, those
+characters are percent-encoded; see `url-encode-url'.
CALLBACK is called when the object has been completely retrieved, with
the current buffer containing the object, and any MIME headers associated
@@ -179,10 +181,8 @@ URL-encoded before it's used."
(url-do-setup)
(url-gc-dead-buffers)
(if (stringp url)
- (set-text-properties 0 (length url) nil url))
- (when (multibyte-string-p url)
- (let ((url-unreserved-chars (append '(?: ?/) url-unreserved-chars)))
- (setq url (url-hexify-string url))))
+ (set-text-properties 0 (length url) nil url))
+ (setq url (url-encode-url url))
(if (not (vectorp url))
(setq url (url-generic-parse-url url)))
(if (not (functionp callback))