summaryrefslogtreecommitdiff
path: root/chromium/net/tools/dump_cache/url_to_filename_encoder.h
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/net/tools/dump_cache/url_to_filename_encoder.h')
-rw-r--r--chromium/net/tools/dump_cache/url_to_filename_encoder.h211
1 files changed, 211 insertions, 0 deletions
diff --git a/chromium/net/tools/dump_cache/url_to_filename_encoder.h b/chromium/net/tools/dump_cache/url_to_filename_encoder.h
new file mode 100644
index 00000000000..7918418d5b2
--- /dev/null
+++ b/chromium/net/tools/dump_cache/url_to_filename_encoder.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// URL filename encoder goals:
+//
+// 1. Allow URLs with arbitrary path-segment length, generating filenames
+// with a maximum of 128 characters.
+// 2. Provide a somewhat human readable filenames, for easy debugging flow.
+// 3. Provide reverse-mapping from filenames back to URLs.
+// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
+// Those can all be different URLs.
+// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
+// with Facebook Connect.
+//
+// We need an escape-character for representing characters that are legal
+// in URL paths, but not in filenames, such as '?'.
+//
+// We can pick any legal character as an escape, as long as we escape it too.
+// But as we have a goal of having filenames that humans can correlate with
+// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
+// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
+// shell escapes or that various build tools use.
+//
+// .#&%-=_+ occur frequently in URLs.
+// <>:"/\|?* are illegal in Windows
+// See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
+// ~`!$^&(){}[]'; are special to Unix shells
+// In addition, build tools do not like ^@#%
+//
+// Josh took a quick look at the frequency of some special characters in
+// Sadeesh's slurped directory from Fall 09 and found the following occurances:
+//
+// ^ 3 build tool doesn't like ^ in testdata filenames
+// @ 10 build tool doesn't like @ in testdata filenames
+// . 1676 too frequent in URLs
+// , 76 THE WINNER
+// # 0 build tool doesn't like it
+// & 487 Prefer to avoid shell escapes
+// % 374 g4 doesn't like it
+// = 579 very frequent in URLs -- leave unmodified
+// - 464 very frequent in URLs -- leave unmodified
+// _ 798 very frequent in URLs -- leave unmodified
+//
+//
+// The escaping algorithm is:
+// 1) Escape all unfriendly symbols as ,XX where XX is the hex code.
+// 2) Add a ',' at the end (We do not allow ',' at end of any directory name,
+// so this assures that e.g. /a and /a/b can coexist in the filesystem).
+// 3) Go through the path segment by segment (where a segment is one directory
+// or leaf in the path) and
+// 3a) If the segment is empty, escape the second slash. i.e. if it was
+// www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
+// 3a) If it is "." or ".." prepend with ',' (so that we have a non-
+// empty and non-reserved filename).
+// 3b) If it is over 128 characters, break it up into smaller segments by
+// inserting ,-/ (Windows limits paths to 128 chars, other OSes also
+// have limits that would restrict us)
+//
+// For example:
+// URL File
+// / /,
+// /index.html /index.html,
+// /. /.,
+// /a/b /a/b,
+// /a/b/ /a/b/,
+// /a/b/c /a/b/c, Note: no prefix problem
+// /u?foo=bar /u,3Ffoo=bar,
+// // /,2F,
+// /./ /,./,
+// /../ /,../,
+// /, /,2C,
+// /,./ /,2C./,
+// /very...longname/ /very...long,-/name If very...long is about 126 long.
+
+// NOTE: we avoid using some classes here (like FilePath and GURL) because we
+// share this code with other projects externally.
+
+#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
+#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
+
+#include <string>
+
+#include "base/strings/string_util.h"
+#include "net/tools/dump_cache/url_utilities.h"
+
+namespace net {
+
+// Helper class for converting a URL into a filename.
+class UrlToFilenameEncoder {
+ public:
+ // Given a |url| and a |base_path|, returns a filename which represents this
+ // |url|. |url| may include URL escaping such as %21 for !
+ // |legacy_escape| indicates that this function should use the old-style
+ // of encoding.
+ // TODO(mbelshe): delete the legacy_escape code.
+ static std::string Encode(const std::string& url, std::string base_path,
+ bool legacy_escape) {
+ std::string filename;
+ if (!legacy_escape) {
+ std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
+ EncodeSegment(base_path, url_no_scheme, '/', &filename);
+#ifdef WIN32
+ ReplaceAll(&filename, "/", "\\");
+#endif
+ } else {
+ std::string clean_url(url);
+ if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
+ clean_url.append("index.html");
+
+ std::string host = UrlUtilities::GetUrlHost(clean_url);
+ filename.append(base_path);
+ filename.append(host);
+#ifdef WIN32
+ filename.append("\\");
+#else
+ filename.append("/");
+#endif
+
+ std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
+ // Strip the leading '/'.
+ if (url_filename[0] == '/')
+ url_filename = url_filename.substr(1);
+
+ // Replace '/' with '\'.
+ ConvertToSlashes(&url_filename);
+
+ // Strip double back-slashes ("\\\\").
+ StripDoubleSlashes(&url_filename);
+
+ // Save path as filesystem-safe characters.
+ url_filename = LegacyEscape(url_filename);
+ filename.append(url_filename);
+
+#ifndef WIN32
+ // Last step - convert to native slashes.
+ const std::string slash("/");
+ const std::string backslash("\\");
+ ReplaceAll(&filename, backslash, slash);
+#endif
+ }
+
+ return filename;
+ }
+
+ // Rewrite HTML in a form that the SPDY in-memory server
+ // can read.
+ // |filename_prefix| is prepended without escaping.
+ // |escaped_ending| is the URL to be encoded into a filename. It may have URL
+ // escaped characters (like %21 for !).
+ // |dir_separator| is "/" on Unix, "\" on Windows.
+ // |encoded_filename| is the resultant filename.
+ static void EncodeSegment(
+ const std::string& filename_prefix,
+ const std::string& escaped_ending,
+ char dir_separator,
+ std::string* encoded_filename);
+
+ // Decodes a filename that was encoded with EncodeSegment,
+ // yielding back the original URL.
+ static bool Decode(const std::string& encoded_filename,
+ char dir_separator,
+ std::string* decoded_url);
+
+ static const char kEscapeChar;
+ static const char kTruncationChar;
+ static const size_t kMaximumSubdirectoryLength;
+
+ friend class UrlToFilenameEncoderTest;
+
+ private:
+ // Appends a segment of the path, special-casing "." and "..", and
+ // ensuring that the segment does not exceed the path length. If it does,
+ // it chops the end off the segment, writes the segment with a separator of
+ // ",-/", and then rewrites segment to contain just the truncated piece so
+ // it can be used in the next iteration.
+ // |segment| is a read/write parameter containing segment to write
+ // Note: this should not be called with empty segment.
+ static void AppendSegment(std::string* segment, std::string* dest);
+
+ // Allow reading of old slurped files.
+ static std::string LegacyEscape(const std::string& path);
+
+ // Replace all instances of |from| within |str| as |to|.
+ static void ReplaceAll(std::string* str, const std::string& from,
+ const std::string& to) {
+ std::string::size_type pos(0);
+ while ((pos = str->find(from, pos)) != std::string::npos) {
+ str->replace(pos, from.size(), to);
+ pos += from.size();
+ }
+ }
+
+ // Replace all instances of "/" with "\" in |path|.
+ static void ConvertToSlashes(std::string* path) {
+ const std::string slash("/");
+ const std::string backslash("\\");
+ ReplaceAll(path, slash, backslash);
+ }
+
+ // Replace all instances of "\\" with "%5C%5C" in |path|.
+ static void StripDoubleSlashes(std::string* path) {
+ const std::string doubleslash("\\\\");
+ const std::string escaped_doubleslash("%5C%5C");
+ ReplaceAll(path, doubleslash, escaped_doubleslash);
+ }
+};
+
+} // namespace net
+
+#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_