summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@apache.org>2017-07-18 02:16:33 -0400
committerNick Vatamaniuc <nickva@users.noreply.github.com>2017-07-31 19:42:55 -0400
commitf8cb6f97c3a2d90449065c75dbc5405b34a61d18 (patch)
tree898d7ead83c736c16a58e8c85d233a22c238689f
parent1022c2507631cc642693a6efc615c43bc4c1617f (diff)
downloadcouchdb-f8cb6f97c3a2d90449065c75dbc5405b34a61d18.tar.gz
Make replication ID generation more robust.
Replications checkpoint to _local documents identified by replication ids. If replication ids change replication tasks will not be able to find their previous checkpoints and will rewind their change feeds back to 0. For a large database that could mean reprocessing millions of documents. Current version of replication id generation algorithm hashes the full url of the source, target, their headers, including authorization ones as well, and a few other things. This means when user changes their password and updates their replication document, replication ids will change and all the checkpoint will be invalidated. Also, it is fairly common to upgrade services from http:// to https://. Replication endpoint URIs then typically just change their schema part accordingly. However, schema is part of the replication ID calculation, so replication ids would then change as well. Introduce a more robust replication id generation algorithm which can handle some of those issues. The new algorithm: 1. Excludes source and target URI schema from the replication id calculation. As long as the host and other parts stay the same changing the schema will have no effect on the replication id. 2. Ignores inline (specified in the URL) basic authentication passwords. 3. Ignores basic authentication password even if provided in the basic authorization headers. 4. Is insensitive to switching between providing basic authentication credentials inline or in a headers section. However it includes the username used in the basic auth in the calculation. It is plausible scenario that http://user1:pass1@a.host.com is really a different database than http://user2:pass@@a.host.com Issue #688
-rw-r--r--src/couch_replicator/src/couch_replicator.hrl2
-rw-r--r--src/couch_replicator/src/couch_replicator_ids.erl175
2 files changed, 176 insertions, 1 deletions
diff --git a/src/couch_replicator/src/couch_replicator.hrl b/src/couch_replicator/src/couch_replicator.hrl
index ba9a6060f..d46c34720 100644
--- a/src/couch_replicator/src/couch_replicator.hrl
+++ b/src/couch_replicator/src/couch_replicator.hrl
@@ -10,7 +10,7 @@
% License for the specific language governing permissions and limitations under
% the License.
--define(REP_ID_VERSION, 3).
+-define(REP_ID_VERSION, 4).
-record(rep, {
id :: rep_id() | '_' | 'undefined',
diff --git a/src/couch_replicator/src/couch_replicator_ids.erl b/src/couch_replicator/src/couch_replicator_ids.erl
index cbfe82afb..62cfdf267 100644
--- a/src/couch_replicator/src/couch_replicator_ids.erl
+++ b/src/couch_replicator/src/couch_replicator_ids.erl
@@ -18,6 +18,8 @@
convert/1
]).
+-include_lib("ibrowse/include/ibrowse.hrl").
+
-include_lib("couch/include/couch_db.hrl").
-include("couch_replicator_api_wrap.hrl").
-include("couch_replicator.hrl").
@@ -37,6 +39,12 @@ replication_id(#rep{options = Options} = Rep) ->
% If a change is made to how replications are identified,
% please add a new clause and increase ?REP_ID_VERSION.
+replication_id(#rep{user_ctx = UserCtx} = Rep, 4) ->
+ UUID = couch_server:get_uuid(),
+ SrcInfo = get_v4_endpoint(UserCtx, Rep#rep.source),
+ TgtInfo = get_v4_endpoint(UserCtx, Rep#rep.target),
+ maybe_append_filters([UUID, SrcInfo, TgtInfo], Rep);
+
replication_id(#rep{user_ctx = UserCtx} = Rep, 3) ->
UUID = couch_server:get_uuid(),
Src = get_rep_endpoint(UserCtx, Rep#rep.source),
@@ -125,3 +133,170 @@ get_rep_endpoint(_UserCtx, #httpdb{url=Url, headers=Headers, oauth=OAuth}) ->
end;
get_rep_endpoint(UserCtx, <<DbName/binary>>) ->
{local, DbName, UserCtx}.
+
+
+get_v4_endpoint(UserCtx, #httpdb{} = HttpDb) ->
+ {Url, Headers, OAuth} = case get_rep_endpoint(UserCtx, HttpDb) of
+ {remote, U, Hds} ->
+ {U, Hds, undefined};
+ {remote, U, Hds, OA} ->
+ {U, Hds, OA}
+ end,
+ {UserFromHeaders, HeadersWithoutBasicAuth} = remove_basic_auth(Headers),
+ {UserFromUrl, Host, NonDefaultPort, Path} = get_v4_url_info(Url),
+ User = pick_defined_value([UserFromUrl, UserFromHeaders]),
+ {remote, User, Host, NonDefaultPort, Path, HeadersWithoutBasicAuth, OAuth};
+get_v4_endpoint(UserCtx, <<DbName/binary>>) ->
+ {local, DbName, UserCtx}.
+
+
+remove_basic_auth(Headers) ->
+ case lists:partition(fun is_basic_auth/1, Headers) of
+ {[], HeadersWithoutBasicAuth} ->
+ {undefined, HeadersWithoutBasicAuth};
+ {[{_, "Basic " ++ Base64} | _], HeadersWithoutBasicAuth} ->
+ User = get_basic_auth_user(Base64),
+ {User, HeadersWithoutBasicAuth}
+ end.
+
+
+is_basic_auth({"Authorization", "Basic " ++ _Base64}) ->
+ true;
+is_basic_auth(_) ->
+ false.
+
+
+get_basic_auth_user(Base64) ->
+ try re:split(base64:decode(Base64), ":", [{return, list}, {parts, 2}]) of
+ [User, _Pass] ->
+ User;
+ _ ->
+ undefined
+ catch
+ % Tolerate invalid B64 values here to avoid crashing replicator
+ error:function_clause ->
+ undefined
+ end.
+
+
+pick_defined_value(Values) ->
+ case [V || V <- Values, V /= undefined] of
+ [] ->
+ undefined;
+ DefinedValues ->
+ hd(DefinedValues)
+ end.
+
+
+get_v4_url_info(Url) when is_binary(Url) ->
+ get_v4_url_info(binary_to_list(Url));
+get_v4_url_info(Url) ->
+ case ibrowse_lib:parse_url(Url) of
+ {error, invalid_uri} ->
+ % Tolerate errors here to avoid a bad user document
+ % crashing the replicator
+ {undefined, Url, undefined, undefined};
+ #url{
+ protocol = Schema,
+ username = User,
+ host = Host,
+ port = Port,
+ path = Path
+ } ->
+ NonDefaultPort = get_non_default_port(Schema, Port),
+ {User, Host, NonDefaultPort, Path}
+ end.
+
+
+get_non_default_port(https, 443) ->
+ default;
+get_non_default_port(http, 80) ->
+ default;
+get_non_default_port(http, 5984) ->
+ default;
+get_non_default_port(_Schema, Port) ->
+ Port.
+
+
+-ifdef(TEST).
+
+-include_lib("eunit/include/eunit.hrl").
+
+http_v4_endpoint_test_() ->
+ [?_assertMatch({remote, User, Host, Port, Path, HeadersNoAuth, undefined},
+ get_v4_endpoint(nil, #httpdb{url = Url, headers = Headers})) ||
+ {{User, Host, Port, Path, HeadersNoAuth}, {Url, Headers}} <- [
+ {
+ {undefined, "host", default, "/", []},
+ {"http://host", []}
+ },
+ {
+ {undefined, "host", default, "/", []},
+ {"https://host", []}
+ },
+ {
+ {undefined, "host", default, "/", []},
+ {"http://host:5984", []}
+ },
+ {
+ {undefined, "host", 1, "/", []},
+ {"http://host:1", []}
+ },
+ {
+ {undefined, "host", 2, "/", []},
+ {"https://host:2", []}
+ },
+ {
+ {undefined, "host", default, "/", [{"h","v"}]},
+ {"http://host", [{"h","v"}]}
+ },
+ {
+ {undefined, "host", default, "/a/b", []},
+ {"http://host/a/b", []}
+ },
+ {
+ {"user", "host", default, "/", []},
+ {"http://user:pass@host", []}
+ },
+ {
+ {"user", "host", 3, "/", []},
+ {"http://user:pass@host:3", []}
+ },
+ {
+ {"user", "host", default, "/", []},
+ {"http://user:newpass@host", []}
+ },
+ {
+ {"user", "host", default, "/", []},
+ {"http://host", [basic_auth("user","pass")]}
+ },
+ {
+ {"user", "host", default, "/", []},
+ {"http://host", [basic_auth("user","newpass")]}
+ },
+ {
+ {"user1", "host", default, "/", []},
+ {"http://user1:pass1@host", [basic_auth("user2","pass2")]}
+ },
+ {
+ {"user", "host", default, "/", [{"h", "v"}]},
+ {"http://host", [{"h", "v"}, basic_auth("user","pass")]}
+ },
+ {
+ {undefined, "random_junk", undefined, undefined},
+ {"random_junk", []}
+ },
+ {
+ {undefined, "host", default, "/", []},
+ {"http://host", [{"Authorization", "Basic bad"}]}
+ }
+ ]
+ ].
+
+
+basic_auth(User, Pass) ->
+ B64Auth = base64:encode_to_string(User ++ ":" ++ Pass),
+ {"Authorization", "Basic " ++ B64Auth}.
+
+
+-endif.