diff options
author | Nick Vatamaniuc <vatamane@apache.org> | 2017-07-18 02:16:33 -0400 |
---|---|---|
committer | Nick Vatamaniuc <nickva@users.noreply.github.com> | 2017-07-31 19:42:55 -0400 |
commit | f8cb6f97c3a2d90449065c75dbc5405b34a61d18 (patch) | |
tree | 898d7ead83c736c16a58e8c85d233a22c238689f | |
parent | 1022c2507631cc642693a6efc615c43bc4c1617f (diff) | |
download | couchdb-f8cb6f97c3a2d90449065c75dbc5405b34a61d18.tar.gz |
Make replication ID generation more robust.
Replications checkpoint to _local documents identified by replication ids. If
replication ids change, replication tasks will not be able to find their
previous checkpoints and will rewind their change feeds back to 0. For a large
database that could mean reprocessing millions of documents.
The current version of the replication id generation algorithm hashes the full
URL of the source and target, their headers (including authorization headers),
and a few other things. This means that when a user changes their password and
updates their replication document, replication ids will change and all the
checkpoints will be invalidated.
Also, it is fairly common to upgrade services from http:// to https://.
Replication endpoint URIs then typically just change their scheme part
accordingly. However, the scheme is part of the replication ID calculation, so
replication ids would then change as well.
Introduce a more robust replication id generation algorithm which can handle
some of those issues. The new algorithm:
1. Excludes the source and target URI scheme from the replication id calculation.
As long as the host and other parts stay the same, changing the scheme will
have no effect on the replication id.
2. Ignores inline (specified in the URL) basic authentication passwords.
3. Ignores basic authentication password even if provided in the
basic authorization headers.
4. Is insensitive to switching between providing basic authentication
credentials inline or in a headers section. However, it includes the username
used in the basic auth in the calculation. It is a plausible scenario that
http://user1:pass1@a.host.com is really a different database than
http://user2:pass2@a.host.com
Issue #688
-rw-r--r-- | src/couch_replicator/src/couch_replicator.hrl | 2 | ||||
-rw-r--r-- | src/couch_replicator/src/couch_replicator_ids.erl | 175 |
2 files changed, 176 insertions, 1 deletions
diff --git a/src/couch_replicator/src/couch_replicator.hrl b/src/couch_replicator/src/couch_replicator.hrl index ba9a6060f..d46c34720 100644 --- a/src/couch_replicator/src/couch_replicator.hrl +++ b/src/couch_replicator/src/couch_replicator.hrl @@ -10,7 +10,7 @@ % License for the specific language governing permissions and limitations under % the License. --define(REP_ID_VERSION, 3). +-define(REP_ID_VERSION, 4). -record(rep, { id :: rep_id() | '_' | 'undefined', diff --git a/src/couch_replicator/src/couch_replicator_ids.erl b/src/couch_replicator/src/couch_replicator_ids.erl index cbfe82afb..62cfdf267 100644 --- a/src/couch_replicator/src/couch_replicator_ids.erl +++ b/src/couch_replicator/src/couch_replicator_ids.erl @@ -18,6 +18,8 @@ convert/1 ]). +-include_lib("ibrowse/include/ibrowse.hrl"). + -include_lib("couch/include/couch_db.hrl"). -include("couch_replicator_api_wrap.hrl"). -include("couch_replicator.hrl"). @@ -37,6 +39,12 @@ replication_id(#rep{options = Options} = Rep) -> % If a change is made to how replications are identified, % please add a new clause and increase ?REP_ID_VERSION. +replication_id(#rep{user_ctx = UserCtx} = Rep, 4) -> + UUID = couch_server:get_uuid(), + SrcInfo = get_v4_endpoint(UserCtx, Rep#rep.source), + TgtInfo = get_v4_endpoint(UserCtx, Rep#rep.target), + maybe_append_filters([UUID, SrcInfo, TgtInfo], Rep); + replication_id(#rep{user_ctx = UserCtx} = Rep, 3) -> UUID = couch_server:get_uuid(), Src = get_rep_endpoint(UserCtx, Rep#rep.source), @@ -125,3 +133,170 @@ get_rep_endpoint(_UserCtx, #httpdb{url=Url, headers=Headers, oauth=OAuth}) -> end; get_rep_endpoint(UserCtx, <<DbName/binary>>) -> {local, DbName, UserCtx}. 


% Build the term that represents one replication endpoint in a version 4
% replication id.
%
% For a remote endpoint (#httpdb{}) the result is
%   {remote, User, Host, NonDefaultPort, Path, HeadersWithoutBasicAuth, OAuth}
% The URL scheme and any basic auth password are left out. The user name is
% taken from the URL when present there, otherwise from a basic auth header.
%
% For a local endpoint (a binary database name) the result is
%   {local, DbName, UserCtx}
get_v4_endpoint(UserCtx, #httpdb{} = HttpDb) ->
    % get_rep_endpoint/2 returns a 3-tuple or a 4-tuple (with an OAuth
    % term); normalize both shapes, using undefined when OAuth is absent.
    {Url, Headers, OAuth} = case get_rep_endpoint(UserCtx, HttpDb) of
        {remote, U, Hds} ->
            {U, Hds, undefined};
        {remote, U, Hds, OA} ->
            {U, Hds, OA}
    end,
    {UserFromHeaders, HeadersWithoutBasicAuth} = remove_basic_auth(Headers),
    {UserFromUrl, Host, NonDefaultPort, Path} = get_v4_url_info(Url),
    % A user name given inline in the URL wins over one given in headers
    User = pick_defined_value([UserFromUrl, UserFromHeaders]),
    {remote, User, Host, NonDefaultPort, Path, HeadersWithoutBasicAuth, OAuth};
get_v4_endpoint(UserCtx, <<DbName/binary>>) ->
    {local, DbName, UserCtx}.


% Split basic auth headers away from the rest of Headers.
%
% Returns {User, HeadersWithoutBasicAuth}. User is decoded from the first
% basic auth header found, or is undefined if there is none or if it cannot
% be decoded. Every basic auth header is removed from the returned list.
remove_basic_auth(Headers) ->
    case lists:partition(fun is_basic_auth/1, Headers) of
        {[], HeadersWithoutBasicAuth} ->
            {undefined, HeadersWithoutBasicAuth};
        {[{_, "Basic " ++ Base64} | _], HeadersWithoutBasicAuth} ->
            User = get_basic_auth_user(Base64),
            {User, HeadersWithoutBasicAuth}
    end.


% True for an Authorization header whose value starts with "Basic ".
%
% HTTP header field names are case-insensitive, so the name is compared in
% lower case. Otherwise a header spelled "authorization" would keep its
% password in the replication id.
is_basic_auth({Name, "Basic " ++ _Base64}) when is_list(Name) ->
    string:to_lower(Name) =:= "authorization";
is_basic_auth(_) ->
    false.


% Decode the user name from the base64 part of a basic auth header value.
%
% Returns the user name as a string, or undefined when the value cannot be
% decoded or does not contain a ":" separator.
get_basic_auth_user(Base64) ->
    try re:split(base64:decode(Base64), ":", [{return, list}, {parts, 2}]) of
        [User, _Pass] ->
            User;
        _ ->
            undefined
    catch
        % Tolerate invalid B64 values here to avoid crashing replicator.
        % base64:decode/1 can fail with several different error reasons
        % depending on the input, so catch the whole error class rather than
        % function_clause only. Only the decode/split expression is covered.
        error:_ ->
            undefined
    end.


% Return the first element of Values that is not undefined, or undefined if
% there is no such element.
pick_defined_value(Values) ->
    case [V || V <- Values, V =/= undefined] of
        [] ->
            undefined;
        [First | _] ->
            First
    end.


% Extract {User, Host, NonDefaultPort, Path} from an endpoint URL given as a
% binary or a string.
%
% If the URL cannot be parsed the whole URL is returned in the Host position
% and the other elements are undefined.
get_v4_url_info(Url) when is_binary(Url) ->
    get_v4_url_info(binary_to_list(Url));
get_v4_url_info(Url) ->
    case ibrowse_lib:parse_url(Url) of
        {error, invalid_uri} ->
            % Tolerate errors here to avoid a bad user document
            % crashing the replicator
            {undefined, Url, undefined, undefined};
        #url{
            protocol = Schema,
            username = User,
            host = Host,
            port = Port,
            path = Path
        } ->
            NonDefaultPort = get_non_default_port(Schema, Port),
            {User, Host, NonDefaultPort, Path}
    end.


% Map the ports treated as defaults (443 for https, 80 and 5984 for http) to
% the atom default. Any other scheme/port combination returns the port
% unchanged.
get_non_default_port(https, 443) ->
    default;
get_non_default_port(http, 80) ->
    default;
get_non_default_port(http, 5984) ->
    default;
get_non_default_port(_Schema, Port) ->
    Port.


-ifdef(TEST).

-include_lib("eunit/include/eunit.hrl").

% Table-driven checks of get_v4_endpoint/2 for remote endpoints. Each entry
% is {Expected, Input} where
%   Expected = {User, Host, Port, Path, HeadersNoAuth}
%   Input    = {Url, Headers}
%
% Every Expected tuple must have exactly five elements: entries that do not
% match the generator pattern are silently dropped by the list comprehension
% and would never run.
http_v4_endpoint_test_() ->
    % User, Host, Port, Path and HeadersNoAuth are already bound by the
    % generator, so the ?_assertMatch pattern compares against their values.
    [?_assertMatch({remote, User, Host, Port, Path, HeadersNoAuth, undefined},
        get_v4_endpoint(nil, #httpdb{url = Url, headers = Headers})) ||
            {{User, Host, Port, Path, HeadersNoAuth}, {Url, Headers}} <- [
                {
                    {undefined, "host", default, "/", []},
                    {"http://host", []}
                },
                {
                    {undefined, "host", default, "/", []},
                    {"https://host", []}
                },
                {
                    {undefined, "host", default, "/", []},
                    {"http://host:5984", []}
                },
                {
                    {undefined, "host", 1, "/", []},
                    {"http://host:1", []}
                },
                {
                    {undefined, "host", 2, "/", []},
                    {"https://host:2", []}
                },
                {
                    {undefined, "host", default, "/", [{"h","v"}]},
                    {"http://host", [{"h","v"}]}
                },
                {
                    {undefined, "host", default, "/a/b", []},
                    {"http://host/a/b", []}
                },
                {
                    {"user", "host", default, "/", []},
                    {"http://user:pass@host", []}
                },
                {
                    {"user", "host", 3, "/", []},
                    {"http://user:pass@host:3", []}
                },
                {
                    {"user", "host", default, "/", []},
                    {"http://user:newpass@host", []}
                },
                {
                    {"user", "host", default, "/", []},
                    {"http://host", [basic_auth("user","pass")]}
                },
                {
                    {"user", "host", default, "/", []},
                    {"http://host", [basic_auth("user","newpass")]}
                },
                {
                    {"user1", "host", default, "/", []},
                    {"http://user1:pass1@host", [basic_auth("user2","pass2")]}
                },
                {
                    {"user", "host", default, "/", [{"h", "v"}]},
                    {"http://host", [{"h", "v"}, basic_auth("user","pass")]}
                },
                {
                    % Unparsable URL: the whole URL is returned as the host.
                    % The trailing [] (headers) is required so that this
                    % entry matches the generator pattern and actually runs.
                    {undefined, "random_junk", undefined, undefined, []},
                    {"random_junk", []}
                },
                {
                    {undefined, "host", default, "/", []},
                    {"http://host", [{"Authorization", "Basic bad"}]}
                }
            ]
    ].


% Build an {"Authorization", "Basic ..."} header for the given credentials.
basic_auth(User, Pass) ->
    B64Auth = base64:encode_to_string(User ++ ":" ++ Pass),
    {"Authorization", "Basic " ++ B64Auth}.


-endif.