summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Adam Kocoloski <adam@cloudant.com>, 2015-07-17 19:20:36 -0400
committer: Adam Kocoloski <adam@cloudant.com>, 2015-07-18 11:44:53 -0400
commit: 50ada24b120c274b7d959ffe8e780055816b662e (patch)
tree: e87abd977cb97918d1089083488a87fdb0925c86
parent: 5b1b3e155dd2909db75bed799f40f97c29410b19 (diff)
download: couchdb-2735-duplicate-docs.tar.gz
Ensure doc groups are sorted before merging them [branch: 2735-duplicate-docs]
We had been implicitly assuming that clients send us sorted groups, but unsurprisingly that's not always the case. The additional sorting here should be redundant, but the consequences of merging unsorted groups are severe -- we can end up with uniqueness violations on the primary key in the database -- and so we add an additional sort here. COUCHDB-2735
-rw-r--r-- src/couchdb/couch_db_updater.erl 14
1 files changed, 11 insertions, 3 deletions
diff --git a/src/couchdb/couch_db_updater.erl b/src/couchdb/couch_db_updater.erl
index 947669cb1..c92097f62 100644
--- a/src/couchdb/couch_db_updater.erl
+++ b/src/couchdb/couch_db_updater.erl
@@ -222,7 +222,7 @@ handle_cast(Msg, #db{name = Name} = Db) ->
handle_info({update_docs, Client, GroupedDocs, NonRepDocs, MergeConflicts,
FullCommit}, Db) ->
- GroupedDocs2 = [[{Client, D} || D <- DocGroup] || DocGroup <- GroupedDocs],
+ GroupedDocs2 = sort_and_tag_groups(Client, GroupedDocs),
if NonRepDocs == [] ->
{GroupedDocs3, Clients, FullCommit2} = collect_updates(GroupedDocs2,
[Client], MergeConflicts, FullCommit);
@@ -291,8 +291,7 @@ collect_updates(GroupedDocsAcc, ClientsAcc, MergeConflicts, FullCommit) ->
% updaters than deal with their possible conflicts, and local docs
% writes are relatively rare. Can be optmized later if really needed.
{update_docs, Client, GroupedDocs, [], MergeConflicts, FullCommit2} ->
- GroupedDocs2 = [[{Client, Doc} || Doc <- DocGroup]
- || DocGroup <- GroupedDocs],
+ GroupedDocs2 = sort_and_tag_groups(Client, GroupedDocs),
GroupedDocsAcc2 =
merge_updates(GroupedDocsAcc, GroupedDocs2, []),
collect_updates(GroupedDocsAcc2, [Client | ClientsAcc],
@@ -302,6 +301,15 @@ collect_updates(GroupedDocsAcc, ClientsAcc, MergeConflicts, FullCommit) ->
end.
+sort_and_tag_groups(Client, GroupedDocs) ->
+    % Groups should arrive sorted by doc id, but clients sometimes misbehave.
+    % merge_updates fails on unsorted input and the database can end up with
+    % duplicate documents, so re-sort here as a sanity check (COUCHDB-2735).
+    % Cmp uses =< because lists:sort/2 expects a "less than or equal" fun.
+    Cmp = fun([{#doc{id=A}, _}|_], [{#doc{id=B}, _}|_]) -> A =< B end,
+    SortedGroups = lists:sort(Cmp, GroupedDocs),
+    [[{Client, D} || D <- DocGroup] || DocGroup <- SortedGroups].
+
btree_by_seq_split(#doc_info{id=Id, high_seq=KeySeq, revs=Revs}) ->
{RevInfos, DeletedRevInfos} = lists:foldl(
fun(#rev_info{deleted = false, seq = Seq} = Ri, {Acc, AccDel}) ->