author    Greg Farnum <greg@inktank.com>    2013-10-03 15:50:40 -0700
committer Greg Farnum <greg@inktank.com>    2013-10-03 15:50:40 -0700
commit    b9d4e97378aa90687e23de73493ec92b280cae6a (patch)
tree      477f6734908239453b6b0e26f146d60f9a0b0a8e
parent    d6a1799f410cad09dc27b9102911b5682ef4f346 (diff)
parent    86e96578be669578fdf8213512506e60dc09856d (diff)
download  ceph-b9d4e97378aa90687e23de73493ec92b280cae6a.tar.gz
Merge branch 'master' into wip-optracker
Conflicts:
	src/osd/OpRequest.h
	src/osd/PG.cc
	src/osd/ReplicatedPG.cc

Signed-off-by: Greg Farnum <greg@inktank.com>
-rw-r--r--  .gitignore | 8
-rw-r--r--  .mailmap | 84
-rw-r--r--  COPYING | 74
-rw-r--r--  PendingReleaseNotes | 53
-rw-r--r--  ceph.spec.in | 1
-rw-r--r--  configure.ac | 27
-rw-r--r--  debian/control | 1
-rw-r--r--  debian/copyright | 60
-rw-r--r--  doc/architecture.rst | 2
-rw-r--r--  doc/dev/mon-bootstrap.rst | 2
-rw-r--r--  doc/dev/osd_internals/erasure_coding.rst | 26
-rw-r--r--  doc/dev/osd_internals/erasure_coding/PGBackend-h.rst | 156
-rw-r--r--  doc/dev/osd_internals/erasure_coding/developer_notes.rst | 257
-rw-r--r--  doc/dev/osd_internals/erasure_coding/jerasure.rst | 22
-rw-r--r--  doc/dev/osd_internals/erasure_coding/pgbackend.rst | 42
-rw-r--r--  doc/man/8/rbd.rst | 4
-rw-r--r--  doc/rados/configuration/journal-ref.rst | 6
-rw-r--r--  doc/rados/operations/authentication.rst | 14
-rw-r--r--  doc/start/quick-start-preflight.rst | 29
-rw-r--r--  man/rbd.8 | 5
-rw-r--r--  qa/run_xfstests.sh | 3
-rwxr-xr-x  qa/workunits/cephtool/test.sh | 18
-rwxr-xr-x  qa/workunits/mon/crush_ops.sh | 9
-rwxr-xr-x  qa/workunits/mon/pool_ops.sh | 3
-rwxr-xr-x  qa/workunits/rados/test_tmap_to_omap.sh | 28
-rwxr-xr-x  qa/workunits/snaps/snap-rm-diff.sh | 1
-rwxr-xr-x  qa/workunits/snaps/snaptest-0.sh | 12
-rwxr-xr-x  qa/workunits/snaps/snaptest-1.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-2.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-authwb.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-capwb.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-dir-rename.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-double-null.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-estale.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-git-ceph.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-intodir.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-multiple-capsnaps.sh | 2
-rw-r--r--  qa/workunits/snaps/snaptest-parents.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-snap-rm-cmp.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-upchildrealms.sh | 2
-rwxr-xr-x  qa/workunits/snaps/snaptest-xattrwb.sh | 2
-rwxr-xr-x  qa/workunits/snaps/untar_snap_rm.sh | 2
-rw-r--r--  src/.gitignore | 1
-rw-r--r--  src/Makefile-env.am | 21
-rw-r--r--  src/Makefile.am | 11
-rw-r--r--  src/arch/intel.c | 7
-rwxr-xr-x  src/ceph-create-keys | 2
-rwxr-xr-x  src/ceph-disk | 30
-rwxr-xr-x  src/ceph-rest-api | 2
-rwxr-xr-x  src/ceph.in | 6
-rw-r--r--  src/ceph_osd.cc | 2
-rw-r--r--  src/cls/Makefile.am | 4
-rw-r--r--  src/common/Cond.h | 4
-rw-r--r--  src/common/Makefile.am | 11
-rw-r--r--  src/common/Mutex.h | 4
-rw-r--r--  src/common/SloppyCRCMap.cc | 180
-rw-r--r--  src/common/SloppyCRCMap.h | 78
-rw-r--r--  src/common/TrackedOp.h | 2
-rw-r--r--  src/common/WorkQueue.h | 37
-rw-r--r--  src/common/bloom_filter.cc | 76
-rw-r--r--  src/common/bloom_filter.hpp | 627
-rw-r--r--  src/common/buffer.cc | 18
-rw-r--r--  src/common/ceph_argparse.cc | 17
-rw-r--r--  src/common/ceph_json.cc | 4
-rw-r--r--  src/common/ceph_strings.cc | 2
-rw-r--r--  src/common/code_environment.cc | 24
-rw-r--r--  src/common/config_opts.h | 13
-rw-r--r--  src/common/crc32c_intel_fast.c | 1
-rw-r--r--  src/common/crc32c_intel_fast.h | 2
-rw-r--r--  src/common/hobject.cc | 87
-rw-r--r--  src/common/hobject.h | 121
-rw-r--r--  src/common/lru_map.h | 48
-rw-r--r--  src/common/safe_io.c | 80
-rw-r--r--  src/common/safe_io.h | 9
-rw-r--r--  src/common/util.cc | 1
-rw-r--r--  src/crush/CrushWrapper.cc | 1
-rw-r--r--  src/crush/CrushWrapper.h | 1
-rw-r--r--  src/include/CompatSet.h | 46
-rw-r--r--  src/include/Context.h | 20
-rw-r--r--  src/include/Makefile.am | 1
-rw-r--r--  src/include/bloom_filter.hpp | 544
-rw-r--r--  src/include/buffer.h | 17
-rw-r--r--  src/include/ceph_fs.h | 1
-rw-r--r--  src/include/crc32c.h | 3
-rw-r--r--  src/include/rados.h | 2
-rw-r--r--  src/include/rados/librados.h | 6
-rw-r--r--  src/include/rados/librados.hpp | 20
-rw-r--r--  src/include/types.h | 8
-rw-r--r--  src/init-ceph.in | 24
-rw-r--r--  src/java/Makefile.am | 3
-rw-r--r--  src/java/test/com/ceph/fs/CephAllTests.java | 13
-rw-r--r--  src/librados/RadosClient.cc | 2
-rw-r--r--  src/librados/librados.cc | 16
-rw-r--r--  src/mds/CDentry.cc | 12
-rw-r--r--  src/mds/CDentry.h | 4
-rw-r--r--  src/mds/CDir.cc | 17
-rw-r--r--  src/mds/CDir.h | 1
-rw-r--r--  src/mds/CInode.cc | 7
-rw-r--r--  src/mds/CInode.h | 3
-rw-r--r--  src/mds/Locker.cc | 5
-rw-r--r--  src/mds/LogEvent.cc | 14
-rw-r--r--  src/mds/MDCache.cc | 256
-rw-r--r--  src/mds/MDCache.h | 31
-rw-r--r--  src/mds/MDLog.cc | 8
-rw-r--r--  src/mds/MDS.cc | 4
-rw-r--r--  src/mds/MDSMap.cc | 11
-rw-r--r--  src/mds/MDSMap.h | 15
-rw-r--r--  src/mds/Server.cc | 17
-rw-r--r--  src/mds/mdstypes.h | 8
-rw-r--r--  src/mon/MDSMonitor.cc | 30
-rw-r--r--  src/mon/MonCommands.h | 27
-rw-r--r--  src/mon/Monitor.cc | 46
-rw-r--r--  src/mon/Monitor.h | 12
-rw-r--r--  src/mon/OSDMonitor.cc | 47
-rw-r--r--  src/mon/PGMonitor.cc | 48
-rw-r--r--  src/msg/Pipe.cc | 13
-rw-r--r--  src/msg/Pipe.h | 11
-rw-r--r--  src/msg/msg_types.cc | 2
-rwxr-xr-x  src/objsync/boto_del.py | 2
-rw-r--r--  src/os/CollectionIndex.h | 20
-rw-r--r--  src/os/DBObjectMap.cc | 197
-rw-r--r--  src/os/DBObjectMap.h | 98
-rw-r--r--  src/os/FDCache.h | 8
-rw-r--r--  src/os/FileStore.cc | 392
-rw-r--r--  src/os/FileStore.h | 179
-rw-r--r--  src/os/FlatIndex.cc | 42
-rw-r--r--  src/os/FlatIndex.h | 14
-rw-r--r--  src/os/GenericFileStoreBackend.cc | 113
-rw-r--r--  src/os/GenericFileStoreBackend.h | 15
-rw-r--r--  src/os/HashIndex.cc | 88
-rw-r--r--  src/os/HashIndex.h | 32
-rw-r--r--  src/os/IndexManager.cc | 2
-rw-r--r--  src/os/LFNIndex.cc | 271
-rw-r--r--  src/os/LFNIndex.h | 90
-rw-r--r--  src/os/ObjectMap.h | 44
-rw-r--r--  src/os/ObjectStore.cc | 96
-rw-r--r--  src/os/ObjectStore.h | 162
-rw-r--r--  src/os/WBThrottle.cc | 22
-rw-r--r--  src/os/WBThrottle.h | 40
-rw-r--r--  src/osd/ErasureCodeInterface.h | 88
-rw-r--r--  src/osd/ErasureCodePlugin.cc | 5
-rw-r--r--  src/osd/ErasureCodePlugin.h | 1
-rw-r--r--  src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc | 210
-rw-r--r--  src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h | 10
-rw-r--r--  src/osd/Makefile.am | 3
-rw-r--r--  src/osd/OSD.cc | 288
-rw-r--r--  src/osd/OSD.h | 38
-rw-r--r--  src/osd/PG.cc | 73
-rw-r--r--  src/osd/PG.h | 15
-rw-r--r--  src/osd/PGBackend.h | 210
-rw-r--r--  src/osd/PGLog.cc | 29
-rw-r--r--  src/osd/ReplicatedBackend.cc | 196
-rw-r--r--  src/osd/ReplicatedBackend.h | 309
-rw-r--r--  src/osd/ReplicatedPG.cc | 1436
-rw-r--r--  src/osd/ReplicatedPG.h | 402
-rw-r--r--  src/osd/osd_types.cc | 44
-rw-r--r--  src/osd/osd_types.h | 65
-rw-r--r--  src/osdc/ObjectCacher.cc | 12
-rw-r--r--  src/osdc/Objecter.cc | 2
-rw-r--r--  src/osdc/Objecter.h | 47
-rw-r--r--  src/perfglue/heap_profiler.cc | 10
-rw-r--r--  src/pybind/ceph_argparse.py | 26
-rwxr-xr-x  src/pybind/ceph_rest_api.py | 1
-rw-r--r--  src/rbd.cc | 13
-rw-r--r--  src/rgw/Makefile.am | 5
-rw-r--r--  src/rgw/rgw_metadata.cc | 2
-rw-r--r--  src/rgw/rgw_rados.cc | 35
-rw-r--r--  src/rgw/rgw_rados.h | 2
-rwxr-xr-x  src/script/perf-watch.py | 2
-rw-r--r--  src/test/Makefile.am | 60
-rw-r--r--  src/test/ObjectMap/test_object_map.cc | 54
-rw-r--r--  src/test/ceph_compatset.cc | 164
-rw-r--r--  src/test/cli/radosgw-admin/help.t | 9
-rw-r--r--  src/test/cli/rbd/help.t | 1
-rw-r--r--  src/test/common/get_command_descriptions.cc | 116
-rw-r--r--  src/test/common/test_bloom_filter.cc | 222
-rw-r--r--  src/test/common/test_sloppy_crc_map.cc | 113
-rw-r--r--  src/test/common/test_util.cc | 1
-rw-r--r--  src/test/encoding/types.h | 7
-rw-r--r--  src/test/filestore/FileStoreDiff.cc | 12
-rw-r--r--  src/test/filestore/store_test.cc | 124
-rw-r--r--  src/test/filestore/workload_generator.cc | 4
-rw-r--r--  src/test/librados/misc.cc | 78
-rw-r--r--  src/test/os/TestFlatIndex.cc | 12
-rw-r--r--  src/test/os/TestLFNIndex.cc | 79
-rw-r--r--  src/test/osd/ErasureCodeExample.h | 96
-rw-r--r--  src/test/osd/ErasureCodePluginExample.cc | 4
-rw-r--r--  src/test/osd/ErasureCodePluginFailToInitialize.cc | 23
-rw-r--r--  src/test/osd/ErasureCodePluginFailToRegister.cc | 22
-rw-r--r--  src/test/osd/ErasureCodePluginHangs.cc | 24
-rw-r--r--  src/test/osd/ErasureCodePluginMissingEntryPoint.cc | 1
-rw-r--r--  src/test/osd/Object.cc | 9
-rw-r--r--  src/test/osd/RadosModel.h | 83
-rw-r--r--  src/test/osd/TestErasureCodeExample.cc | 65
-rw-r--r--  src/test/osd/TestErasureCodeJerasure.cc | 218
-rw-r--r--  src/test/osd/TestErasureCodePlugin.cc | 108
-rw-r--r--  src/test/osd/TestErasureCodePluginExample.cc | 51
-rw-r--r--  src/test/osd/TestErasureCodePluginJerasure.cc | 15
-rw-r--r--  src/test/osd/TestRados.cc | 8
-rwxr-xr-x  src/test/pybind/test_ceph_argparse.py | 1061
-rw-r--r--  src/tools/ceph-filestore-dump.cc | 135
-rw-r--r--  src/tools/ceph-osdomap-tool.cc | 10
-rw-r--r--  src/tools/dupstore.cc | 6
-rw-r--r--  src/tools/rados/rados.cc | 61
-rwxr-xr-x  src/vstart.sh | 5
205 files changed, 8646 insertions, 3807 deletions
diff --git a/.gitignore b/.gitignore
index 211c09cbba7..7e637866366 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,4 +69,10 @@ web/*.html
# dir from coverity tools
cov-int/
-/test-driver \ No newline at end of file
+/test-driver
+
+# gtags(1) generated files
+GPATH
+GRTAGS
+GSYMS
+GTAGS
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 00000000000..fc4a1eb9ce9
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,84 @@
+Sage Weil <sage@inktank.com> <sage@newdream.net>
+Sage Weil <sage@inktank.com> <sage.weil@dreamhost.com>
+Sage Weil <sage@inktank.com> <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
+Sage Weil <sage@inktank.com> <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
+Sage Weil <sage@inktank.com> <sage@ceph0.dreamhost.com>
+Sage Weil <sage@inktank.com> <sage@skinny.ops.newdream.net>
+Sage Weil <sage@inktank.com> <sage@foil.westwood.newdream.net>
+Sage Weil <sage@inktank.com> <sage@vapre.localdomain>
+Sage Weil <sage@inktank.com> <sage.weil@inktank.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@hq.newdream.net>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda.sadeh@dreamhost.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@yehuda.infit.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@yehuda>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@fatty.ops.newdream.net>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@gmail.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@ceph0.dreamhost.com>
+Colin P. McCabe <colinm@hq.newdream.net> <cmccabe@alumni.cmu.edu>
+Colin P. McCabe <colinm@hq.newdream.net> <cmccabe@fatty.ops.newdream.net>
+Greg Farnum <greg@inktank.com> <gregf@hq.newdream.net>
+Greg Farnum <greg@inktank.com> <gregory.farnum@dreamhost.com>
+Greg Farnum <greg@inktank.com> Gregory Farnum <greg@inktank.com>
+Greg Farnum <greg@inktank.com> <greg@gregs42.com>
+Greg Farnum <greg@inktank.com> <gregf@skinny.ops.newdream.net>
+Greg Farnum <greg@inktank.com> <gfarnum@GF-Macbook.local>
+Samuel Just <sam.just@inktank.com> <samuel.just@dreamhost.com>
+Samuel Just <sam.just@inktank.com> <rexludorum@gmail.com>
+Samuel Just <sam.just@inktank.com> <samuelj@hq.newdream.net>
+Samuel Just <sam.just@inktank.com> <sam.just@dreamhost.com>
+Samuel Just <sam.just@inktank.com> <sam@Pondermatic.(none)>
+John Wilkins <john.wilkins@inktank.com> <john.wilkins@dreamhost.com>
+John Wilkins <john.wilkins@inktank.com> <john@admin-host.(none)>
+John Wilkins <john.wilkins@inktank.com> <johnw@johnw7664.(none)>
+Josh Durgin <josh.durgin@inktank.com> <josh.durgin@dreamhost.com>
+Josh Durgin <josh.durgin@inktank.com> <joshd@hq.newdream.net>
+Dan Mick <dan.mick@inktank.com> <dan.mick@dreamhost.com>
+Dan Mick <dan.mick@inktank.com> <dmick@danceorelse.org>
+Tommi Virtanen <tv@inktank.com> <tommi.virtanen@dreamhost.com>
+Tommi Virtanen <tv@inktank.com> <tv@hq.newdream.net>
+Tommi Virtanen <tv@inktank.com> <tv@eagain.net>
+João Eduardo Luís <joao.luis@inktank.com> <jecluis@gmail.com>
+João Eduardo Luís <joao.luis@inktank.com> Joao Eduardo Luis <joao.luis@inktank.com>
+Sam Lang <sam.lang@inktank.com> <samlang@gmail.com>
+Noah Watkins <noahwatkins@gmail.com> <jayhawk@cs.ucsc.edu>
+Gary Lowell <gary.lowell@inktank.com> <glowell@flab.ops.newdream.net>
+Gary Lowell <gary.lowell@inktank.com> <glowell@inktank.com>
+Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
+Wido den Hollander <wido@42on.com> <wido@widodh.nl>
+Michael Rodriguez <michael@newdream.net> <michael@squid.newdream.net>
+Michael Rodriguez <michael@newdream.net> <michael@newdream.net>
+Caleb Miles <caleb.miles@inktank.com> caleb miles <caselim@gmail.com>
+Caleb Miles <caleb.miles@inktank.com> caleb miles <caleb.miles@inktank.com>
+Caleb Miles <caleb.miles@inktank.com> Caleb Miles <caselim@gmail.com>
+Joe Buck <jbbuck@gmail.com> <buck@soe.ucsc.edu>
+Laszlo Boszormenyi <gcs@debian.hu> Laszlo Boszormenyi (GCS) <gcs@debian.hu>
+Roald J. van Loon <roaldvanloon@gmail.com> Roald van Loon <roaldvanloon@gmail.com>
+Alex Elder <elder@inktank.com> <elder@dreamhost.com>
+Alex Elder <elder@inktank.com> <elder@doink.(none)>
+Alex Elder <elder@inktank.com> <elder@speedy.(none)>
+Alexandre Marangone <alexandre.marangone@inktank.com> <a.marangone@gmail.com>
+Alexandre Oliva <oliva@gnu.org> <oliva@lsd.ic.unicamp.br>
+Alexandre Oliva <oliva@gnu.org> <lxoliva@fsfla.org>
+Ross Turk <ross.turk@inktank.com> <ross@inktank.com>
+Ross Turk <ross.turk@inktank.com> <ross.turk@dreamhost.com>
+Patrick McGarry <patrick@inktank.com> <pmcgarry@gmail.com>
+Patrick McGarry <patrick@inktank.com> scuttlemonkey <patrick@inktank.com>
+Mark Nelson <mark.nelson@inktank.com> <mark.a.nelson@gmail.com>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> <tamil@ubuntu.(none)>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> tamil <tamil.muthamizhan@inktank.com>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> <tamil@tamil-VirtualBox.(none)>
+Christian Brunner <christian@brunner-muc.de> <chb@muc.de>
+Henry C Chang <henry_c_chang@tcloudcomputing.com> <henry.cy.chang@gmail.com>
+Alfredo Deza <alfredo.deza@inktank.com> <alfredo@deza.pe>
+Sylvain Munaut <s.munaut@whatever-company.com> <tnt@246tNt.com>
+Erwin, Brock A <Brock.Erwin@pnl.gov> <Brock.Erwin@pnl.govgit>
+Kacper Kowalik <xarthisius@gentoo.org> Kacper Kowalik (Xarthisius) <xarthisius@gentoo.org>
+Neil Levine <neil.levine@inktank.com> <levine@yoyo.org>
+Guilhem Lettron <guilhem@lettron.fr> <guilhem+github@lettron.fr>
+Holger Macht <hmacht@suse.de> <holger@homac.de>
+Volker Assmann <volker@twisted-nerve.de> <volker@stan.local>
+Volker Assmann <volker@twisted-nerve.de> <volker@36-135.mops.RWTH-Aachen.DE>
+Sebastien Han <sebastien.han@enovance.com> <sebastien.han@enovance.com>
+Matthew Roy <matthew@royhousehold.net> <matthew@matthew-ubuntu.(none)>
+Matthew Roy <matthew@royhousehold.net> <mroy@sandbox-ed.com>
+Matthew Wodrich <matthew.wodrich@dreamhost.com> <mattheww@Mattsbox.(none)>
diff --git a/COPYING b/COPYING
index 920b049b7fa..a0034d58c3b 100644
--- a/COPYING
+++ b/COPYING
@@ -1,3 +1,8 @@
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
+Name: ceph
+Maintainer: Sage Weil <sage@newdream.net>
+Source: http://ceph.com/
+
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
License: LGPL2.1 (see COPYING-LGPL2.1)
@@ -18,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow <arash@partow.net>
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -94,33 +103,38 @@ Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
License: LGPL2 or later
Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
-Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
-
- - Neither the name of the University of Tennessee nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
-OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
-AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
-WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+Packaging:
+ Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
+ Copyright (C) 2010 Canonical, Ltd.
+ Licensed under LGPL-2.1
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index e7fcd7201bb..9a751ffdb49 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,2 +1,53 @@
-v0.69
+v0.70
~~~~~
+
+* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
+  don't drop a reference to the completion object on error; the caller needs
+  to take care of that. This has never really worked correctly, and we were
+  leaking an object.
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+ specified location, as that's a job for 'ceph osd crush add'. It will
+ however continue to work just the same as long as the osd already exists
+ in the crush map.
+
+* The OSD now enforces that class write methods cannot both mutate an
+ object and return data. The rbd.assign_bid method, the lone
+ offender, has been removed. This breaks compatibility with
+ pre-bobtail librbd clients by preventing them from creating new
+ images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+ This is a bit safer in the case where both OSDs and the client crash, and
+ is probably how it should have been acting from the beginning. Users are
+ unlikely to notice but it could result in lower performance in some
+ circumstances. Those who care should switch to using the async interfaces,
+ which let you specify safety semantics precisely.
+
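A minimal sketch of the async pattern referred to above, assuming the
librados C++ API of this release (the object name and error handling are
illustrative)::

    #include <rados/librados.hpp>

    void write_and_wait_for_commit(librados::IoCtx &io, librados::bufferlist &bl)
    {
      librados::AioCompletion *c = librados::Rados::aio_create_completion();
      io.aio_write("myobject", c, bl, bl.length(), 0);
      c->wait_for_safe();       // block until the write is on disk (commit)
      // c->wait_for_complete() would instead return on ack
      c->release();             // the caller owns the completion
    }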
+* The C++ librados AioCompletion::get_version() method was incorrectly
+ returning an int (usually 32-bits). To avoid breaking library
+ compatibility, a get_version64() method is added that returns the
+ full-width value. The old method is deprecated and will be removed
+ in a future release. Users of the C++ librados API that make use of
+ the get_version() method should modify their code to avoid getting a
+  value that is truncated from 64 to 32 bits.
+
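A hedged sketch of the suggested migration, using the method names
described in the note above::

    #include <cstdint>
    #include <rados/librados.hpp>

    void on_op_done(librados::AioCompletion *c)
    {
      // old, truncating: int v = c->get_version();
      uint64_t v = c->get_version64();   // full 64-bit object version
      (void)v;
    }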
+v0.71
+~~~~~
+
+* The MDS now disallows snapshots by default as they are not
+ considered stable. The command 'ceph mds set allow_snaps' will
+ enable them.
+
+* For clusters that were created before v0.44 (pre-argonaut, Spring
+ 2012) and store radosgw data, the auto-upgrade from TMAP to OMAP
+ objects has been disabled. Before upgrading, make sure that any
+ buckets created on pre-argonaut releases have been modified (e.g.,
+ by PUTing and then DELETEing an object from each bucket). Any
+ cluster created with argonaut (v0.48) or a later release or not
+ using radosgw never relied on the automatic conversion and is not
+ affected by this change.
+
+* Any direct users of the 'tmap' portion of the librados API should be
+ aware that the automatic tmap -> omap conversion functionality has
+ been removed.
diff --git a/ceph.spec.in b/ceph.spec.in
index 851ee7acfd5..a60d87ad814 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -37,6 +37,7 @@ BuildRequires: perl
BuildRequires: gdbm
BuildRequires: pkgconfig
BuildRequires: python
+BuildRequires: python-nose
BuildRequires: libaio-devel
BuildRequires: libcurl-devel
BuildRequires: libxml2-devel
diff --git a/configure.ac b/configure.ac
index 1478adfce79..eeecdbeffc8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -333,14 +333,18 @@ if test "x$enable_cephfs_java" = "xyes"; then
# setup defaults for Debian default-jdk package (without --with-jdk-dir)
AS_IF([test -z "$with_jdk_dir"], [
- # This works with Debian's default-jdk package
- dir='/usr/lib/jvm/default-java/'
- javac_prog=`find $dir -name javac | head -n 1`
- AS_IF([test -x "$javac_prog"], [
- EXTRA_JDK_BIN_DIR=`dirname $javac_prog`])
- jnih=`find $dir -name jni.h | head -n 1`
- AS_IF([test -r "$jnih"], [
- EXTRA_JDK_INC_DIR=`dirname $jnih`])])
+ # This works with Debian's and CentOS' default-jdk package
+ for dir in '/usr/lib/jvm/default-java/' '/usr/lib/jvm/java/' ; do
+ # only test if a suitable path has not yet been found
+ AS_IF([test "$EXTRA_JDK_BIN_DIR" == ""], [
+      javac_prog=`find $dir -name javac | head -n 1`
+      AS_IF([test -x "$javac_prog"], [
+ EXTRA_JDK_BIN_DIR=`dirname $javac_prog`])
+ jnih=`find $dir -name jni.h | head -n 1`
+ AS_IF([test -r "$jnih"], [
+ EXTRA_JDK_INC_DIR=`dirname $jnih`])
+ ])
+ done
+ ])
# cephfs_java_test only makes sense if java is already turned on
# setup CLASSPATH for Debian default junit4.jar package
@@ -368,10 +372,6 @@ if test "x$enable_cephfs_java" = "xyes"; then
CLASSPATH=$CLASSPATH:$EXTRA_CLASSPATH_JAR
export CLASSPATH
AC_MSG_NOTICE([classpath - $CLASSPATH])
- AS_IF([test "$have_junit4" = "1"], [
- AC_CHECK_CLASS([org.junit.rules.ExternalResource], [], [
- AC_MSG_NOTICE(Could not find org.junit.rules.ExternalResource)
- have_junit4=0])])
# Check for jni.h
CPPFLAGS_save=$CPPFLAGS
@@ -536,6 +536,9 @@ AC_CHECK_FUNC([fallocate],
[])
+AC_CHECK_HEADERS([sys/prctl.h])
+AC_CHECK_FUNCS([prctl])
+
# Checks for typedefs, structures, and compiler characteristics.
#AC_HEADER_STDBOOL
#AC_C_CONST
diff --git a/debian/control b/debian/control
index 44ee725efd4..1aec592c9f8 100644
--- a/debian/control
+++ b/debian/control
@@ -34,6 +34,7 @@ Build-Depends: autoconf,
libxml2-dev,
pkg-config,
python (>= 2.6.6-3~),
+ python-nose,
uuid-dev,
yasm
Standards-Version: 3.9.3
diff --git a/debian/copyright b/debian/copyright
index d11a0f7f5da..d3906c44d35 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,11 +1,15 @@
-Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
Name: ceph
Maintainer: Sage Weil <sage@newdream.net>
Source: http://ceph.com/
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
-License: LGPL2.1 (see /usr/share/common-licenses/LGPL-2.1)
+License: LGPL2.1 (see COPYING-LGPL2.1)
+
+Files: doc/*
+Copyright: (c) 2010-2012 New Dream Network and contributors
+License: Creative Commons Attribution-ShareAlike (CC BY-SA)
Files: src/mount/canonicalize.c
Copyright: Copyright (C) 1993 Rick Sladkey <jrs@world.std.com>
@@ -19,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -28,25 +36,25 @@ Copyright:
Copyright 2012-2013 Intel Corporation All Rights Reserved.
License: BSD 3-clause
-Files: src/common/sctp_crc32.c:
+Files: src/common/sctp_crc32.c:
Copyright:
Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
License:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
-
+
a) Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
+
b) Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the distribution.
-
+
c) Neither the name of Cisco Systems, Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
-
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -88,6 +96,44 @@ License:
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
+
+
+Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc
+Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+License: LGPL2 or later
+
+Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
Packaging:
Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
Copyright (C) 2010 Canonical, Ltd.
diff --git a/doc/architecture.rst b/doc/architecture.rst
index 9f57bbbd58a..988475f53b6 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -387,7 +387,7 @@ steps to compute PG IDs.
#. CRUSH calculates the hash modulo the number of PGs (e.g., ``0x58``) to get
a PG ID.
#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``)
-#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``).
+#. CRUSH prepends the pool ID to the PG ID (e.g., ``4.0x58``).
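A hedged sketch of the computation these steps describe; the object-name
hash is assumed to be computed already, and the textual form of the id is
illustrative::

    #include <cstdint>
    #include <cstdio>
    #include <string>

    std::string pg_id(uint32_t object_hash, uint32_t pg_num, int64_t pool_id)
    {
      uint32_t pg = object_hash % pg_num;            // e.g. 0x58
      char buf[32];
      std::snprintf(buf, sizeof(buf), "%lld.%x", (long long)pool_id, (unsigned)pg);
      return buf;                                    // e.g. "4.58"
    }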
Computing object locations is much faster than performing object location query
over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable
diff --git a/doc/dev/mon-bootstrap.rst b/doc/dev/mon-bootstrap.rst
index 9ce0070b791..0a4a9a2981e 100644
--- a/doc/dev/mon-bootstrap.rst
+++ b/doc/dev/mon-bootstrap.rst
@@ -42,7 +42,7 @@ with a command like::
When creating a new monitor cluster, the keyring should also contain a ``client.admin`` key that can be used
to administer the system::
- ceph-authtool /path/to/keyring --gen-key -n client.admin
+ ceph-authtool /path/to/keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'
The resulting keyring is fed to ``ceph-mon --mkfs`` with the ``--keyring <keyring>`` command-line argument.
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index cc1efe4b4bf..0586c46c3bb 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -3,8 +3,8 @@ Erasure Coded Placement Groups
==============================
The documentation of the erasure coding implementation in Ceph was
-created in July 2013. It is included in Ceph even before erasure
-coding is available because it drives a number of architectural
+created in July 2013. It is included in Ceph even before erasure coded
+pools are available because it drives a number of architectural
changes. It is meant to be updated to reflect the `progress of these
architectural changes <http://tracker.ceph.com/issues/4929>`_, up to
the point where it becomes a reference of the erasure coding
@@ -14,8 +14,14 @@ Glossary
--------
*chunk*
- when the encoding function is called, it returns chunks of the
- same size.
+ when the encoding function is called, it returns chunks of the same
+ size: data chunks, which can be concatenated to reconstruct the
+ original object, and coding chunks, which can be used to rebuild a
+ lost chunk.
+
+*chunk rank*
+ the index of a chunk when returned by the encoding function. The
+ rank of the first chunk is 0, the rank of the second chunk is 1
+ etc.
*stripe*
when an object is too large to be encoded with a single call,
@@ -23,9 +29,13 @@ Glossary
called a stripe.
*shard|strip*
- the file that holds all chunks of a same rank for a given object.
+ an ordered sequence of chunks of the same rank from the same
+ object. For a given placement group, each OSD contains shards of
+ the same rank. When dealing with objects that are encoded with a
+ single operation, *chunk* is sometimes used instead of *shard*
+ because the shard is made of a single chunk.
-Example:
+The definitions are illustrated as follows:
::
OSD 40 OSD 33
@@ -53,6 +63,6 @@ Table of content
.. toctree::
:maxdepth: 1
- High level design document <erasure_coding/pgbackend>
Developer notes <erasure_coding/developer_notes>
- Draft PGBackend.h header <erasure_coding/PGBackend-h>
+ Jerasure plugin <erasure_coding/jerasure>
+ High level design document <erasure_coding/pgbackend>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
deleted file mode 100644
index b39cdb0e88e..00000000000
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ /dev/null
@@ -1,156 +0,0 @@
-===========
-PGBackend.h
-===========
-
-Work in progress:
-::
-
- /**
- * PGBackend
- *
- * PGBackend defines an interface for logic handling IO and
- * replication on RADOS objects. The PGBackend implementation
- * is responsible for:
- *
- * 1) Handling client operations
- * 2) Handling object recovery
- * 3) Handling object access
- */
- class PGBackend {
- public:
- /// IO
-
- /// Perform write
- int perform_write(
- const vector<OSDOp> &ops, ///< [in] ops to perform
- Context *onreadable, ///< [in] called when readable on all reaplicas
- Context *onreadable, ///< [in] called when durable on all replicas
- ) = 0; ///< @return 0 or error
-
- /// Attempt to roll back a log entry
- int try_rollback(
- const pg_log_entry_t &entry, ///< [in] entry to roll back
- ObjectStore::Transaction *t ///< [out] transaction
- ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back
-
- /// Perform async read, oncomplete is called when ops out_bls are filled in
- int perform_read(
- vector<OSDOp> &ops, ///< [in, out] ops
- Context *oncomplete ///< [out] called with r code
- ) = 0; ///< @return 0 or error
-
- /// Peering
-
- /**
- * have_enough_infos
- *
- * Allows PGBackend implementation to ensure that enough peers have
- * been contacted to satisfy its requirements.
- *
- * TODO: this interface should yield diagnostic info about which infos
- * are required
- */
- bool have_enough_infos(
- const map<epoch_t, pg_interval_t> &past_intervals, ///< [in] intervals
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- ) = 0; ///< @return true if we can continue peering
-
- /**
- * choose_acting
- *
- * Allows PGBackend implementation to select the acting set based on the
- * received infos
- *
- * @return False if the current acting set is inadequate, *req_acting will
- * be filled in with the requested new acting set. True if the
- * current acting set is adequate, *auth_log will be filled in
- * with the correct location of the authoritative log.
- */
- bool choose_acting(
- const map<int, pg_info_t> &peer_infos, ///< [in] received infos
- int *auth_log, ///< [out] osd with auth log
- vector<int> *req_acting ///< [out] requested acting set
- ) = 0;
-
- /// Scrub
-
- /// scan
- int scan(
- const hobject_t &start, ///< [in] scan objects >= start
- const hobject_t &up_to, ///< [in] scan objects < up_to
- vector<hobject_t> *out ///< [out] objects returned
- ) = 0; ///< @return 0 or error
-
- /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
- int scrub(
- const hobject_t &to_stat, ///< [in] object to stat
- bool deep, ///< [in] true if deep scrub
- ScrubMap::object *o ///< [out] result
- ) = 0; ///< @return 0 or error
-
- /**
- * compare_scrub_maps
- *
- * @param inconsistent [out] map of inconsistent pgs to pair<correct, incorrect>
- * @param errstr [out] stream of text about inconsistencies for user
- * perusal
- *
- * TODO: this interface doesn't actually make sense...
- */
- void compare_scrub_maps(
- const map<int, ScrubMap> &maps, ///< [in] maps to compare
- bool deep, ///< [in] true if scrub is deep
- map<hobject_t, pair<set<int>, set<int> > > *inconsistent,
- std:ostream *errstr
- ) = 0;
-
- /// Recovery
-
- /**
- * might_have_unrecoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- * @param intervals [in] past intervals
- * @param should_query [out] pair<int, cpg_t> shards to query
- */
- void might_have_unrecoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const map<epoch_t, pg_interval_t> &past_intervals,
- set<pair<int, cpg_t> > *should_query
- ) = 0;
-
- /**
- * might_have_unfound
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- */
- bool recoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const hobject_t &hoid ///< [in] object to check
- ) = 0; ///< @return true if object can be recovered given missing
-
- /**
- * recover_object
- *
- * Triggers a recovery operation on the specified hobject_t
- * onreadable must be called before onwriteable
- *
- * @param missing [in] set of info, missing pairs for queried nodes
- */
- void recover_object(
- const hobject_t &hoid, ///< [in] object to recover
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing
- Context *onreadable, ///< [in] called when object can be read
- Context *onwriteable ///< [in] called when object can be written
- ) = 0;
-
- /// Backfill
-
- /// choose_backfill
- void choose_backfill(
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- const vector<int> &acting, ///< [in] acting set
- const vector<int> &up, ///< [in] up set
- set<int> *to_backfill ///< [out] osds to backfill
- ) = 0;
- };
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index 2bc796c67e5..454f087fe53 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -10,7 +10,7 @@ of the erasure code within Ceph. It is mostly based on examples being
explained to demonstrate how things work. It is written as if the
implementation is complete although it may not be the case. For
instance the plugin system and the jerasure plugin are implemented but
-the erasure code pool is not.
+the erasure coded pool is not.
Reading and writing encoded chunks from and to OSDs
---------------------------------------------------
@@ -18,8 +18,8 @@ Reading and writing encoded chunks from and to OSDs
An erasure coded pool stores each object as K+M chunks. It is divided
into K data chunks and M coding chunks. The pool is configured to have
a size of K+M so that each chunk is stored in an OSD in the acting
-set. The rank of the chunks is stored as `an attribute of the pool
-<http://tracker.ceph.com/issues/5862>`_ containing the object.
+set. The rank of the chunk is stored as `an attribute of the object
+<http://tracker.ceph.com/issues/5862>`_.
For instance an erasure coded pool is created to use five OSDs ( K+M =
5 ) and sustain the loss of two of them ( M = 2 ).
@@ -33,9 +33,9 @@ coding chunks : the fourth with *YXY* and the fifth with *GQC*. Each
chunk is stored in an OSD in the acting set. The chunks are stored in
objects that have the same name ( *NYAN* ) but reside on different
OSDs. The order in which the chunks were created must be preserved and
-is stored as an attribute of the pool containing the object. Chunk
-*1* contains *ABC* and is stored on *OSD5* while chunk *4* contains
-*XYY* and is stored on *OSD3*.
+is stored as an attribute of the object ( shard_t ), in addition to its
+name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4*
+contains *XYY* and is stored on *OSD3*.
::
@@ -56,7 +56,7 @@ is stored as an attribute of the pool containing the object. Chunk
+--v---+ +--v---+ +--v---+ +--v---+ +--v---+
name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 | | 5 |
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+------+ +------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY | | QGC |
+--+---+ +--+---+ +--+---+ +--+---+ +--+---+
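As a toy illustration of the split described above (this is not the Ceph
encode API; a real code would also compute the M coding chunks)::

    #include <algorithm>
    #include <string>
    #include <vector>

    // Split a payload into K equally sized data chunks, padding the tail.
    std::vector<std::string> make_data_chunks(const std::string &payload,
                                              unsigned k)
    {
      size_t chunk_len = (payload.size() + k - 1) / k;   // round up
      std::vector<std::string> chunks;
      for (unsigned i = 0; i < k; ++i) {
        size_t pos = std::min<size_t>(i * chunk_len, payload.size());
        std::string c = payload.substr(pos, chunk_len);
        c.resize(chunk_len, '\0');                       // pad the last chunk
        chunks.push_back(c);
      }
      return chunks;   // "ABCDEFGHI", k = 3  ->  "ABC", "DEF", "GHI"
    }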
@@ -85,10 +85,12 @@ When the object *NYAN* is read from the erasure coded pool, the
decoding function reads three chunks : chunk *1* containing *ABC*,
chunk *3* containing *GHI* and chunk *4* containing *YXY*, and rebuilds
the original content of the object *ABCDEFGHI*. The decoding function
-is informed that the chunks *2* and *5* are missing. The chunk *5*
-could not be read because the *OSD4* is *out*. The decoding function
-is called as soon as three chunks are read : *OSD2* was the slowest
-and its chunk was not taken into account.
+is informed that the chunks *2* and *5* are missing ( they are called
+*erasures* ). The chunk *5* could not be read because the *OSD4* is
+*out*. The decoding function can be called as soon as three chunks are
+read : *OSD2* was the slowest and its chunk was not taken into
+account.
+
::
+-------------------+
@@ -110,17 +112,17 @@ and its chunk was not taken into account.
+--+---+ +------+ +--+---+ +--+---+
name | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 |
+ shard | 1 | | 2 | | 3 | | 4 |
+------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY |
+--+---+ +--+---+ +--+---+ +--+---+
- ^ ^ ^ ^
- | | | |
- | | +--+---+ |
- | | | OSD1 | |
+ ^ . ^ ^
+ | TOO . | |
+ | SLOW . +--+---+ |
+ | ^ | OSD1 | |
| | +------+ |
| | +------+ |
- | SLOW +-------| OSD2 | |
+ | +-------| OSD2 | |
| +------+ |
| +------+ |
| | OSD3 |-----+
@@ -137,8 +139,9 @@ Interrupted full writes
In an erasure coded pool the primary OSD in the up set receives all
write operations. It is responsible for encoding the payload into K+M
-chunks and send them to the OSDs in the up set. It is also responsible
+chunks and sends them to the other OSDs. It is also responsible
for maintaining an authoritative version of the placement group logs.
+
::
primary
@@ -168,8 +171,8 @@ set of the placement group is made of *OSD 1*, *OSD 2* and *OSD 3*. An
object has been encoded and stored in the OSDs : the chunk D1v1
(i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2*
and C1v1 (i.e. Coding chunk number 1 version 1) on *OSD 3*. The
-placement group logs on each OSD are in sync at epoch 1 version 1
-(i.e. 1,1).
+placement group logs on each OSD are identical (i.e. 1,1).
+
::
primary
@@ -196,21 +199,23 @@ placement group logs on each OSD are in sync at epoch 1 version 1
+-----------+
*OSD 1* is the primary and receives a WRITE FULL from a client, which
-means the payload is to replace the object entirely instead of only
-overwriting a portion of it. Version two of the object is created
-to override version one. *OSD 1* encodes the payload into three
-chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*,
-D2v2 on *OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on
-*OSD 3*. Each chunk is sent to the target OSD, including the primary
-OSD which is responsible for storing chunks in addition to handling
-write operations and maintaining an authoritative version of the
-placement group logs. When an OSD receives the message instructing it
-to write the chunk, it also creates a new entry in the placement group
-logs to reflect the change. For instance, as soon as *OSD 3* stores
-*C1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its
-logs. Because the OSDs work asynchronously, some chunks may still be
-in flight ( such as *D2v2* ) while others are acknowledged and on disk
-( such as *C1v1* and *D1v1* ). ::
+means the payload is to replace the object entirely instead of
+overwriting a portion of it. Version two of the object is created to
+override version one. *OSD 1* encodes the payload into three chunks :
+D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on
+*OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on *OSD
+3*. Each chunk is sent to the target OSD, including the primary OSD
+which is responsible for storing chunks in addition to handling write
+operations and maintaining an authoritative version of the placement
+group logs. When an OSD receives the message instructing it to write
+the chunk, it also creates a new entry in the placement group logs to
+reflect the change. For instance, as soon as *OSD 3* stores *C1v2*, it
+adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because
+the OSDs work asynchronously, some chunks may still be in flight (
+such as *D2v2* ) while others are acknowledged and on disk ( such as
+*C1v1* and *D1v1* ).
+
+::
primary
+---OSD 1---+
@@ -243,6 +248,7 @@ acting set and the logs' *last_complete* pointer can move from
*1,1* to *1,2* and the files used to store the chunks of the previous
version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
*OSD 2* and *C1v1* on *OSD 3*.
+
::
+---OSD 1---+
@@ -271,13 +277,14 @@ version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
But accidents happen. If *OSD 1* goes down while *D2v2* is still in
flight, the object's version 2 is partially written : *OSD 3* has
-one chunk but does not have enough to recover. It lost two chunks :
-*D1v2* and *D2v2* but the erasure coding parameters K = 2 + M = 1
-requires that at least two chunks are available to rebuild the
+one chunk but that is not enough to recover. It lost two chunks :
+*D1v2* and *D2v2* and the erasure coding parameters K = 2 + M = 1
+require that at least two chunks are available to rebuild the
third. *OSD 4* becomes the new primary and finds that the
*last_complete* log entry ( i.e. all objects before this entry were
known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log.
+*1,1* and that will be the head of the new authoritative log.
+
::
+---OSD 2---+
@@ -299,6 +306,7 @@ known to be available on all OSDs in the previous acting set ) is
The log entry *1,2* found on *OSD 3* is divergent from the new
authoritative log provided by *OSD 4* : it is discarded and the file
containing the *C1v2* chunk is removed.
+
::
+---OSD 2---+
@@ -323,14 +331,14 @@ coding library during scrubbing and stored on the new primary *OSD 4*.
Interrupted append
------------------
-An object is coded in stripes, either because they are too big or
-because they are created with multiple operations instead of a single
-full write. A single stripe will exist/exists in the case of a full
-write, assuming the object size is not too large to encode in memory.
-When appending to an existing object, the stripe size is retrieved
-from the attributes of the object. It applies, for instance, when
-*rgw* writes an object with sequence of append instead of a single
-write. ::
+An object is coded in stripes, either because it is too big or because
+it is created with multiple write operations instead of a single full
+write. When appending to an existing object, the stripe size is
+retrieved from the attributes of the object. It applies, for instance,
+when *rgw* writes an object with a sequence of appends instead of a
+single full write.
+
+::
primary
+---OSD 1---+
@@ -354,7 +362,7 @@ write. ::
+-----------+
*OSD 1* is the primary and receives an APPEND from a client, meaning
-the payload is to be appended at the end of the object. *OSD 1*
+the payload is to be appended to the end of the object. *OSD 1*
encodes the payload into three chunks : S2D1 (i.e. Stripe two data
chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on
*OSD 2* and S2C1 (i.e. Stripe two coding chunk number 1 ) in s3 on
@@ -368,8 +376,8 @@ logs to reflect the change. For instance, as soon as *OSD 3* stores
logs. The log entry also carries the nature of the operation: in this
case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work
asynchronously, some chunks may still be in flight ( such as *S2D2* )
-while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
-).
+while others are acknowledged and on disk (such as *S2D1* and *S2C1*).
+
::
+---OSD 1---+
@@ -396,14 +404,16 @@ while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
+-----------+
If *OSD 1* goes down while *S2D2* is still in flight, the payload is
-partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does
-not have enough to recover because s1 and s2 don't have it. Two chunks
-were lost (*S2D1* and S2D2) but the erasure coding parameters K = 2 +
-M = 1 requires that at least two chunks are available to rebuild the
-third. *OSD 4* becomes the new primary and finds that the
-*last_complete* log entry ( i.e. all objects before this entry were
-known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log. ::
+partially appended : s3 (shard 3) in *OSD 3* has one chunk but does
+not have enough to recover. Two chunks were lost (*S2D1* and *S2D2*) but
+the erasure coding parameters K = 2 + M = 1 require that at least two
+chunks are available to rebuild the third. *OSD 4* becomes the new
+primary and finds that the *last_complete* log entry ( i.e. all
+objects before this entry were known to be available on all OSDs in
+the previous acting set ) is *1,1* and will be the head of the new
+authoritative log.
+
+::
+---OSD 2---+
|+-s2-+ log |
@@ -429,8 +439,6 @@ the stripe size.
Erasure code library
--------------------
-See also `the corresponding tracker issue <http://tracker.ceph.com/issues/5877>`_
-
Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
with parameters K+M, object O is encoded by dividing it into chunks O1,
O2, ... OM and computing coding chunks P1, P2, ... PK. Any K chunks
@@ -443,8 +451,8 @@ Reading the original content of object O could be a simple
concatenation of O1, O2, ... OM, because the plugins are using
`systematic codes
<http://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the chunks
-must be given to the erasure code library to retrieve the content of
-the object.
+must be given to the erasure code library *decode* method to retrieve
+the content of the object.
Reed-Solomon is significantly more expensive to encode than fountain
codes with the current `jerasure implementation
@@ -462,10 +470,11 @@ functions ( for Cauchy or Liberation for instance ): smaller packets
means more calls and more overhead.
Although Reed-Solomon is provided as a default, Ceph uses it via an
-`abstract API <http://tracker.ceph.com/issues/5878>`_ designed to
+`abstract API <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/osd/ErasureCodeInterface.h>`_ designed to
allow each pool to choose the plugin that implements it using
`key=value pairs when creating the pool
-<http://tracker.ceph.com/issues/6113>`_.
+<https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/mon/MonCommands.h#L483>`_.
+
::
ceph osd pool create <pool> \
@@ -473,18 +482,21 @@ allow each pool to choose the plugin that implements it using
erasure-code-plugin=<plugin>
The *<plugin>* is dynamically loaded from *<dir>* (defaults to
-*/usr/lib/ceph/erasure-code* ) and expected to implement the
-*int __erasure_code_init(char *plugin_name)* function
-which is responsible for registering an object derived from
-*ErasureCodePlugin* in the registry :
+*/usr/lib/ceph/erasure-code* ) and expected to implement the *int
+__erasure_code_init(char *plugin_name)* function which is responsible
+for registering an object derived from *ErasureCodePlugin* in the
+registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L32>`_ plugin reads:
+
::
- ErasureCodePluginRegistry::add(plugin_name,
- new ErasureCodePluginExample());
+ ErasureCodePluginRegistry &instance =
+ ErasureCodePluginRegistry::instance();
+ instance.add(plugin_name, new ErasureCodePluginExample());
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
-object can be generated:
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ reads:
+
::
virtual int factory(const map<std::string,std::string> &parameters,
@@ -493,39 +505,23 @@ object can be generated:
return 0;
}
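Putting the two snippets together, a hedged sketch of a complete plugin
(the class name is illustrative; the registry call and factory signature
are the ones shown above)::

    #include "osd/ErasureCodePlugin.h"   // registry and plugin base class

    class ErasureCodePluginSketch : public ErasureCodePlugin {
    public:
      virtual int factory(const map<std::string,std::string> &parameters,
                          ErasureCodeInterfaceRef *erasure_code) {
        // build the concrete ErasureCodeInterface from the pool's
        // erasure-code-* parameters and store it in *erasure_code
        return 0;
      }
    };

    extern "C" int __erasure_code_init(char *plugin_name)
    {
      ErasureCodePluginRegistry &instance =
        ErasureCodePluginRegistry::instance();
      return instance.add(plugin_name, new ErasureCodePluginSketch());
    }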
-The *parameters* is the list of *key=value* pairs that were set when the pool
-was created. Each *key* must be prefixed with erasure-code to avoid name collisions
+The *parameters* argument is the list of *key=value* pairs that were
+set when the pool was created. Each *key* must be prefixed with
+*erasure-code* to avoid name collisions:
+
::
- ceph osd pool create <pool> \
+ ceph osd pool create poolname 123 \
erasure-code-directory=<dir> \ # mandatory
erasure-code-plugin=jerasure \ # mandatory
erasure-code-m=10 \ # optional and plugin dependent
erasure-code-k=3 \ # optional and plugin dependent
erasure-code-technique=reed_sol_van \ # optional and plugin dependent
-Erasure code jerasure plugin
-----------------------------
-
-The parameters interpreted by the jerasure plugin are:
-::
-
- ceph osd pool create <pool> \
- erasure-code-directory=<dir> \ # plugin directory absolute path
- erasure-code-plugin=jerasure \ # plugin name (only jerasure)
- erasure-code-k=<k> \ # data chunks (default 2)
- erasure-code-m=<m> \ # coding chunks (default 2)
- erasure-code-technique=<technique> \ # coding technique
-
-The coding techniques can be chosen among *reed_sol_van*,
-*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
-*blaum_roth* and *liber8tion*.
-
Scrubbing
---------
See also `Refactor scrub to use PGBackend methods <http://tracker.ceph.com/issues/5861>`_
-
The simplest form of scrubbing is to check with each OSD holding a
chunk if it exists locally. If more than M chunks are missing the
object is marked as lost. If up to M chunks are missing they are
@@ -547,13 +543,6 @@ built-in on a per block basis.
Notes
-----
-This document is a description of how erasure coding could be
-implemented, it does not reflect the current state of the code
-base. Possible optimizations are mentionned where relevant but the
-first implementation should not include any of them: they are
-presented to show that there is a path toward optimization starting
-from simple minded implementation.
-
If the objects are large, it may be impractical to encode and decode
them in memory. However, when using *RBD* a 1TB device is divided into
many individual 4MB objects and *RGW* does the same.
@@ -561,73 +550,3 @@ many individual 4MB objects and *RGW* does the same.
Encoding and decoding is implemented in the OSD. Although it could be
implemented client side for read write, the OSD must be able to encode
and decode on its own when scrubbing.
-
-If a partial read is required, an optimization could be to only fetch
-the chunk that contains the data instead of always fetching all
-chunks. For instance if *H* is required in the example above, chunk 3
-is read if available. Reading 3 chunks is a fallback in case chunk 3 is
-not available.
-
-Partial reads and writes
-------------------------
-
-If an object is large, reading or writing all of it when changing only
-a few bytes is expensive. It is more efficient to only read or write a
-subset of the object. When a client writes on an existing object, it
-can provide the offset and the length of the write as well as the
-payload with the `CEPH_OSD_OP_WRITE
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2542>`_
-operation. It is refered to as *partial write* and is different from
-the `CEPH_OSD_OP_WRITEFULL operation
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2552>`_
-which writes the entire object at once.
-
-When using replicas for partial writes or reads, the primary OSD
-translates them into read(2) and write(2) POSIX system calls. When
-writing, it then forwards the CEPH_OSD_OP_WRITE message to the
-replicas and waits for them to acknowledge they are done.
-
-When reading erasure coded objects, at least M chunks must be read and
-decoded to extract the desired bytes. If a `systematic code
-<https://en.wikipedia.org/wiki/Systematic_code>`_ is used ( i.e. the
-data chunks are readable by simple concatenation ) read can be
-optimized to use the chunk containing the desired bytes and rely on
-the erasure decoding function only if a chunk is missing.
-
-When writing an erasure coded object, changing even one byte requires
-that it is encoded again in full.
-
-If Ceph is only used thru the *radosgw* or *librbd*, objects will mostly
-have the same size. The *radosgw* user may upload a 1GB object, which will
-be divided into smaller 4MB objects behind the scene ( or whatever is
-set with *rgw obj stripe size* ). If a KVM is attached a 10GB RBD block
-device, it will also be divided into smaller 4BM objects ( or whatever
-size is given to the --stripe-unit argument when creating the RBD
-block ). In both cases, writing one byte at the beginning will only
-require to encode the first object and not all of them.
-
-Objects can be further divided into stripes to reduce the overhead of
-partial writes. For instance:
-::
-
- +-----------------------+
- |+---------------------+|
- || stripe 0 ||
- || [0,N) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 1 ||
- || [N,N*2) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 3 [N*2,len) ||
- |+---------------------+|
- +-----------------------+
- object of size len
-
-Each stripe is encoded independantly and the same OSDs are used for
-all of them. For instance, if stripe 0 is encoded into 3 chunks on
-OSDs 5, 8 and 9, stripe 1 is also encoded into 3 chunks on the same
-OSDs. The size of a stripe is stored as an attribute of the object.
-When writing one byte at offset N, instead of re-encoding the whole
-object it is enough to re-encode the stripe that contains it.
diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst
new file mode 100644
index 00000000000..312eac52e5d
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst
@@ -0,0 +1,22 @@
+===============
+jerasure plugin
+===============
+
+Introduction
+------------
+
+The parameters interpreted by the jerasure plugin are:
+
+::
+
+ ceph osd pool create <pool> \
+ erasure-code-directory=<dir> \ # plugin directory absolute path
+ erasure-code-plugin=jerasure \ # plugin name (only jerasure)
+ erasure-code-k=<k> \ # data chunks (default 2)
+ erasure-code-m=<m> \ # coding chunks (default 2)
+ erasure-code-technique=<technique> \ # coding technique
+
+The coding technique can be chosen from *reed_sol_van*,
+*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
+*blaum_roth* and *liber8tion*.
+
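+For example, a pool with four data chunks and two coding chunks could
+be created with ( illustrative values )::
+
+  ceph osd pool create ecpool \
+     erasure-code-plugin=jerasure \
+     erasure-code-k=4 \
+     erasure-code-m=2 \
+     erasure-code-technique=reed_sol_van
+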
diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
index c16354f5116..43415ba4f7e 100644
--- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
@@ -2,14 +2,13 @@
PG Backend Proposal
===================
-See also `PGBackend.h <../PGBackend-h>`_
-
Motivation
----------
-The purpose of the PG Backend interface is to abstract over the
-differences between replication and erasure coding as failure recovery
-mechanisms.
+The purpose of the `PG Backend interface
+<https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h>`_
+is to abstract over the differences between replication and erasure
+coding as failure recovery mechanisms.
Much of the existing PG logic, particularly that for dealing with
peering, will be common to each. With both schemes, a log of recent
@@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over:
positions are not interchangeable. In particular, it might make
sense for a single OSD to hold more than 1 PG copy for different
acting set positions.
-5. Selection of a pgtemp for backfill may difer between replicated
+5. Selection of a pgtemp for backfill may differ between replicated
and erasure coded backends.
6. The set of necessary osds from a particular interval required to
- to continue peering may difer between replicated and erasure
+ to continue peering may differ between replicated and erasure
coded backends.
-7. The selection of the authoritative log may difer between replicated
+7. The selection of the authoritative log may differ between replicated
and erasure coded backends.
Client Writes
@@ -78,8 +77,9 @@ Core Changes:
- Current code should be adapted to use and rollback as appropriate
APPEND, DELETE, (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
- hobjects. This probably means adapting the filestore internally to
- use a ghobject which is basically a tuple<hobject_t, gen_t,
+ hobjects. This means adapting the filestore internally to
+ use a `ghobject <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_
+ which is basically a tuple<hobject_t, gen_t,
shard_t>. The gen_t + shard_t need to be included in the on-disk
filename. gen_t is a unique object identifier to make sure there
are no name collisions when object N is created +
@@ -114,7 +114,7 @@ divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.
-The dificulty is that the current code assumes that as long as it has
+The difficulty is that the current code assumes that as long as it has
an info from at least 1 osd from the prior interval, it can complete
peering. In order to ensure that we do not end up with an
unrecoverably divergent object, a K+M erasure coded PG must hear from at
@@ -161,7 +161,7 @@ Client Reads
------------
Reads with the replicated strategy can always be satisfied
-syncronously out of the primary osd. With an erasure coded strategy,
+synchronously out of the primary osd. With an erasure coded strategy,
the primary will need to request data from some number of replicas in
order to satisfy a read. The perform_read() interface for PGBackend
therefore will be async.
@@ -192,7 +192,7 @@ include the chunk id in the object key.
Core changes:
- The filestore `ghobject_t needs to also include a chunk id
- <http://tracker.ceph.com/issues/5862>`_ making it more like
+ <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_ making it more like
tuple<hobject_t, gen_t, shard_t>.
- coll_t needs to include a shard_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
@@ -260,7 +260,7 @@ Core changes:
Recovery
--------
-See `Issue #5857`_. The logic for recovering an object depends on the backend. With
+The logic for recovering an object depends on the backend. With
the current replicated strategy, we first pull the object replica
to the primary and then concurrently push it out to the replicas.
With the erasure coded strategy, we probably want to read the
@@ -270,7 +270,7 @@ and push out the replacement chunks concurrently.
Another difference is that objects in erasure coded pg may be
unrecoverable without being unfound. The "unfound" concept
should probably then be renamed to unrecoverable. Also, the
-PGBackend impementation will have to be able to direct the search
+PGBackend implementation will have to be able to direct the search
for pg replicas with unrecoverable object chunks and to be able
to determine whether a particular object is recoverable.
@@ -281,9 +281,11 @@ Core changes:
PGBackend interfaces:
-- might_have_unrecoverable()
-- recoverable()
-- recover_object()
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L46>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L52>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L64>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L69>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L76>`_
Backfill
--------
@@ -316,6 +318,4 @@ PGBackend interfaces:
- choose_backfill(): allows the implementation to determine which osds
should be backfilled in a particular interval.
-
-.. _Issue #5857: http://tracker.ceph.com/issues/5857
-.. _Issue #5856: http://tracker.ceph.com/issues/5856 \ No newline at end of file
+.. _Issue #5856: http://tracker.ceph.com/issues/5856
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
index f50d93eb04c..2d78748f5f2 100644
--- a/doc/man/8/rbd.rst
+++ b/doc/man/8/rbd.rst
@@ -113,6 +113,10 @@ Parameters
Make json or xml formatted output more human-readable.
+.. option:: --read-only
+
+ Set device readonly when mapping image.
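+
+   For example ( the image spec is illustrative )::
+
+       rbd map --read-only mypool/myimage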
+
Commands
========
diff --git a/doc/rados/configuration/journal-ref.rst b/doc/rados/configuration/journal-ref.rst
index b7344544b9a..97300f4a57f 100644
--- a/doc/rados/configuration/journal-ref.rst
+++ b/doc/rados/configuration/journal-ref.rst
@@ -27,6 +27,7 @@ Ceph OSDs use a journal for two reasons: speed and consistency.
Ceph OSD Daemons support the following journal settings:
+
``journal dio``
:Description: Enables direct i/o to the journal. Requires ``journal block
@@ -37,14 +38,17 @@ Ceph OSD Daemons support the following journal settings:
:Default: ``true``
+
``journal aio``
+.. versionchanged:: 0.61 Cuttlefish
+
:Description: Enables using ``libaio`` for asynchronous writes to the journal.
Requires ``journal dio`` set to ``true``.
:Type: Boolean
:Required: No.
-:Default: ``false``
+:Default: Version 0.61 and later, ``true``. Version 0.60 and earlier, ``false``.
``journal block align``
diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst
index 0b71d08b0c4..6bacf4c7dff 100644
--- a/doc/rados/operations/authentication.rst
+++ b/doc/rados/operations/authentication.rst
@@ -126,18 +126,15 @@ you may skip the steps related to generating keys.
auth service required = cephx
auth client required = cephx
-#. Or, enable ``cephx`` authentication for versions ``0.50`` and below by
+#. Or, enable ``cephx`` authentication for Ceph versions ``0.50`` and below by
setting the following option in the ``[global]`` section of your `Ceph
- configuration`_ file::
+   configuration`_ file. **NOTE:** Deprecated as of version ``0.51``. ::
auth supported = cephx
-.. deprecated:: 0.51
-#. Start or restart the Ceph cluster. ::
+#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
- sudo service ceph -a start
- sudo service ceph -a restart
.. _disable-cephx:
@@ -164,10 +161,8 @@ during setup and/or troubleshooting to temporarily disable authentication.
auth supported = none
-#. Start or restart the Ceph cluster. ::
+#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
- sudo service ceph -a start
- sudo service ceph -a restart
Daemon Keyrings
@@ -422,3 +417,4 @@ of the enhanced authentication.
.. _Ceph configuration: ../../configuration/ceph-conf
.. _Cephx Configuration Reference: ../../configuration/auth-config-ref
+.. _Operating a Cluster: ../operating \ No newline at end of file
diff --git a/doc/start/quick-start-preflight.rst b/doc/start/quick-start-preflight.rst
index 58068f1df22..74dc403c211 100644
--- a/doc/start/quick-start-preflight.rst
+++ b/doc/start/quick-start-preflight.rst
@@ -10,17 +10,17 @@ demo cluster to explore some of the functionality. This **Preflight Checklist**
will help you prepare an admin node and a server node for use with
``ceph-deploy``.
-.. ditaa::
+.. ditaa::
/----------------\ /----------------\
| Admin Node |<------->| Server Node |
| cCCC | | cCCC |
\----------------/ \----------------/
-
+
Before you can deploy Ceph using ``ceph-deploy``, you need to ensure that you
have a few things set up first on your admin node and on nodes running Ceph
daemons.
-
+
Install an Operating System
===========================
@@ -42,7 +42,7 @@ SSH server. ::
Create a User
=============
-Create a user on nodes running Ceph daemons.
+Create a user on nodes running Ceph daemons.
.. tip:: We recommend a username that brute force attackers won't
guess easily (e.g., something other than ``root``, ``ceph``, etc).
@@ -55,12 +55,12 @@ Create a user on nodes running Ceph daemons.
``ceph-deploy`` installs packages onto your nodes. This means that
-the user you create requires passwordless ``sudo`` privileges.
+the user you create requires passwordless ``sudo`` privileges.
-.. note:: We **DO NOT** recommend enabling the ``root`` password
- for security reasons.
+.. note:: We **DO NOT** recommend enabling the ``root`` password
+ for security reasons.
-To provide full privileges to the user, add the following to
+To provide full privileges to the user, add the following to
``/etc/sudoers.d/ceph``. ::
echo "ceph ALL = (root) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/ceph
@@ -81,22 +81,25 @@ running Ceph daemons (leave the passphrase empty). ::
Your identification has been saved in /ceph-client/.ssh/id_rsa.
Your public key has been saved in /ceph-client/.ssh/id_rsa.pub.
-Copy the key to each node running Ceph daemons::
+Copy the key to each node running Ceph daemons::
ssh-copy-id ceph@ceph-server
-Modify your ~/.ssh/config file of your admin node so that it defaults
+Modify your ~/.ssh/config file of your admin node so that it defaults
to logging in as the user you created when no username is specified. ::
Host ceph-server
Hostname ceph-server.fqdn-or-ip-address.com
User ceph
+.. note:: Do not call ceph-deploy with ``sudo`` or run as ``root`` if you are
+   logged in as a different user (as in the ssh config above), because it
+   will not issue the ``sudo`` commands needed on the remote host.
Install ceph-deploy
===================
-To install ``ceph-deploy``, execute the following::
+To install ``ceph-deploy``, execute the following::
wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add -
echo deb http://ceph.com/debian-dumpling/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
@@ -112,7 +115,7 @@ node (e.g., ensure ``iptables``, ``ufw`` or other tools that may prevent
connections, traffic forwarding, etc. to allow what you need).
.. tip:: The ``ceph-deploy`` tool is new and you may encounter some issues
- without effective error messages.
+ without effective error messages.
Once you have completed this pre-flight checklist, you are ready to begin using
``ceph-deploy``.
@@ -147,7 +150,7 @@ Once you have passwordless ``ssh`` connectivity, passwordless ``sudo``,
installed ``ceph-deploy``, and you have ensured appropriate connectivity,
proceed to the `Storage Cluster Quick Start`_.
-.. tip:: The ``ceph-deploy`` utility can install Ceph packages on remote
+.. tip:: The ``ceph-deploy`` utility can install Ceph packages on remote
machines from the admin node!
.. _Storage Cluster Quick Start: ../quick-ceph-deploy
diff --git a/man/rbd.8 b/man/rbd.8
index 27a74aaa19a..88048674614 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -148,6 +148,11 @@ Specifies output formatting (default: plain, json, xml)
.B \-\-pretty\-format
Make json or xml formatted output more human\-readable.
.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-read\-only
+Set device readonly when mapping image.
+.UNINDENT
.SH COMMANDS
.INDENT 0.0
.TP
diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh
index f3dffca293f..f9c3e55a79d 100644
--- a/qa/run_xfstests.sh
+++ b/qa/run_xfstests.sh
@@ -276,6 +276,9 @@ function install_xfstests() {
cd xfstests
+ # FIXME: use an older version before the tests were rearranged!
+ git reset --hard e5f1a13792f20cfac097fef98007610b422f2cac
+
ncpu=$(getconf _NPROCESSORS_ONLN 2>&1)
[ -n "${ncpu}" -a "${ncpu}" -gt 1 ] && multiple="-j ${ncpu}"
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index d92c2709dfd..09e55b9a842 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -169,7 +169,16 @@ bl=192.168.0.1:0/1000
ceph osd blacklist add $bl
ceph osd blacklist ls | grep $bl
ceph osd blacklist rm $bl
-expect_false "(ceph osd blacklist ls | grep $bl)"
+expect_false "ceph osd blacklist ls | grep $bl"
+
+bl=192.168.0.1
+# test without nonce, invalid nonce
+ceph osd blacklist add $bl
+ceph osd blacklist ls | grep $bl
+ceph osd blacklist rm $bl
+expect_false "ceph osd blacklist ls | grep $bl"
+expect_false "ceph osd blacklist $bl/-1"
+expect_false "ceph osd blacklist $bl/foo"
ceph osd crush tunables legacy
ceph osd crush tunables bobtail
@@ -334,4 +343,11 @@ ceph pg set_full_ratio 95 2>$TMPFILE; check_response $? 22 'not in range'
# expect "not in range" for invalid overload percentage
ceph osd reweight-by-utilization 80 2>$TMPFILE; check_response $? 22 'not in range'
+# expect 'heap' commands to be correctly parsed
+ceph heap stats
+ceph heap start_profiler
+ceph heap dump
+ceph heap stop_profiler
+ceph heap release
+
echo OK
diff --git a/qa/workunits/mon/crush_ops.sh b/qa/workunits/mon/crush_ops.sh
index 09e49acfbf6..f1770e171eb 100755
--- a/qa/workunits/mon/crush_ops.sh
+++ b/qa/workunits/mon/crush_ops.sh
@@ -68,4 +68,13 @@ ceph osd crush add-bucket foo host
ceph osd crush move foo root=default rack=localrack
ceph osd crush rm foo
+# test reweight
+o3=`ceph osd create`
+ceph osd crush add $o3 123 root=default
+ceph osd tree | grep osd.$o3 | grep 123
+ceph osd crush reweight osd.$o3 113
+ceph osd tree | grep osd.$o3 | grep 113
+ceph osd crush rm osd.$o3
+ceph osd rm osd.$o3
+
echo OK
diff --git a/qa/workunits/mon/pool_ops.sh b/qa/workunits/mon/pool_ops.sh
index e98e1e4121e..2436cc4837e 100755
--- a/qa/workunits/mon/pool_ops.sh
+++ b/qa/workunits/mon/pool_ops.sh
@@ -2,7 +2,8 @@
set -e
-ceph osd pool create foo 123 123
+ceph osd pool create foo 123 123 key1=+++ && exit 1 || true
+ceph osd pool create foo 123 123 key1=value1 key2 key3=value3
ceph osd pool create fooo 123
ceph osd pool create foo 123 # idempotent
diff --git a/qa/workunits/rados/test_tmap_to_omap.sh b/qa/workunits/rados/test_tmap_to_omap.sh
new file mode 100755
index 00000000000..76656ad726b
--- /dev/null
+++ b/qa/workunits/rados/test_tmap_to_omap.sh
@@ -0,0 +1,28 @@
+#!/bin/sh -ex
+
+expect_false()
+{
+ set -x
+ if "$@"; then return 1; else return 0; fi
+}
+
+pool="pool-$$"
+rados mkpool $pool
+
+rados -p $pool tmap set foo key1 value1
+rados -p $pool tmap set foo key2 value2
+rados -p $pool tmap set foo key2 value2
+rados -p $pool tmap dump foo | grep key1
+rados -p $pool tmap dump foo | grep key2
+rados -p $pool tmap-to-omap foo
+expect_false rados -p $pool tmap dump foo
+expect_false rados -p $pool tmap dump foo
+
+rados -p $pool listomapkeys foo | grep key1
+rados -p $pool listomapkeys foo | grep key2
+rados -p $pool getomapval foo key1 | grep value1
+rados -p $pool getomapval foo key2 | grep value2
+
+rados rmpool $pool $pool --yes-i-really-really-mean-it
+
+echo OK
diff --git a/qa/workunits/snaps/snap-rm-diff.sh b/qa/workunits/snaps/snap-rm-diff.sh
index 8dff54f58b8..3d30dc7937a 100755
--- a/qa/workunits/snaps/snap-rm-diff.sh
+++ b/qa/workunits/snaps/snap-rm-diff.sh
@@ -1,5 +1,6 @@
#!/bin/sh -ex
+ceph mds set allow_new_snaps --yes-i-really-mean-it
wget -q http://ceph.com/qa/linux-2.6.33.tar.bz2
mkdir foo
cp linux* foo
diff --git a/qa/workunits/snaps/snaptest-0.sh b/qa/workunits/snaps/snaptest-0.sh
index 93e747af7dd..366249e7d25 100755
--- a/qa/workunits/snaps/snaptest-0.sh
+++ b/qa/workunits/snaps/snaptest-0.sh
@@ -1,7 +1,16 @@
#!/bin/sh -x
+expect_failure() {
+	if "$@"; then
+		return 1
+	fi
+	return 0
+}
set -e
+expect_failure mkdir .snap/foo
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo asdf > foo
mkdir .snap/foo
grep asdf .snap/foo/foo
@@ -14,4 +23,7 @@ grep asdf .snap/bar/bar
rmdir .snap/bar
rm foo
+ceph mds unset allow_new_snaps --yes-i-really-mean-it
+expect_failure mkdir .snap/baz
+
echo OK \ No newline at end of file
diff --git a/qa/workunits/snaps/snaptest-1.sh b/qa/workunits/snaps/snaptest-1.sh
index 59d41ef688f..7c528dd432a 100755
--- a/qa/workunits/snaps/snaptest-1.sh
+++ b/qa/workunits/snaps/snaptest-1.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo 1 > file1
echo 2 > file2
echo 3 > file3
diff --git a/qa/workunits/snaps/snaptest-2.sh b/qa/workunits/snaps/snaptest-2.sh
index 4b67999921c..b73bf9cb97f 100755
--- a/qa/workunits/snaps/snaptest-2.sh
+++ b/qa/workunits/snaps/snaptest-2.sh
@@ -1,5 +1,7 @@
#!/bin/bash
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "Create dir 100 to 199 ..."
for i in $(seq 100 199); do
echo " create dir $i"
diff --git a/qa/workunits/snaps/snaptest-authwb.sh b/qa/workunits/snaps/snaptest-authwb.sh
index 128efb70d19..acbb599bda9 100755
--- a/qa/workunits/snaps/snaptest-authwb.sh
+++ b/qa/workunits/snaps/snaptest-authwb.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
touch foo
chmod +x foo
mkdir .snap/s
diff --git a/qa/workunits/snaps/snaptest-capwb.sh b/qa/workunits/snaps/snaptest-capwb.sh
index 8c5a1333b69..9d0568cb6db 100755
--- a/qa/workunits/snaps/snaptest-capwb.sh
+++ b/qa/workunits/snaps/snaptest-capwb.sh
@@ -4,6 +4,8 @@ set -e
mkdir foo
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# make sure mds handles it when the client does not send flushsnap
echo x > foo/x
sync
diff --git a/qa/workunits/snaps/snaptest-dir-rename.sh b/qa/workunits/snaps/snaptest-dir-rename.sh
index e81edf9c47f..6995f537a47 100755
--- a/qa/workunits/snaps/snaptest-dir-rename.sh
+++ b/qa/workunits/snaps/snaptest-dir-rename.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
#
# make sure we keep an existing dn's seq
#
diff --git a/qa/workunits/snaps/snaptest-double-null.sh b/qa/workunits/snaps/snaptest-double-null.sh
index cdf32e4f0ef..5a673ff9c0d 100755
--- a/qa/workunits/snaps/snaptest-double-null.sh
+++ b/qa/workunits/snaps/snaptest-double-null.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# multiple intervening snapshots with no modifications, and thus no
# snapflush client_caps messages. make sure the mds can handle this.
diff --git a/qa/workunits/snaps/snaptest-estale.sh b/qa/workunits/snaps/snaptest-estale.sh
index a4fb94368d4..31ba5a87659 100755
--- a/qa/workunits/snaps/snaptest-estale.sh
+++ b/qa/workunits/snaps/snaptest-estale.sh
@@ -1,5 +1,7 @@
#!/bin/sh -x
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
mkdir .snap/foo
echo "We want ENOENT, not ESTALE, here."
diff --git a/qa/workunits/snaps/snaptest-git-ceph.sh b/qa/workunits/snaps/snaptest-git-ceph.sh
index 11532d8b14b..71a71e1d469 100755
--- a/qa/workunits/snaps/snaptest-git-ceph.sh
+++ b/qa/workunits/snaps/snaptest-git-ceph.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
git clone git://ceph.com/git/ceph.git
cd ceph
diff --git a/qa/workunits/snaps/snaptest-intodir.sh b/qa/workunits/snaps/snaptest-intodir.sh
index 3cbbe01718e..d022cfd479e 100755
--- a/qa/workunits/snaps/snaptest-intodir.sh
+++ b/qa/workunits/snaps/snaptest-intodir.sh
@@ -1,5 +1,7 @@
#!/bin/sh -ex
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# this tests fix for #1399
mkdir foo
mkdir foo/.snap/one
diff --git a/qa/workunits/snaps/snaptest-multiple-capsnaps.sh b/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
index 5ebc852cf6c..d88722bde09 100755
--- a/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
+++ b/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo asdf > a
mkdir .snap/1
chmod 777 a
diff --git a/qa/workunits/snaps/snaptest-parents.sh b/qa/workunits/snaps/snaptest-parents.sh
index 7e5241a27c0..8963f628dc8 100644
--- a/qa/workunits/snaps/snaptest-parents.sh
+++ b/qa/workunits/snaps/snaptest-parents.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "making directory tree and files"
mkdir -p 1/a/b/c/
echo "i'm file1" > 1/a/file1
diff --git a/qa/workunits/snaps/snaptest-snap-rm-cmp.sh b/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
index aa094e70789..68ecf37b73e 100755
--- a/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
+++ b/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
file=linux-2.6.33.tar.bz2
wget -q http://ceph.com/qa/$file
diff --git a/qa/workunits/snaps/snaptest-upchildrealms.sh b/qa/workunits/snaps/snaptest-upchildrealms.sh
index 63b7167b42d..b5b8830e9f0 100755
--- a/qa/workunits/snaps/snaptest-upchildrealms.sh
+++ b/qa/workunits/snaps/snaptest-upchildrealms.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
#
# verify that a snap update on a parent realm will induce
# snap cap writeback for inodes child realms
diff --git a/qa/workunits/snaps/snaptest-xattrwb.sh b/qa/workunits/snaps/snaptest-xattrwb.sh
index b2dd7bc748a..c36e2575845 100755
--- a/qa/workunits/snaps/snaptest-xattrwb.sh
+++ b/qa/workunits/snaps/snaptest-xattrwb.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "testing simple xattr wb"
touch x
setfattr -n user.foo x
diff --git a/qa/workunits/snaps/untar_snap_rm.sh b/qa/workunits/snaps/untar_snap_rm.sh
index 5c71212df75..89e2db0cd10 100755
--- a/qa/workunits/snaps/untar_snap_rm.sh
+++ b/qa/workunits/snaps/untar_snap_rm.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
do_tarball() {
wget http://ceph.com/qa/$1
tar xvf$2 $1
diff --git a/src/.gitignore b/src/.gitignore
index 4c98529bd87..6efe8dc6bc4 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -68,6 +68,7 @@ Makefile
/test_*
/cls_test_*
/unittest_*
+/get_command_descriptions
# old dir, may in use by older branches
/leveldb
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index cc9ffc62f12..6a4e09512a2 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -8,6 +8,7 @@ CLEANFILES =
noinst_HEADERS =
bin_PROGRAMS =
+noinst_PROGRAMS =
bin_SCRIPTS =
sbin_PROGRAMS =
sbin_SCRIPTS =
@@ -26,6 +27,12 @@ ceph_sbindir = $(exec_prefix)$(sbindir)
# C/C++ tests to build will be appended to this
check_PROGRAMS =
+# tests scripts will be appended to this
+check_SCRIPTS =
+
+# python unit tests need to know where the scripts are located
+export PYTHONPATH=$(top_srcdir)/src/pybind
+
# when doing a debug build, make sure to make the targets
if WITH_DEBUG
bin_PROGRAMS += $(bin_DEBUGPROGRAMS)
@@ -35,13 +42,16 @@ endif
##################################
## automake environment
-AM_COMMON_FLAGS = \
+AM_COMMON_CPPFLAGS = \
-D__CEPH__ \
-D_FILE_OFFSET_BITS=64 \
-D_REENTRANT \
-D_THREAD_SAFE \
-D__STDC_FORMAT_MACROS \
-D_GNU_SOURCE \
+ -DCEPH_LIBDIR=\"${libdir}\"
+
+AM_COMMON_CFLAGS = \
-rdynamic \
-Wall \
${WARN_TYPE_LIMITS} \
@@ -52,14 +62,11 @@ AM_COMMON_FLAGS = \
-fno-strict-aliasing \
-fsigned-char
-AM_CFLAGS = $(AM_COMMON_FLAGS)
-AM_CPPFLAGS = \
- $(AM_COMMON_FLAGS) \
- -DCEPH_LIBDIR=\"${libdir}\"
+AM_CFLAGS = $(AM_COMMON_CFLAGS)
+AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
AM_CXXFLAGS = \
@AM_CXXFLAGS@ \
- $(AM_COMMON_FLAGS) \
- -DCEPH_LIBDIR=\"${libdir}\" \
+ $(AM_COMMON_CFLAGS) \
-Wnon-virtual-dtor \
-Wno-invalid-offsetof \
-Wstrict-null-sentinel
diff --git a/src/Makefile.am b/src/Makefile.am
index c0ed016006d..280b268479e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -112,7 +112,7 @@ libcephfs_jni_la_SOURCES = \
java/native/JniConstants.cpp \
java/native/JniConstants.h
libcephfs_jni_la_LIBADD = $(LIBCEPHFS) $(EXTRALIBS)
-libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS)
+libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS)
libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
lib_LTLIBRARIES += libcephfs_jni.la
endif
@@ -251,10 +251,11 @@ shell_scripts += init-ceph mkcephfs
# executables built, you need to replace this with manual assignments
# target by target
-TESTS = $(check_PROGRAMS) unittest_bufferlist.sh
+TESTS = \
+ $(check_PROGRAMS) \
+ $(check_SCRIPTS)
check-local:
- $(srcdir)/test/encoding/check-generated.sh
$(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
@@ -294,12 +295,12 @@ CLEANFILES += ceph_ver.h sample.fetch_config
ceph: ceph.in ./ceph_ver.h Makefile
rm -f $@ $@.tmp
- echo "#!/usr/bin/python" >$@.tmp
+ echo "#!/usr/bin/env python" >$@.tmp
grep "#define CEPH_GIT_NICE_VER" ./ceph_ver.h | \
sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp
grep "#define CEPH_GIT_VER" ./ceph_ver.h | \
sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp
- cat $@.in >>$@.tmp
+ cat $(srcdir)/$@.in >>$@.tmp
chmod a+x $@.tmp
chmod a-w $@.tmp
mv $@.tmp $@
diff --git a/src/arch/intel.c b/src/arch/intel.c
index 0513da53c23..8b2d2ccab12 100644
--- a/src/arch/intel.c
+++ b/src/arch/intel.c
@@ -4,8 +4,7 @@
int ceph_arch_intel_sse42 = 0;
-/* this probably isn't specific enough for x86_64? fix me someday */
-#ifdef __LP64__
+#ifdef __x86_64__
/* intel cpu? */
static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
@@ -35,7 +34,7 @@ int ceph_arch_intel_probe(void)
return 0;
}
-#else // __LP64__
+#else // __x86_64__
int ceph_arch_intel_probe(void)
{
@@ -43,4 +42,4 @@ int ceph_arch_intel_probe(void)
return 0;
}
-#endif // __LP64__
+#endif // __x86_64__
diff --git a/src/ceph-create-keys b/src/ceph-create-keys
index 176b06e7a38..0359228d5f8 100755
--- a/src/ceph-create-keys
+++ b/src/ceph-create-keys
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import argparse
import errno
import json
diff --git a/src/ceph-disk b/src/ceph-disk
index 3d09bdf7418..64d944d9db0 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import argparse
import errno
@@ -570,7 +570,7 @@ def get_fsid(cluster):
fsid = get_conf(cluster=cluster, variable='fsid')
if fsid is None:
raise Error('getting cluster uuid from configuration failed')
- return fsid
+ return fsid.lower()
def get_or_create_dmcrypt_key(
@@ -888,15 +888,12 @@ def prepare_journal_dev(
def prepare_journal_file(
- journal,
- journal_size):
+ journal):
if not os.path.exists(journal):
- LOG.debug('Creating journal file %s with size %dM', journal, journal_size)
+ LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
with file(journal, 'wb') as journal_file:
- journal_file.truncate(journal_size * 1048576)
-
- # FIXME: should we resize an existing journal file?
+ pass
LOG.debug('Journal is file %s', journal)
LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
@@ -921,13 +918,13 @@ def prepare_journal(
if not os.path.exists(journal):
if force_dev:
raise Error('Journal does not exist; not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
jmode = os.stat(journal).st_mode
if stat.S_ISREG(jmode):
if force_dev:
raise Error('Journal is not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
if stat.S_ISBLK(jmode):
if force_file:
@@ -1604,6 +1601,7 @@ def find_cluster_by_uuid(_uuid):
Find a cluster name by searching /etc/ceph/*.conf for a conf file
with the right uuid.
"""
+ _uuid = _uuid.lower()
no_fsid = []
if not os.path.exists('/etc/ceph'):
return None
@@ -1611,11 +1609,15 @@ def find_cluster_by_uuid(_uuid):
if not conf_file.endswith('.conf'):
continue
cluster = conf_file[:-5]
- fsid = get_conf(cluster, 'fsid')
- if fsid is None:
+ try:
+ fsid = get_fsid(cluster)
+ except Error as e:
+ if e.message != 'getting cluster uuid from configuration failed':
+ raise e
no_fsid.append(cluster)
- elif fsid == _uuid:
- return cluster
+ else:
+ if fsid == _uuid:
+ return cluster
# be tolerant of /etc/ceph/ceph.conf without an fsid defined.
if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
LOG.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway')
diff --git a/src/ceph-rest-api b/src/ceph-rest-api
index ae5245b4f76..772b3d20fcd 100755
--- a/src/ceph-rest-api
+++ b/src/ceph-rest-api
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# vim: ts=4 sw=4 smarttab expandtab
import argparse
diff --git a/src/ceph.in b/src/ceph.in
index 320e4bd413f..075ec80c20b 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -476,6 +476,9 @@ def complete(sigdict, args, target):
###
def main():
+ ceph_args = os.environ.get('CEPH_ARGS')
+ if ceph_args:
+ sys.argv.extend(ceph_args.split())
parser, parsed_args, childargs = parse_cmdargs()
@@ -556,7 +559,6 @@ def main():
cluster_handle = rados.Rados(name=name, clustername=clustername,
conf_defaults=conf_defaults, conffile=conffile)
- cluster_handle.conf_parse_env()
retargs = cluster_handle.conf_parse_argv(childargs)
#tmp = childargs
childargs = retargs
@@ -642,7 +644,7 @@ def main():
if parsed_args.output_file:
try:
outf = open(parsed_args.output_file, 'w')
- except:
+ except Exception as e:
print >> sys.stderr, \
'Can\'t open output file {0}: {1}'.\
format(parsed_args.output_file, e)
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 3d517da1f89..2388762f1df 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -423,7 +423,7 @@ int main(int argc, const char **argv)
global_init_daemonize(g_ceph_context, 0);
common_init_finish(g_ceph_context);
- if (g_conf->filestore_update_to >= (int)FileStore::on_disk_version) {
+ if (g_conf->filestore_update_to >= (int)FileStore::target_version) {
int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
diff --git a/src/cls/Makefile.am b/src/cls/Makefile.am
index 0c04c64faa7..2d3d43cb1e3 100644
--- a/src/cls/Makefile.am
+++ b/src/cls/Makefile.am
@@ -59,11 +59,13 @@ libcls_lock_client_la_SOURCES = \
cls/lock/cls_lock_types.cc \
cls/lock/cls_lock_ops.cc
noinst_LTLIBRARIES += libcls_lock_client.la
+DENCODER_DEPS += libcls_lock_client.la
libcls_refcount_client_la_SOURCES = \
cls/refcount/cls_refcount_client.cc \
cls/refcount/cls_refcount_ops.cc
noinst_LTLIBRARIES += libcls_refcount_client.la
+DENCODER_DEPS += libcls_refcount_client.la
libcls_version_client_a_SOURCES = \
cls/version/cls_version_client.cc \
@@ -81,12 +83,14 @@ libcls_replica_log_client_a_SOURCES = \
cls/replica_log/cls_replica_log_ops.cc \
cls/replica_log/cls_replica_log_client.cc
noinst_LIBRARIES += libcls_replica_log_client.a
+DENCODER_DEPS += libcls_replica_log_client.a
libcls_rgw_client_la_SOURCES = \
cls/rgw/cls_rgw_client.cc \
cls/rgw/cls_rgw_types.cc \
cls/rgw/cls_rgw_ops.cc
noinst_LTLIBRARIES += libcls_rgw_client.la
+DENCODER_DEPS += libcls_rgw_client.la
libcls_rbd_client_la_SOURCES = cls/rbd/cls_rbd_client.cc
noinst_LTLIBRARIES += libcls_rbd_client.la
diff --git a/src/common/Cond.h b/src/common/Cond.h
index e6a13ae48bb..46fdf159112 100644
--- a/src/common/Cond.h
+++ b/src/common/Cond.h
@@ -32,8 +32,8 @@ class Cond {
Mutex *waiter_mutex;
// don't allow copying.
- void operator=(Cond &C) {}
- Cond( const Cond &C ) {}
+ void operator=(Cond &C);
+ Cond(const Cond &C);
public:
Cond() : waiter_mutex(NULL) {
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 4c027909b4d..9ec6c3e895b 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -4,6 +4,7 @@ libcommon_la_SOURCES = \
common/LogClient.cc \
common/LogEntry.cc \
common/PrebufferedStreambuf.cc \
+ common/SloppyCRCMap.cc \
common/BackTrace.cc \
common/perf_counters.cc \
common/Mutex.cc \
@@ -59,13 +60,17 @@ libcommon_la_SOURCES = \
common/pick_address.cc \
common/util.cc \
common/TextTable.cc \
- common/secret.c \
common/ceph_fs.cc \
common/ceph_hash.cc \
common/ceph_strings.cc \
common/ceph_frag.cc \
common/addr_parsing.c \
- common/hobject.cc
+ common/hobject.cc \
+ common/bloom_filter.cc
+
+if LINUX
+libcommon_la_SOURCES += common/secret.c
+endif
# these should go out of libcommon
libcommon_la_SOURCES += \
@@ -93,6 +98,7 @@ LIBCOMMON_DEPS += libcommon_crc.la
noinst_LTLIBRARIES += libcommon_crc.la
noinst_HEADERS += \
+ common/bloom_filter.hpp \
common/sctp_crc32.h \
common/crc32c_intel_baseline.h \
common/crc32c_intel_fast.h
@@ -117,6 +123,7 @@ noinst_HEADERS += \
common/LogClient.h \
common/LogEntry.h \
common/Preforker.h \
+ common/SloppyCRCMap.h \
common/WorkQueue.h \
common/PrioritizedQueue.h \
common/ceph_argparse.h \
diff --git a/src/common/Mutex.h b/src/common/Mutex.h
index 06e435d49df..e26a090703d 100644
--- a/src/common/Mutex.h
+++ b/src/common/Mutex.h
@@ -46,8 +46,8 @@ private:
PerfCounters *logger;
// don't allow copying.
- void operator=(Mutex &M) {}
- Mutex( const Mutex &M ) {}
+ void operator=(Mutex &M);
+ Mutex(const Mutex &M);
void _register() {
id = lockdep_register(name);
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
new file mode 100644
index 00000000000..7924ae6e8a7
--- /dev/null
+++ b/src/common/SloppyCRCMap.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+
+void SloppyCRCMap::write(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *out)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ if (out)
+ *out << "write invalidate " << (offset - o) << "\n";
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ bufferlist t;
+ t.substr_of(bl, pos - offset, block_size);
+ crc_map[pos] = t.crc32c(crc_iv);
+ if (out)
+ *out << "write set " << pos << " " << crc_map[pos] << "\n";
+ pos += block_size;
+ left -= block_size;
+ }
+ if (left > 0) {
+ crc_map.erase(pos);
+ if (out)
+ *out << "write invalidate " << pos << "\n";
+ }
+}
+
+int SloppyCRCMap::read(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *err)
+{
+ int errors = 0;
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ // FIXME: this could be more efficient if we avoid doing a find()
+ // on each iteration
+ std::map<uint64_t,uint32_t>::iterator p = crc_map.find(pos);
+ if (p != crc_map.end()) {
+ bufferlist t;
+ t.substr_of(bl, pos - offset, block_size);
+ uint32_t crc = t.crc32c(crc_iv);
+ if (p->second != crc) {
+ errors++;
+ if (err)
+ *err << "offset " << pos << " len " << block_size
+ << " has crc " << crc << " expected " << p->second << "\n";
+ }
+ }
+ pos += block_size;
+ left -= block_size;
+ }
+ return errors;
+}
+
+void SloppyCRCMap::truncate(uint64_t offset)
+{
+ offset -= offset % block_size;
+ std::map<uint64_t,uint32_t>::iterator p = crc_map.lower_bound(offset);
+ while (p != crc_map.end())
+ crc_map.erase(p++);
+}
+
+void SloppyCRCMap::zero(uint64_t offset, uint64_t len)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ crc_map[pos] = zero_crc;
+ pos += block_size;
+ left -= block_size;
+ }
+ if (left > 0)
+ crc_map.erase(pos);
+}
+
+void SloppyCRCMap::clone_range(uint64_t offset, uint64_t len,
+ uint64_t srcoff, const SloppyCRCMap& src,
+ std::ostream *out)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ uint64_t srcpos = srcoff;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ pos += (block_size - o);
+ srcpos += (block_size - o);
+ left -= (block_size - o);
+ if (out)
+ *out << "clone_range invalidate " << (offset - o) << "\n";
+ }
+ while (left >= block_size) {
+ // FIXME: this could be more efficient.
+ if (block_size == src.block_size) {
+ map<uint64_t,uint32_t>::const_iterator p = src.crc_map.find(srcpos);
+ if (p != src.crc_map.end()) {
+ crc_map[pos] = p->second;
+ if (out)
+ *out << "clone_range copy " << pos << " " << p->second << "\n";
+ } else {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+ } else {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+ pos += block_size;
+ srcpos += block_size;
+ left -= block_size;
+ }
+ if (left > 0) {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+}
+
+void SloppyCRCMap::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(block_size, bl);
+ ::encode(crc_map, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SloppyCRCMap::decode(bufferlist::iterator& bl)
+{
+ DECODE_START(1, bl);
+ uint32_t bs;
+ ::decode(bs, bl);
+ set_block_size(bs);
+ ::decode(crc_map, bl);
+ DECODE_FINISH(bl);
+}
+
+void SloppyCRCMap::dump(Formatter *f) const
+{
+ f->dump_unsigned("block_size", block_size);
+ f->open_array_section("crc_map");
+ for (map<uint64_t,uint32_t>::const_iterator p = crc_map.begin(); p != crc_map.end(); ++p) {
+ f->open_object_section("crc");
+ f->dump_unsigned("offset", p->first);
+ f->dump_unsigned("crc", p->second);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void SloppyCRCMap::generate_test_instances(list<SloppyCRCMap*>& ls)
+{
+ ls.push_back(new SloppyCRCMap);
+ ls.push_back(new SloppyCRCMap(2));
+ bufferlist bl;
+ bl.append("some data");
+ ls.back()->write(1, bl.length(), bl);
+ ls.back()->write(10, bl.length(), bl);
+ ls.back()->zero(4, 2);
+}
diff --git a/src/common/SloppyCRCMap.h b/src/common/SloppyCRCMap.h
new file mode 100644
index 00000000000..c07b4d9bb9d
--- /dev/null
+++ b/src/common/SloppyCRCMap.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SLOPPYCRCMAP_H
+#define CEPH_COMMON_SLOPPYCRCMAP_H
+
+#include "include/types.h"
+#include "include/encoding.h"
+
+#include <map>
+#include <ostream>
+
+/**
+ * SloppyCRCMap
+ *
+ * Opportunistically track CRCs on any reads or writes that cover full
+ * blocks. Verify read results when we have CRC data available for
+ * the given extent.
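+ *
+ * A minimal usage sketch ( illustrative; the 4 KB block size is an
+ * arbitrary choice ):
+ *
+ *   SloppyCRCMap m(4096);
+ *   m.write(off, bl.length(), bl);         // record CRCs of full blocks
+ *   int errors = m.read(off, bl.length(), bl, &std::cerr);
+ *   // errors == 0 when every covered block matches its recorded CRC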
+ */
+class SloppyCRCMap {
+ static const int crc_iv = 0xffffffff;
+
+ std::map<uint64_t, uint32_t> crc_map; // offset -> crc(-1)
+ uint32_t block_size;
+ uint32_t zero_crc;
+
+public:
+ SloppyCRCMap(uint32_t b=0) {
+ set_block_size(b);
+ }
+
+ void set_block_size(uint32_t b) {
+ block_size = b;
+ //zero_crc = ceph_crc32c(0xffffffff, NULL, block_size);
+ if (b) {
+ bufferlist bl;
+ bufferptr bp(block_size);
+ bp.zero();
+ bl.append(bp);
+ zero_crc = bl.crc32c(crc_iv);
+ } else {
+ zero_crc = crc_iv;
+ }
+ }
+
+ /// update based on a write
+ void write(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *out = NULL);
+
+ /// update based on a truncate
+ void truncate(uint64_t offset);
+
+ /// update based on a zero/punch_hole
+ void zero(uint64_t offset, uint64_t len);
+
+  /// update based on a clone_range
+ void clone_range(uint64_t offset, uint64_t len, uint64_t srcoff, const SloppyCRCMap& src,
+ std::ostream *out = NULL);
+
+ /**
+ * validate a read result
+ *
+   * @param offset offset of the read
+   * @param len length of the read
+   * @param bl data read
+   * @param err optional ostream to describe errors in detail
+ * @returns error count, 0 for success
+ */
+ int read(uint64_t offset, uint64_t len, const bufferlist& bl, std::ostream *err);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<SloppyCRCMap*>& ls);
+};
+WRITE_CLASS_ENCODER(SloppyCRCMap)
+
+#endif
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 9e00c14b178..44e03905759 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -136,7 +136,7 @@ protected:
string current; /// the current state the event is in
uint64_t seq; /// a unique value set by the OpTracker
- uint8_t warn_interval_multiplier; // limits output of a given op warning
+ uint32_t warn_interval_multiplier; // limits output of a given op warning
TrackedOp(Message *req, OpTracker *_tracker) :
xitem(this),
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index b2742accdce..794b577a71d 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -390,6 +390,43 @@ public:
void drain(WorkQueue_* wq = 0);
};
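+
+/**
+ * GenContextWQ
+ *
+ * A work queue of GenContext<ThreadPool::TPHandle&> items; each queued
+ * context is completed from a thread pool worker in _process().  The
+ * C_QueueInWQ adapter below requeues its payload into such a queue
+ * when it fires.
+ */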
+class GenContextWQ :
+ public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+ list<GenContext<ThreadPool::TPHandle&>*> _queue;
+public:
+ GenContextWQ(const string &name, time_t ti, ThreadPool *tp)
+ : ThreadPool::WorkQueueVal<
+ GenContext<ThreadPool::TPHandle&>*>(name, ti, ti*10, tp) {}
+
+ void _enqueue(GenContext<ThreadPool::TPHandle&> *c) {
+ _queue.push_back(c);
+  }
+ void _enqueue_front(GenContext<ThreadPool::TPHandle&> *c) {
+ _queue.push_front(c);
+ }
+ bool _empty() {
+ return _queue.empty();
+ }
+ GenContext<ThreadPool::TPHandle&> *_dequeue() {
+ assert(!_queue.empty());
+ GenContext<ThreadPool::TPHandle&> *c = _queue.front();
+ _queue.pop_front();
+ return c;
+ }
+ void _process(GenContext<ThreadPool::TPHandle&> *c, ThreadPool::TPHandle &tp) {
+ c->complete(tp);
+ }
+};
+class C_QueueInWQ : public Context {
+ GenContextWQ *wq;
+ GenContext<ThreadPool::TPHandle&> *c;
+public:
+ C_QueueInWQ(GenContextWQ *wq, GenContext<ThreadPool::TPHandle &> *c)
+ : wq(wq), c(c) {}
+ void finish(int) {
+ wq->queue(c);
+ }
+};
#endif
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 00000000000..f602b80149e
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "common/bloom_filter.hpp"
+
+void bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode((uint64_t)salt_count_, bl);
+ ::encode((uint64_t)table_size_, bl);
+ ::encode((uint64_t)inserted_element_count_, bl);
+ ::encode((uint64_t)random_seed_, bl);
+ bufferptr bp((const char*)bit_table_, raw_table_size_);
+ ::encode(bp, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bloom_filter::decode(bufferlist::iterator& p)
+{
+ DECODE_START(1, p);
+ uint64_t v;
+ ::decode(v, p);
+ salt_count_ = v;
+ ::decode(v, p);
+ table_size_ = v;
+ ::decode(v, p);
+ inserted_element_count_ = v;
+ ::decode(v, p);
+ random_seed_ = v;
+ bufferlist t;
+ ::decode(t, p);
+
+ salt_.clear();
+ generate_unique_salt();
+ raw_table_size_ = t.length();
+ assert(raw_table_size_ == table_size_ / bits_per_char);
+  delete[] bit_table_;
+ bit_table_ = new cell_type[raw_table_size_];
+ t.copy(0, raw_table_size_, (char *)bit_table_);
+
+ DECODE_FINISH(p);
+}
+
+void bloom_filter::dump(Formatter *f) const
+{
+ f->dump_unsigned("salt_count", salt_count_);
+ f->dump_unsigned("table_size", table_size_);
+ f->dump_unsigned("raw_table_size", raw_table_size_);
+ f->dump_unsigned("insert_count", inserted_element_count_);
+ f->dump_unsigned("random_seed", random_seed_);
+
+ f->open_array_section("salt_table");
+ for (std::vector<bloom_type>::const_iterator i = salt_.begin(); i != salt_.end(); ++i)
+ f->dump_unsigned("salt", *i);
+ f->close_section();
+
+ f->open_array_section("bit_table");
+ for (unsigned i = 0; i < raw_table_size_; ++i)
+ f->dump_unsigned("byte", (unsigned)bit_table_[i]);
+ f->close_section();
+}
+
+void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
+{
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.push_back(new bloom_filter(50, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.back()->insert("baz");
+ ls.back()->insert("boof");
+ ls.back()->insert("boogggg");
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 00000000000..6216c7fb34d
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,627 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ * *
+ * Open Bloom Filter *
+ * *
+ * Author: Arash Partow - 2000 *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ * *
+ * Copyright notice: *
+ * Free use of the Open Bloom Filter Library is permitted under *
+ * the guidelines and in accordance with the most current version *
+ * of the Boost Software License, Version 1.0 *
+ * http://www.opensource.org/licenses/bsl1.0.html *
+ * *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cstddef>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
+static const unsigned char bit_mask[bits_per_char] = {
+ 0x01, //00000001
+ 0x02, //00000010
+ 0x04, //00000100
+ 0x08, //00001000
+ 0x10, //00010000
+ 0x20, //00100000
+ 0x40, //01000000
+ 0x80 //10000000
+};
+
+
+class bloom_filter
+{
+protected:
+
+ typedef unsigned int bloom_type;
+ typedef unsigned char cell_type;
+
+public:
+
+ bloom_filter()
+ : bit_table_(0),
+ salt_count_(0),
+ table_size_(0),
+ raw_table_size_(0),
+ inserted_element_count_(0),
+ random_seed_(0)
+ {}
+
+ bloom_filter(const std::size_t& predicted_inserted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ inserted_element_count_(0),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
+ &salt_count_, &table_size_);
+ init();
+ }
+
+ bloom_filter(const std::size_t& salt_count, std::size_t table_size,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ salt_count_(salt_count),
+ table_size_(table_size),
+ inserted_element_count_(0),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ init();
+ }
+
+ void init() {
+ generate_unique_salt();
+ raw_table_size_ = table_size_ / bits_per_char;
+ bit_table_ = new cell_type[raw_table_size_];
+ std::fill_n(bit_table_,raw_table_size_,0x00);
+ }
+
+ bloom_filter(const bloom_filter& filter)
+ {
+ this->operator=(filter);
+ }
+
+ bloom_filter& operator = (const bloom_filter& filter)
+ {
+ if (this != &filter) {
+ salt_count_ = filter.salt_count_;
+ table_size_ = filter.table_size_;
+ raw_table_size_ = filter.raw_table_size_;
+ inserted_element_count_ = filter.inserted_element_count_;
+ random_seed_ = filter.random_seed_;
+ delete[] bit_table_;
+ bit_table_ = new cell_type[raw_table_size_];
+ std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
+ salt_ = filter.salt_;
+ }
+ return *this;
+ }
+
+ virtual ~bloom_filter()
+ {
+ delete[] bit_table_;
+ }
+
+ inline bool operator!() const
+ {
+ return (0 == table_size_);
+ }
+
+ inline void clear()
+ {
+ std::fill_n(bit_table_,raw_table_size_,0x00);
+ inserted_element_count_ = 0;
+ }
+
+ /**
+ * insert a u32 into the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to insert
+ */
+ inline void insert(uint32_t val) {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ }
+ ++inserted_element_count_;
+ }
+
+ inline void insert(const unsigned char* key_begin, const std::size_t& length)
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ }
+ ++inserted_element_count_;
+ }
+
+ template<typename T>
+ inline void insert(const T& t)
+ {
+ // Note: T must be a C++ POD type.
+ insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
+ }
+
+ inline void insert(const std::string& key)
+ {
+ insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline void insert(const char* data, const std::size_t& length)
+ {
+ insert(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline void insert(const InputIterator begin, const InputIterator end)
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ insert(*(itr++));
+ }
+ }
+
+ /**
+   * check if a u32 is contained by the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to query
+ * @returns true if value is (probably) in the set, false if it definitely is not
+ */
+ inline virtual bool contains(uint32_t val) const
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ template<typename T>
+ inline bool contains(const T& t) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
+ }
+
+ inline bool contains(const std::string& key) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline bool contains(const char* data, const std::size_t& length) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (!contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return table_size_;
+ }
+
+ inline std::size_t element_count() const
+ {
+ return inserted_element_count_;
+ }
+
+ inline double effective_fpp() const
+ {
+ /*
+ Note:
+ The effective false positive probability is calculated using the
+ designated table size and hash function count in conjunction with
+ the current number of inserted elements - not the user defined
+      predicted/expected number of inserted elements.
+ */
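+    // i.e. fpp = ( 1 - e^( -k*n/m ) )^k, with k = salt_.size(),
+    // n = inserted_element_count_ and m = size() ( table size in bits )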
+ return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
+ }
+
+ inline bloom_filter& operator &= (const bloom_filter& filter)
+ {
+ /* intersection */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] &= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator |= (const bloom_filter& filter)
+ {
+ /* union */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] |= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator ^= (const bloom_filter& filter)
+ {
+ /* difference */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] ^= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline const cell_type* table() const
+ {
+ return bit_table_;
+ }
+
+protected:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash % table_size_;
+ bit = bit_index % bits_per_char;
+ }
+
+ void generate_unique_salt()
+ {
+ /*
+ Note:
+ A distinct hash function need not be implementation-wise
+ distinct. In the current implementation "seeding" a common
+ hash function with different values seems to be adequate.
+ */
+ const unsigned int predef_salt_count = 128;
+ static const bloom_type predef_salt[predef_salt_count] = {
+ 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+ 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+ 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+ 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+ 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+ 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+ 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+ 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+ 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+ 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+ 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+ 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+ 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+ 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+ 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+ 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+ 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+ 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+ 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+ 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+ 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+ 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+ 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+ 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+ 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+ 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+ 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+ 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+ 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+ 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+ 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+ 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+ };
+
+ if (salt_count_ <= predef_salt_count)
+ {
+ std::copy(predef_salt,
+ predef_salt + salt_count_,
+ std::back_inserter(salt_));
+ for (unsigned int i = 0; i < salt_.size(); ++i)
+ {
+ /*
+ Note:
+ This is done to integrate the user defined random seed,
+ so as to allow for the generation of unique bloom filter
+ instances.
+ */
+ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
+ }
+ }
+ else
+ {
+ std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
+ srand(static_cast<unsigned int>(random_seed_));
+ while (salt_.size() < salt_count_)
+ {
+ bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+ if (0 == current_salt)
+ continue;
+ if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
+ {
+ salt_.push_back(current_salt);
+ }
+ }
+ }
+ }
+
+ static void find_optimal_parameters(std::size_t target_insert_count,
+ double target_fpp,
+ std::size_t *salt_count,
+ std::size_t *table_size)
+ {
+ /*
+ Note:
+ The following will attempt to find the number of hash functions
+ and minimum amount of storage bits required to construct a bloom
+ filter consistent with the user defined false positive probability
+ and estimated element insertion count.
+ */
+
+ double min_m = std::numeric_limits<double>::infinity();
+ double min_k = 0.0;
+ double curr_m = 0.0;
+ double k = 1.0;
+ while (k < 1000.0)
+ {
+ double numerator = (- k * target_insert_count);
+ double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+ curr_m = numerator / denominator;
+
+ if (curr_m < min_m)
+ {
+ min_m = curr_m;
+ min_k = k;
+ }
+ k += 1.0;
+ }
+
+ *salt_count = static_cast<std::size_t>(min_k);
+ size_t t = static_cast<std::size_t>(min_m);
+ t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0);
+ *table_size = t;
+ }
+
+ inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
+ {
+ hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+ return hash;
+ }
+
+ inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+ {
+ const unsigned char* itr = begin;
+
+ while (remaining_length >= 4)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 4;
+ }
+
+ while (remaining_length >= 2)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 2;
+ }
+
+ if (remaining_length)
+ {
+ hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
+ }
+
+ return hash;
+ }
+
+ std::vector<bloom_type> salt_;
+ unsigned char* bit_table_;
+ std::size_t salt_count_;
+ std::size_t table_size_;
+ std::size_t raw_table_size_;
+ std::size_t inserted_element_count_;
+ std::size_t random_seed_;
+
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result &= b;
+ return result;
+}
+
+inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result |= b;
+ return result;
+}
+
+inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result ^= b;
+ return result;
+}
+
+
+class compressible_bloom_filter : public bloom_filter
+{
+public:
+
+ compressible_bloom_filter(const std::size_t& predicted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
+ {
+ size_list.push_back(table_size_);
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return size_list.back();
+ }
+
+ inline bool compress(const double& percentage)
+ {
+ if ((0.0 >= percentage) || (percentage >= 100.0))
+ {
+ return false;
+ }
+
+ std::size_t original_table_size = size_list.back();
+ std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
+ new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
+
+ if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
+ {
+ return false;
+ }
+
+ cell_type* tmp = new cell_type[new_table_size / bits_per_char];
+ std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
+ cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
+ cell_type* end = bit_table_ + (original_table_size / bits_per_char);
+ cell_type* itr_tmp = tmp;
+
+ while (end != itr)
+ {
+ *(itr_tmp++) |= (*itr++);
+ }
+
+ delete[] bit_table_;
+ bit_table_ = tmp;
+ size_list.push_back(new_table_size);
+
+ return true;
+ }
+
+private:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash;
+ for (std::size_t i = 0; i < size_list.size(); ++i)
+ {
+ bit_index %= size_list[i];
+ }
+ bit = bit_index % bits_per_char;
+ }
+
+ std::vector<std::size_t> size_list;
+};
+
+#endif
+
+
+/*
+ Note 1:
+ If it can be guaranteed that bits_per_char will be of the form 2^n then
+ the following optimization can be used:
+
+ hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
+
+ Note 2:
+ For performance reasons where possible when allocating memory it should
+ be aligned (aligned_alloc) according to the architecture being used.
+*/
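
For reference, the relocated filter's calling convention in miniature. This is a hedged sketch: it assumes the (predicted element count, false positive probability, random seed) constructor of the old include/bloom_filter.hpp survives the move, and that the insert/contains overloads shown above are unchanged.

    #include "common/bloom_filter.hpp"  // new location per this commit
    #include <iostream>
    #include <string>

    int main() {
      // ~1000 expected insertions, 1% target false positive rate, fixed seed
      bloom_filter bf(1000, 0.01, 12345);
      bf.insert(std::string("rbd_header.1000"));
      bf.insert(std::string("rbd_header.1001"));

      std::cout << "hit: " << bf.contains(std::string("rbd_header.1000"))
                << " inserted: " << bf.element_count()
                << " effective fpp: " << bf.effective_fpp() << std::endl;
      // operator|= / operator&= / operator^= silently no-op unless salt
      // count, table size, and seed all match (see the guards above).
      return 0;
    }
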
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 0424887139e..8da4c106d1b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -990,11 +990,14 @@ void buffer::list::rebuild_page_aligned()
*/
char *buffer::list::c_str()
{
- if (_buffers.size() == 0)
+ if (_buffers.empty())
return 0; // no buffers
- if (_buffers.size() > 1)
+
+ std::list<ptr>::const_iterator iter = _buffers.begin();
+ iter++;
+
+ if (iter != _buffers.end())
rebuild();
- assert(_buffers.size() == 1);
return _buffers.front().c_str(); // good, we're already contiguous.
}
@@ -1267,6 +1270,15 @@ int buffer::list::write_fd(int fd) const
return 0;
}
+__u32 buffer::list::crc32c(__u32 crc) const
+{
+ for (std::list<ptr>::const_iterator it = _buffers.begin();
+ it != _buffers.end();
+ ++it)
+ if (it->length())
+ crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
+ return crc;
+}
void buffer::list::hexdump(std::ostream &out) const
{
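
The new out-of-line crc32c() folds each non-empty segment into a running CRC instead of flattening the list first; this works because a CRC over a concatenation can be computed by seeding each call with the previous result. A standalone sketch of the same chaining pattern, using zlib's CRC-32 as a stand-in for ceph_crc32c (link with -lz):

    #include <zlib.h>
    #include <cassert>
    #include <cstring>

    int main() {
      const char* segs[] = { "foo", "", "bar" };   // "" mimics an empty ptr
      uLong crc = crc32(0L, Z_NULL, 0);            // initial seed
      for (const char* s : segs)
        if (std::strlen(s))                        // skip empty segments
          crc = crc32(crc, reinterpret_cast<const Bytef*>(s), std::strlen(s));

      uLong whole = crc32(crc32(0L, Z_NULL, 0),
                          reinterpret_cast<const Bytef*>("foobar"), 6);
      assert(crc == whole);                        // chained == one-shot
      return 0;
    }
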
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
index 2950a81f89d..6c8053897f3 100644
--- a/src/common/ceph_argparse.cc
+++ b/src/common/ceph_argparse.cc
@@ -464,18 +464,19 @@ CephInitParameters ceph_argparse_early_args
static void generic_usage(bool is_server)
{
cout << "\
- --conf/-c Read configuration from the given configuration file\n\
- --id/-i set ID portion of my name\n\
- --name/-n set name (TYPE.ID)\n\
- --version show version and quit\n\
+ --conf/-c FILE read configuration from the given configuration file\n\
+ --id/-i ID set ID portion of my name\n\
+ --name/-n TYPE.ID set name\n\
+ --cluster NAME set cluster name (default: ceph)\n\
+ --version show version and quit\n\
" << std::endl;
if (is_server) {
cout << "\
- -d Run in foreground, log to stderr.\n\
- -f Run in foreground, log to usual location.\n";
- cout << " --debug_ms N\n";
- cout << " set message debug level (e.g. 1)\n";
+ -d run in foreground, log to stderr.\n\
+ -f run in foreground, log to usual location.\n";
+ cout << "\
+ --debug_ms N set message debug level (e.g. 1)\n";
}
}
diff --git a/src/common/ceph_json.cc b/src/common/ceph_json.cc
index 84355575c6c..a48e0636fcf 100644
--- a/src/common/ceph_json.cc
+++ b/src/common/ceph_json.cc
@@ -222,9 +222,7 @@ bool JSONParser::parse(const char *buf_, int len)
return false;
}
- string json_string = buf_;
- // make a substring to len
- json_string = json_string.substr(0, len);
+ string json_string(buf_, len);
success = read(json_string, data);
if (success)
handle_value(data);
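
The JSONParser change replaces copy-then-substr with the (pointer, length) std::string constructor: one copy instead of two, and correct even when buf_ is not NUL-terminated at len or contains embedded NULs. The difference in miniature:

    #include <cassert>
    #include <string>

    int main() {
      const char buf[] = "abc\0def";           // 7 bytes plus trailing NUL
      std::string a = buf;                     // stops at the embedded NUL
      std::string b(buf, sizeof(buf) - 1);     // takes exactly 7 bytes
      assert(a.size() == 3 && b.size() == 7);
      return 0;
    }
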
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
index cd08083967a..47648ce19b3 100644
--- a/src/common/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -50,6 +50,8 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_COPY_GET: return "copy-get";
case CEPH_OSD_OP_COPY_FROM: return "copy-from";
+ case CEPH_OSD_OP_UNDIRTY: return "undirty";
+ case CEPH_OSD_OP_ISDIRTY: return "isdirty";
case CEPH_OSD_OP_CLONERANGE: return "clonerange";
case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
diff --git a/src/common/code_environment.cc b/src/common/code_environment.cc
index 2cf19f48bc5..662fa36c9bd 100644
--- a/src/common/code_environment.cc
+++ b/src/common/code_environment.cc
@@ -11,6 +11,7 @@
* Foundation. See file COPYING.
*
*/
+#include "acconfig.h"
#include "common/code_environment.h"
@@ -19,7 +20,8 @@
#include <stdlib.h>
#include <string.h>
#include <string>
-#if defined(__linux__)
+
+#ifdef HAVE_SYS_PRCTL_H
#include <sys/prctl.h>
#endif
@@ -45,6 +47,8 @@ std::ostream &operator<<(std::ostream &oss, enum code_environment_t e)
return oss;
}
+#if defined(HAVE_SYS_PRCTL_H) && defined(PR_GET_NAME) /* Since 2.6.11 */
+
int get_process_name(char *buf, int len)
{
if (len <= 16) {
@@ -53,17 +57,19 @@ int get_process_name(char *buf, int len)
* null-terminated. */
return -ENAMETOOLONG;
}
-#if defined(__FreeBSD__)
-#warning XXX
- return -ENAMETOOLONG;
-#else
memset(buf, 0, len);
- int ret;
- ret = prctl(PR_GET_NAME, buf);
- return ret;
-#endif
+ return prctl(PR_GET_NAME, buf);
}
+#else
+
+int get_process_name(char *buf, int len)
+{
+ return -ENOSYS;
+}
+
+#endif
+
std::string get_process_name_cpp()
{
char buf[32];
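
get_process_name() now keys off HAVE_SYS_PRCTL_H rather than __linux__, with an -ENOSYS fallback. A Linux-only sketch of the underlying call; PR_GET_NAME writes at most 16 bytes including the terminating NUL, which is why the function above rejects buffers of 16 bytes or fewer:

    #include <sys/prctl.h>
    #include <cstdio>
    #include <cstring>

    int main() {
      char buf[17];                        // more than 16 bytes, as required
      std::memset(buf, 0, sizeof(buf));
      if (prctl(PR_GET_NAME, buf) == 0)    // Linux-specific
        std::printf("process name: %s\n", buf);
      return 0;
    }
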
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index f6283239660..fad831f5543 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -158,6 +158,8 @@ OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock
OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted
OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20) // min # pgs per (in) osd before we warn the admin
+OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew from average in objects per pg
OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc
@@ -360,12 +362,6 @@ OPTION(mds_standby_replay, OPT_BOOL, false)
// If true, compact leveldb store on mount
OPTION(osd_compact_leveldb_on_mount, OPT_BOOL, false)
-// If true, uses tmap as initial value for omap on old objects
-OPTION(osd_auto_upgrade_tmap, OPT_BOOL, true)
-
-// If true, TMAPPUT sets uses_tmap DEBUGGING ONLY
-OPTION(osd_tmapput_sets_uses_tmap, OPT_BOOL, false)
-
// Maximum number of backfills to or from a single osd
OPTION(osd_max_backfills, OPT_U64, 10)
@@ -520,7 +516,7 @@ OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
OPTION(osd_max_object_size, OPT_U64, 100*1024L*1024L*1024L) // OSD's maximum object size
-OPTION(osd_max_attr_size, OPT_U64, 65536)
+OPTION(osd_max_attr_size, OPT_U64, 0)
OPTION(filestore, OPT_BOOL, false)
@@ -555,6 +551,9 @@ OPTION(filestore_max_inline_xattr_size, OPT_U32, 512)
// for more than filestore_max_inline_xattrs attrs
OPTION(filestore_max_inline_xattrs, OPT_U32, 2)
+OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs
+OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
+
OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds
OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds
OPTION(filestore_btrfs_snap, OPT_BOOL, true)
diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c
index 0532dd261cf..42338a7bcd4 100644
--- a/src/common/crc32c_intel_fast.c
+++ b/src/common/crc32c_intel_fast.c
@@ -1,6 +1,5 @@
#include "acconfig.h"
#include "include/int_types.h"
-
#include "common/crc32c_intel_baseline.h"
extern unsigned int crc32_iscsi_00(unsigned char const *buffer, int len, unsigned int crc);
diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h
index 7a394a0b82c..26a444f6061 100644
--- a/src/common/crc32c_intel_fast.h
+++ b/src/common/crc32c_intel_fast.h
@@ -8,7 +8,7 @@ extern "C" {
/* is the fast version compiled in */
extern int ceph_crc32c_intel_fast_exists(void);
-#ifdef __LP64__
+#ifdef __x86_64__
extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len);
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index d6273693c62..b68baedd524 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -191,3 +191,90 @@ ostream& operator<<(ostream& out, const hobject_t& o)
out << "/" << o.nspace << "/" << o.pool;
return out;
}
+
+// This is compatible with decode for hobject_t prior to
+// version 5.
+void ghobject_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(5, 3, bl);
+ ::encode(hobj.key, bl);
+ ::encode(hobj.oid, bl);
+ ::encode(hobj.snap, bl);
+ ::encode(hobj.hash, bl);
+ ::encode(hobj.max, bl);
+ ::encode(hobj.nspace, bl);
+ ::encode(hobj.pool, bl);
+ ::encode(generation, bl);
+ ::encode(shard_id, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ghobject_t::decode(bufferlist::iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+ if (struct_v >= 1)
+ ::decode(hobj.key, bl);
+ ::decode(hobj.oid, bl);
+ ::decode(hobj.snap, bl);
+ ::decode(hobj.hash, bl);
+ if (struct_v >= 2)
+ ::decode(hobj.max, bl);
+ else
+ hobj.max = false;
+ if (struct_v >= 4) {
+ ::decode(hobj.nspace, bl);
+ ::decode(hobj.pool, bl);
+ }
+ if (struct_v >= 5) {
+ ::decode(generation, bl);
+ ::decode(shard_id, bl);
+ } else {
+ generation = ghobject_t::NO_GEN;
+ shard_id = ghobject_t::NO_SHARD;
+ }
+ DECODE_FINISH(bl);
+}
+
+void ghobject_t::dump(Formatter *f) const
+{
+ hobj.dump(f);
+ if (generation != NO_GEN) {
+ f->dump_int("generation", generation);
+ f->dump_int("shard_id", shard_id);
+ }
+}
+
+void ghobject_t::generate_test_instances(list<ghobject_t*>& o)
+{
+ o.push_back(new ghobject_t);
+ o.push_back(new ghobject_t);
+ o.back()->hobj.max = true;
+ o.push_back(new ghobject_t(hobject_t(object_t("oname"), string(), 1, 234, -1, "")));
+
+ o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+ 67, 0, "n1"), 1, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+ 67, 0, "n1"), 1, 1));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+ 67, 0, "n1"), 1, 2));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 1, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 2, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 3, 0));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 3, 1));
+ o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+ CEPH_SNAPDIR, 910, 1, "n2"), 3, 2));
+}
+
+ostream& operator<<(ostream& out, const ghobject_t& o)
+{
+ out << o.hobj;
+ if (o.generation != ghobject_t::NO_GEN) {
+ assert(o.shard_id != ghobject_t::NO_SHARD);
+ out << "/" << o.generation << "/" << o.shard_id;
+ }
+ return out;
+}
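
ghobject_t::decode() shows the usual Ceph compatibility pattern: fields are appended in new struct versions, and a newer decoder fills in defaults when the encoded struct_v predates a field (here NO_GEN/NO_SHARD for v < 5). A self-contained sketch of the idea without the ENCODE_START/DECODE_START machinery; Blob and Obj are illustrative stand-ins:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Blob {                         // stand-in for a bufferlist
      std::vector<uint8_t> data;
      size_t off = 0;
      void put(const void* p, size_t n) {
        const uint8_t* b = static_cast<const uint8_t*>(p);
        data.insert(data.end(), b, b + n);
      }
      void get(void* p, size_t n) { std::memcpy(p, &data[off], n); off += n; }
    };

    struct Obj {
      uint64_t hash = 0;
      uint64_t generation = UINT64_MAX;   // "NO_GEN" default

      void encode(Blob& bl) const {
        uint8_t struct_v = 5;             // always encode the newest version
        bl.put(&struct_v, 1);
        bl.put(&hash, sizeof(hash));
        bl.put(&generation, sizeof(generation));
      }
      void decode(Blob& bl) {
        uint8_t struct_v = 0;
        bl.get(&struct_v, 1);
        bl.get(&hash, sizeof(hash));
        if (struct_v >= 5)                // field added in v5
          bl.get(&generation, sizeof(generation));
        else
          generation = UINT64_MAX;        // default for older encodings
      }
    };

    int main() {
      Obj a; a.hash = 42;
      Blob bl; a.encode(bl);
      Obj b; b.decode(bl);
      return (b.hash == 42 && b.generation == UINT64_MAX) ? 0 : 1;
    }
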
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 633e471dffc..82eecf3bfc7 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -79,6 +79,30 @@ public:
return ret;
}
+ /// @return head version of this hobject_t
+ hobject_t get_head() const {
+ hobject_t ret(*this);
+ ret.snap = CEPH_NOSNAP;
+ return ret;
+ }
+
+ /// @return snapdir version of this hobject_t
+ hobject_t get_snapdir() const {
+ hobject_t ret(*this);
+ ret.snap = CEPH_SNAPDIR;
+ return ret;
+ }
+
+ /// @return true if object is neither head nor snapdir
+ bool is_snap() const {
+ return (snap != CEPH_NOSNAP) && (snap != CEPH_SNAPDIR);
+ }
+
+ /// @return true iff the object should have a snapset in its attrs
+ bool has_snapset() const {
+ return !is_snap();
+ }
+
/* Do not use when a particular hash function is needed */
explicit hobject_t(const sobject_t &o) :
oid(o.oid), snap(o.snap), max(false), pool(-1) {
@@ -138,7 +162,7 @@ public:
(*this) = temp;
}
- string get_namespace() const {
+ const string &get_namespace() const {
return nspace;
}
@@ -153,6 +177,7 @@ public:
friend bool operator>=(const hobject_t&, const hobject_t&);
friend bool operator==(const hobject_t&, const hobject_t&);
friend bool operator!=(const hobject_t&, const hobject_t&);
+ friend class ghobject_t;
};
WRITE_CLASS_ENCODER(hobject_t)
@@ -179,4 +204,98 @@ WRITE_CMP_OPERATORS_7(hobject_t,
oid,
snap)
+typedef uint64_t gen_t;
+typedef uint8_t shard_t;
+
+#ifndef UINT8_MAX
+#define UINT8_MAX (255)
+#endif
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+
+struct ghobject_t {
+ hobject_t hobj;
+ gen_t generation;
+ shard_t shard_id;
+
+public:
+ static const shard_t NO_SHARD = UINT8_MAX;
+ static const gen_t NO_GEN = UINT64_MAX;
+
+ ghobject_t() : generation(NO_GEN), shard_id(NO_SHARD) {}
+
+ ghobject_t(const hobject_t &obj) : hobj(obj), generation(NO_GEN), shard_id(NO_SHARD) {}
+
+ ghobject_t(const hobject_t &obj, gen_t gen, shard_t shard) : hobj(obj), generation(gen), shard_id(shard) {}
+
+ bool match(uint32_t bits, uint32_t match) const {
+ return hobj.match_hash(hobj.hash, bits, match);
+ }
+ /// @return min ghobject_t ret s.t. ret.hash == this->hash
+ ghobject_t get_boundary() const {
+ if (hobj.is_max())
+ return *this;
+ ghobject_t ret;
+ ret.hobj.hash = hobj.hash;
+ return ret;
+ }
+ filestore_hobject_key_t get_filestore_key_u32() const {
+ return hobj.get_filestore_key_u32();
+ }
+ filestore_hobject_key_t get_filestore_key() const {
+ return hobj.get_filestore_key();
+ }
+
+ // maximum sorted value.
+ static ghobject_t get_max() {
+ ghobject_t h(hobject_t::get_max());
+ return h;
+ }
+ bool is_max() const {
+ return hobj.is_max();
+ }
+
+ void swap(ghobject_t &o) {
+ ghobject_t temp(o);
+ o = (*this);
+ (*this) = temp;
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void decode(json_spirit::Value& v);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<ghobject_t*>& o);
+ friend bool operator<(const ghobject_t&, const ghobject_t&);
+ friend bool operator>(const ghobject_t&, const ghobject_t&);
+ friend bool operator<=(const ghobject_t&, const ghobject_t&);
+ friend bool operator>=(const ghobject_t&, const ghobject_t&);
+ friend bool operator==(const ghobject_t&, const ghobject_t&);
+ friend bool operator!=(const ghobject_t&, const ghobject_t&);
+};
+WRITE_CLASS_ENCODER(ghobject_t)
+
+namespace __gnu_cxx {
+ template<> struct hash<ghobject_t> {
+ size_t operator()(const ghobject_t &r) const {
+ static hash<object_t> H;
+ static rjhash<uint64_t> I;
+ return H(r.hobj.oid) ^ I(r.hobj.snap);
+ }
+ };
+}
+
+ostream& operator<<(ostream& out, const ghobject_t& o);
+
+WRITE_EQ_OPERATORS_3(ghobject_t, hobj, shard_id, generation)
+// sort ghobject_t's by <hobj, shard_id, generation>
+//
+// Two objects which differ by generation are more related than
+// two objects of the same generation which differ by shard.
+//
+WRITE_CMP_OPERATORS_3(ghobject_t,
+ hobj,
+ shard_id,
+ generation)
#endif
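
The comparator order <hobj, shard_id, generation> is deliberate: per the comment above, two generations of one shard are more closely related than two shards of one generation, so generation varies fastest and generations of the same object sort adjacently. WRITE_CMP_OPERATORS_3 expands to lexicographic comparisons equivalent to this sketch (GObj is a hypothetical stand-in):

    #include <cassert>
    #include <cstdint>
    #include <tuple>

    struct GObj {                  // hypothetical stand-in for ghobject_t
      uint32_t hobj_hash;          // stands in for the full hobj compare
      uint8_t  shard_id;
      uint64_t generation;
    };

    bool operator<(const GObj& a, const GObj& b) {
      // generation compares last, so it varies fastest in sorted order
      return std::tie(a.hobj_hash, a.shard_id, a.generation)
           < std::tie(b.hobj_hash, b.shard_id, b.generation);
    }

    int main() {
      GObj g1 = {1, 0, 3}, g2 = {1, 0, 7}, g3 = {1, 1, 0};
      assert(g1 < g2 && g2 < g3);  // same shard's generations stay together
      return 0;
    }
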
diff --git a/src/common/lru_map.h b/src/common/lru_map.h
index fb637478884..6e7f7b3786f 100644
--- a/src/common/lru_map.h
+++ b/src/common/lru_map.h
@@ -13,8 +13,8 @@ class lru_map {
typename std::list<K>::iterator lru_iter;
};
- std::map<K, entry> tokens;
- std::list<K> tokens_lru;
+ std::map<K, entry> entries;
+ std::list<K> entries_lru;
Mutex lock;
@@ -33,19 +33,19 @@ template <class K, class V>
bool lru_map<K, V>::find(const K& key, V& value)
{
lock.Lock();
- typename std::map<K, entry>::iterator iter = tokens.find(key);
- if (iter == tokens.end()) {
+ typename std::map<K, entry>::iterator iter = entries.find(key);
+ if (iter == entries.end()) {
lock.Unlock();
return false;
}
entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
+ entries_lru.erase(e.lru_iter);
value = e.value;
- tokens_lru.push_front(key);
- e.lru_iter = tokens_lru.begin();
+ entries_lru.push_front(key);
+ e.lru_iter = entries_lru.begin();
lock.Unlock();
@@ -56,23 +56,23 @@ template <class K, class V>
void lru_map<K, V>::add(const K& key, V& value)
{
lock.Lock();
- typename std::map<K, entry>::iterator iter = tokens.find(key);
- if (iter != tokens.end()) {
+ typename std::map<K, entry>::iterator iter = entries.find(key);
+ if (iter != entries.end()) {
entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
+ entries_lru.erase(e.lru_iter);
}
- tokens_lru.push_front(key);
- entry& e = tokens[key];
+ entries_lru.push_front(key);
+ entry& e = entries[key];
e.value = value;
- e.lru_iter = tokens_lru.begin();
-
- while (tokens_lru.size() > max) {
- typename std::list<K>::reverse_iterator riter = tokens_lru.rbegin();
- iter = tokens.find(*riter);
- // assert(iter != tokens.end());
- tokens.erase(iter);
- tokens_lru.pop_back();
+ e.lru_iter = entries_lru.begin();
+
+ while (entries.size() > max) {
+ typename std::list<K>::reverse_iterator riter = entries_lru.rbegin();
+ iter = entries.find(*riter);
+ // assert(iter != entries.end());
+ entries.erase(iter);
+ entries_lru.pop_back();
}
lock.Unlock();
@@ -82,13 +82,13 @@ template <class K, class V>
void lru_map<K, V>::erase(const K& key)
{
Mutex::Locker l(lock);
- typename std::map<K, entry>::iterator iter = tokens.find(key);
- if (iter == tokens.end())
+ typename std::map<K, entry>::iterator iter = entries.find(key);
+ if (iter == entries.end())
return;
entry& e = iter->second;
- tokens_lru.erase(e.lru_iter);
- tokens.erase(iter);
+ entries_lru.erase(e.lru_iter);
+ entries.erase(iter);
}
#endif
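
Beyond the tokens-to-entries rename, note that the eviction loop now tests entries.size() instead of the list's size; on the libstdc++ of this era std::list::size() is O(n) while the map's size is O(1), which is a plausible motivation. The structure itself is the classic map-plus-list LRU; a minimal single-threaded sketch (locking omitted, names illustrative):

    #include <cassert>
    #include <list>
    #include <map>
    #include <string>

    template <class K, class V>
    class lru_cache {
      struct entry { V value; typename std::list<K>::iterator lru_iter; };
      std::map<K, entry> entries;
      std::list<K> entries_lru;        // front = most recently used
      size_t max;
    public:
      explicit lru_cache(size_t m) : max(m) {}
      bool find(const K& k, V& out) {
        typename std::map<K, entry>::iterator it = entries.find(k);
        if (it == entries.end())
          return false;
        entries_lru.erase(it->second.lru_iter);   // move key to the front
        entries_lru.push_front(k);
        it->second.lru_iter = entries_lru.begin();
        out = it->second.value;
        return true;
      }
      void add(const K& k, const V& v) {
        typename std::map<K, entry>::iterator it = entries.find(k);
        if (it != entries.end())
          entries_lru.erase(it->second.lru_iter);
        entries_lru.push_front(k);
        entry& e = entries[k];
        e.value = v;
        e.lru_iter = entries_lru.begin();
        while (entries.size() > max) {            // evict from the tail
          entries.erase(entries_lru.back());
          entries_lru.pop_back();
        }
      }
    };

    int main() {
      lru_cache<std::string, int> c(2);
      c.add("a", 1); c.add("b", 2); c.add("c", 3);  // evicts "a"
      int v;
      assert(!c.find("a", v) && c.find("b", v) && v == 2);
      return 0;
    }
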
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
index ac99db04ad3..afee82edf07 100644
--- a/src/common/safe_io.c
+++ b/src/common/safe_io.c
@@ -14,8 +14,12 @@
#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
#include "common/safe_io.h"
@@ -112,3 +116,79 @@ ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
}
return 0;
}
+
+int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen)
+{
+ int ret;
+ char fn[PATH_MAX];
+ char tmp[PATH_MAX];
+ int fd;
+
+ // does the file already have correct content?
+ char oldval[80];
+ ret = safe_read_file(base, file, oldval, sizeof(oldval));
+ if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
+ return 0; // yes.
+
+ snprintf(fn, sizeof(fn), "%s/%s", base, file);
+ snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);
+ fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (fd < 0) {
+ ret = errno;
+ return -ret;
+ }
+ ret = safe_write(fd, val, vallen);
+ if (ret) {
+ TEMP_FAILURE_RETRY(close(fd));
+ return ret;
+ }
+
+ ret = fsync(fd);
+ if (ret < 0) ret = -errno;
+ TEMP_FAILURE_RETRY(close(fd));
+ if (ret < 0) {
+ unlink(tmp);
+ return ret;
+ }
+ ret = rename(tmp, fn);
+ if (ret < 0) {
+ ret = -errno;
+ unlink(tmp);
+ return ret;
+ }
+
+ fd = open(base, O_RDONLY);
+ if (fd < 0) {
+ ret = -errno;
+ return ret;
+ }
+ ret = fsync(fd);
+ if (ret < 0) ret = -errno;
+ TEMP_FAILURE_RETRY(close(fd));
+
+ return ret;
+}
+
+int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen)
+{
+ char fn[PATH_MAX];
+ int fd, len;
+
+ snprintf(fn, sizeof(fn), "%s/%s", base, file);
+ fd = open(fn, O_RDONLY);
+ if (fd < 0) {
+ return -errno;
+ }
+ len = safe_read(fd, val, vallen - 1);
+ if (len < 0) {
+ TEMP_FAILURE_RETRY(close(fd));
+ return len;
+ }
+ // close() can fail, but only after a write(); this fd was read-only
+ TEMP_FAILURE_RETRY(close(fd));
+
+ val[len] = 0;
+ return len;
+}
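
safe_write_file() follows the standard crash-safe update recipe: write a temp file, fsync it, rename over the target, then fsync the containing directory so the rename itself is durable. A condensed sketch of the same sequence (abbreviated error handling; a short write is treated as failure):

    #include <cerrno>
    #include <climits>
    #include <cstdio>
    #include <fcntl.h>
    #include <unistd.h>

    static int write_file_atomic(const char* dir, const char* name,
                                 const void* val, size_t len) {
      char fn[PATH_MAX], tmp[PATH_MAX];
      std::snprintf(fn, sizeof(fn), "%s/%s", dir, name);
      std::snprintf(tmp, sizeof(tmp), "%s/%s.tmp", dir, name);

      int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
      if (fd < 0)
        return -errno;
      if (write(fd, val, len) != (ssize_t)len || fsync(fd) < 0) {
        int err = errno;               // data must be durable before rename
        close(fd);
        unlink(tmp);
        return -err;
      }
      close(fd);

      if (rename(tmp, fn) < 0) {       // atomic replacement of fn
        int err = errno;
        unlink(tmp);
        return -err;
      }

      int dfd = open(dir, O_RDONLY);   // sync the directory entry too
      if (dfd < 0)
        return -errno;
      int r = (fsync(dfd) < 0) ? -errno : 0;
      close(dfd);
      return r;
    }

    int main() {
      return write_file_atomic(".", "whoami", "osd.0\n", 6);
    }
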
diff --git a/src/common/safe_io.h b/src/common/safe_io.h
index 4c2991fe6e8..a4c9bc7a72f 100644
--- a/src/common/safe_io.h
+++ b/src/common/safe_io.h
@@ -45,6 +45,15 @@ extern "C" {
ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
WARN_UNUSED_RESULT;
+
+ /*
+ * Safe functions to read and write an entire file.
+ */
+ int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen);
+ int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/common/util.cc b/src/common/util.cc
index 6da37e88833..ab417befef6 100644
--- a/src/common/util.cc
+++ b/src/common/util.cc
@@ -58,6 +58,7 @@ int64_t unit_to_bytesize(string val, ostream *pss)
switch (c) {
case 'B':
break;
+ case 'k':
case 'K':
modifier = 10;
break;
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index bab9f9a817e..d17166bc4a9 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -861,7 +861,6 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
decode_32_or_64_string_map(type_map, blp);
decode_32_or_64_string_map(name_map, blp);
decode_32_or_64_string_map(rule_name_map, blp);
- build_rmaps();
// tunables
if (!blp.end()) {
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 80906e4fe18..b4bb67bb742 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -94,6 +94,7 @@ public:
crush_destroy(crush);
crush = crush_create();
assert(crush);
+ have_rmaps = false;
}
// tunables
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
index 26c438c05f2..b23883093ac 100644
--- a/src/include/CompatSet.h
+++ b/src/include/CompatSet.h
@@ -36,8 +36,8 @@ struct CompatSet {
FeatureSet() : mask(1), names() {}
void insert(Feature f) {
assert(f.id > 0);
- assert(f.id < 63);
- mask |= (1<<f.id);
+ assert(f.id < 64);
+ mask |= ((uint64_t)1<<f.id);
names[f.id] = f.name;
}
@@ -50,7 +50,7 @@ struct CompatSet {
void remove(uint64_t f) {
if (names.count(f)) {
names.erase(f);
- mask &= ~(1<<f);
+ mask &= ~((uint64_t)1<<f);
}
}
void remove(Feature f) {
@@ -156,24 +156,48 @@ struct CompatSet {
((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
uint64_t other_incompat =
((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
- for (int i = 0; i < 64; ++i) {
- int mask = 1 << i;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
if (mask & other_compat) {
- diff.compat.insert( Feature(mask & other_compat,
- other.compat.names[mask&other_compat]));
+ diff.compat.insert( Feature(id, other.compat.names[id]));
}
if (mask & other_ro_compat) {
- diff.ro_compat.insert(Feature(mask & other_ro_compat,
- other.compat.names[mask&other_ro_compat]));
+ diff.ro_compat.insert(Feature(id, other.ro_compat.names[id]));
}
if (mask & other_incompat) {
- diff.incompat.insert( Feature(mask & other_incompat,
- other.incompat.names[mask&other_incompat]));
+ diff.incompat.insert( Feature(id, other.incompat.names[id]));
}
}
return diff;
}
+ /* Merge features supported by other CompatSet into this one.
+ * Return: true if some features were merged
+ */
+ bool merge(CompatSet& other) {
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ if (!other_compat && !other_ro_compat && !other_incompat)
+ return false;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ compat.insert( Feature(id, other.compat.names[id]));
+ }
+ if (mask & other_ro_compat) {
+ ro_compat.insert(Feature(id, other.ro_compat.names[id]));
+ }
+ if (mask & other_incompat) {
+ incompat.insert( Feature(id, other.incompat.names[id]));
+ }
+ }
+ return true;
+ }
+
void encode(bufferlist& bl) const {
compat.encode(bl);
ro_compat.encode(bl);
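
The CompatSet change fixes the classic 32-bit shift trap: in 1<<f.id the literal 1 is an int, so the shift happens in 32 bits and feature ids of 31 and above overflow (or are outright undefined behavior) even though mask is 64-bit. Widening the left operand first, as the patch does, is the fix:

    #include <cassert>
    #include <cstdint>

    int main() {
      int id = 40;
      // uint64_t bad = 1 << id;          // UB: shifts a 32-bit int by 40
      uint64_t mask = (uint64_t)1 << id;  // widen before shifting
      assert(mask == 0x10000000000ULL);   // == 1ULL << 40
      return 0;
    }
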
diff --git a/src/include/Context.h b/src/include/Context.h
index 9ec4414a047..663313ceec1 100644
--- a/src/include/Context.h
+++ b/src/include/Context.h
@@ -28,6 +28,26 @@
#define mydout(cct, v) lgeneric_subdout(cct, context, v)
/*
+ * GenContext - abstract callback class
+ */
+template <typename T>
+class GenContext {
+ GenContext(const GenContext& other);
+ const GenContext& operator=(const GenContext& other);
+
+ protected:
+ virtual void finish(T t) = 0;
+
+ public:
+ GenContext() {}
+ virtual ~GenContext() {} // we want a virtual destructor!!!
+ virtual void complete(T t) {
+ finish(t);
+ delete this;
+ }
+};
+
+/*
* Context - abstract callback class
*/
class Context {
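
GenContext generalizes the existing Context pattern to an arbitrary payload type: complete() runs the subclass's finish() and then deletes the object, so instances are one-shot and must be heap-allocated. The pattern with a toy subclass:

    #include <iostream>

    template <typename T>
    class GenContext {
      GenContext(const GenContext&);              // non-copyable, as above
      const GenContext& operator=(const GenContext&);
     protected:
      virtual void finish(T t) = 0;
     public:
      GenContext() {}
      virtual ~GenContext() {}
      virtual void complete(T t) {
        finish(t);
        delete this;                              // one-shot: self-deleting
      }
    };

    struct PrintResult : public GenContext<int> {
      void finish(int r) { std::cout << "op finished: r=" << r << "\n"; }
    };

    int main() {
      GenContext<int>* c = new PrintResult;       // must be heap-allocated
      c->complete(0);                             // runs finish(), frees c
      return 0;
    }
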
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index d702ebd2795..2d98e777f00 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -18,7 +18,6 @@ rados_include_DATA = \
$(srcdir)/include/crc32c.h
noinst_HEADERS += \
- include/bloom_filter.hpp \
include/Context.h \
include/CompatSet.h \
include/Distribution.h \
diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp
deleted file mode 100644
index 41aba4bad47..00000000000
--- a/src/include/bloom_filter.hpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- *******************************************************************
- * *
- * Open Bloom Filter *
- * *
- * Author: Arash Partow - 2000 *
- * URL: http://www.partow.net/programming/hashfunctions/index.html *
- * *
- * Copyright notice: *
- * Free use of the Open Bloom Filter Library is permitted under *
- * the guidelines and in accordance with the most current version *
- * of the Boost Software License, Version 1.0 *
- * http://www.opensource.org/licenses/bsl1.0.html *
- * *
- *******************************************************************
-*/
-
-
-#ifndef INCLUDE_BLOOM_FILTER_HPP
-#define INCLUDE_BLOOM_FILTER_HPP
-
-#include <cstddef>
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-#include <vector>
-
-
-static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
-static const unsigned char bit_mask[bits_per_char] = {
- 0x01, //00000001
- 0x02, //00000010
- 0x04, //00000100
- 0x08, //00001000
- 0x10, //00010000
- 0x20, //00100000
- 0x40, //01000000
- 0x80 //10000000
- };
-
-
-class bloom_filter
-{
-protected:
-
- typedef unsigned int bloom_type;
- typedef unsigned char cell_type;
-
-public:
-
- bloom_filter(const std::size_t& predicted_inserted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bit_table_(0),
- predicted_inserted_element_count_(predicted_inserted_element_count),
- inserted_element_count_(0),
- random_seed_((random_seed) ? random_seed : 0xA5A5A5A5),
- desired_false_positive_probability_(false_positive_probability)
- {
- find_optimal_parameters();
- generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- bit_table_ = new cell_type[raw_table_size_];
- std::fill_n(bit_table_,raw_table_size_,0x00);
- }
-
- bloom_filter(const bloom_filter& filter)
- {
- this->operator=(filter);
- }
-
- bloom_filter& operator = (const bloom_filter& filter)
- {
- if (this != &filter) {
- salt_count_ = filter.salt_count_;
- table_size_ = filter.table_size_;
- raw_table_size_ = filter.raw_table_size_;
- predicted_inserted_element_count_ = filter.predicted_inserted_element_count_;
- inserted_element_count_ = filter.inserted_element_count_;
- random_seed_ = filter.random_seed_;
- desired_false_positive_probability_ = filter.desired_false_positive_probability_;
- delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
- salt_ = filter.salt_;
- }
- return *this;
- }
-
- virtual ~bloom_filter()
- {
- delete[] bit_table_;
- }
-
- inline bool operator!() const
- {
- return (0 == table_size_);
- }
-
- inline void clear()
- {
- std::fill_n(bit_table_,raw_table_size_,0x00);
- inserted_element_count_ = 0;
- }
-
- inline void insert(const unsigned char* key_begin, const std::size_t& length)
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
- }
- ++inserted_element_count_;
- }
-
- template<typename T>
- inline void insert(const T& t)
- {
- // Note: T must be a C++ POD type.
- insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
- }
-
- inline void insert(const std::string& key)
- {
- insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline void insert(const char* data, const std::size_t& length)
- {
- insert(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline void insert(const InputIterator begin, const InputIterator end)
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- insert(*(itr++));
- }
- }
-
- inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
- {
- return false;
- }
- }
- return true;
- }
-
- template<typename T>
- inline bool contains(const T& t) const
- {
- return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
- }
-
- inline bool contains(const std::string& key) const
- {
- return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline bool contains(const char* data, const std::size_t& length) const
- {
- return contains(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (!contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- template<typename InputIterator>
- inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- inline virtual std::size_t size() const
- {
- return table_size_;
- }
-
- inline std::size_t element_count() const
- {
- return inserted_element_count_;
- }
-
- inline double effective_fpp() const
- {
- /*
- Note:
- The effective false positive probability is calculated using the
- designated table size and hash function count in conjunction with
- the current number of inserted elements - not the user defined
- predicated/expected number of inserted elements.
- */
- return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
- }
-
- inline bloom_filter& operator &= (const bloom_filter& filter)
- {
- /* intersection */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] &= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator |= (const bloom_filter& filter)
- {
- /* union */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] |= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator ^= (const bloom_filter& filter)
- {
- /* difference */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] ^= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline const cell_type* table() const
- {
- return bit_table_;
- }
-
-protected:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
- }
-
- void generate_unique_salt()
- {
- /*
- Note:
- A distinct hash function need not be implementation-wise
- distinct. In the current implementation "seeding" a common
- hash function with different values seems to be adequate.
- */
- const unsigned int predef_salt_count = 128;
- static const bloom_type predef_salt[predef_salt_count] =
- {
- 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
- 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
- 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
- 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
- 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
- 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
- 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
- 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
- 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
- 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
- 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
- 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
- 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
- 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
- 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
- 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
- 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
- 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
- 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
- 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
- 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
- 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
- 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
- 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
- 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
- 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
- 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
- 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
- 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
- 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
- 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
- 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
- };
-
- if (salt_count_ <= predef_salt_count)
- {
- std::copy(predef_salt,
- predef_salt + salt_count_,
- std::back_inserter(salt_));
- for (unsigned int i = 0; i < salt_.size(); ++i)
- {
- /*
- Note:
- This is done to integrate the user defined random seed,
- so as to allow for the generation of unique bloom filter
- instances.
- */
- salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
- }
- }
- else
- {
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
- srand(static_cast<unsigned int>(random_seed_));
- while (salt_.size() < salt_count_)
- {
- bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
- if (0 == current_salt) continue;
- if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
- {
- salt_.push_back(current_salt);
- }
- }
- }
- }
-
- void find_optimal_parameters()
- {
- /*
- Note:
- The following will attempt to find the number of hash functions
- and minimum amount of storage bits required to construct a bloom
- filter consistent with the user defined false positive probability
- and estimated element insertion count.
- */
-
- double min_m = std::numeric_limits<double>::infinity();
- double min_k = 0.0;
- double curr_m = 0.0;
- double k = 1.0;
- while (k < 1000.0)
- {
- double numerator = (- k * predicted_inserted_element_count_);
- double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k));
- curr_m = numerator / denominator;
-
- if (curr_m < min_m)
- {
- min_m = curr_m;
- min_k = k;
- }
- k += 1.0;
- }
-
- salt_count_ = static_cast<std::size_t>(min_k);
- table_size_ = static_cast<std::size_t>(min_m);
- table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0);
- }
-
- inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
- {
- const unsigned char* itr = begin;
-
- while (remaining_length >= 4)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 4;
- }
-
- while (remaining_length >= 2)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 2;
- }
-
- if (remaining_length)
- {
- hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
- }
-
- return hash;
- }
-
- std::vector<bloom_type> salt_;
- unsigned char* bit_table_;
- std::size_t salt_count_;
- std::size_t table_size_;
- std::size_t raw_table_size_;
- std::size_t predicted_inserted_element_count_;
- std::size_t inserted_element_count_;
- std::size_t random_seed_;
- double desired_false_positive_probability_;
-};
-
-inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result &= b;
- return result;
-}
-
-inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result |= b;
- return result;
-}
-
-inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result ^= b;
- return result;
-}
-
-
-class compressible_bloom_filter : public bloom_filter
-{
-public:
-
- compressible_bloom_filter(const std::size_t& predicted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
- {
- size_list.push_back(table_size_);
- }
-
- inline virtual std::size_t size() const
- {
- return size_list.back();
- }
-
- inline bool compress(const double& percentage)
- {
- if ((0.0 >= percentage) || (percentage >= 100.0))
- {
- return false;
- }
-
- std::size_t original_table_size = size_list.back();
- std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
- new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
-
- if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
- {
- return false;
- }
-
- desired_false_positive_probability_ = effective_fpp();
- cell_type* tmp = new cell_type[new_table_size / bits_per_char];
- std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
- cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
- cell_type* end = bit_table_ + (original_table_size / bits_per_char);
- cell_type* itr_tmp = tmp;
-
- while (end != itr)
- {
- *(itr_tmp++) |= (*itr++);
- }
-
- delete[] bit_table_;
- bit_table_ = tmp;
- size_list.push_back(new_table_size);
-
- return true;
- }
-
-private:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash;
- for (std::size_t i = 0; i < size_list.size(); ++i)
- {
- bit_index %= size_list[i];
- }
- bit = bit_index % bits_per_char;
- }
-
- std::vector<std::size_t> size_list;
-};
-
-#endif
-
-
-/*
- Note 1:
- If it can be guaranteed that bits_per_char will be of the form 2^n then
- the following optimization can be used:
-
- hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
-
- Note 2:
- For performance reasons where possible when allocating memory it should
- be aligned (aligned_alloc) according to the architecture being used.
-*/
diff --git a/src/include/buffer.h b/src/include/buffer.h
index 077cf0d9b0b..ffa3d6e1b97 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -14,8 +14,6 @@
#ifndef CEPH_BUFFER_H
#define CEPH_BUFFER_H
-#include "include/int_types.h"
-
#if defined(__linux__)
#include <stdlib.h>
#include <linux/types.h>
@@ -46,6 +44,7 @@ void *valloc(size_t);
#include <malloc.h>
#endif
+#include <inttypes.h>
#include <stdint.h>
#include <string.h>
@@ -420,15 +419,7 @@ public:
ssize_t read_fd(int fd, size_t len);
int write_file(const char *fn, int mode=0644);
int write_fd(int fd) const;
- __u32 crc32c(__u32 crc) {
- for (std::list<ptr>::const_iterator it = _buffers.begin();
- it != _buffers.end();
- ++it)
- if (it->length())
- crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
- return crc;
- }
-
+ uint32_t crc32c(uint32_t crc) const;
};
/*
@@ -436,7 +427,7 @@ public:
*/
class hash {
- __u32 crc;
+ uint32_t crc;
public:
hash() : crc(0) { }
@@ -445,7 +436,7 @@ public:
crc = bl.crc32c(crc);
}
- __u32 digest() {
+ uint32_t digest() {
return crc;
}
};
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 6c41d14f5da..ba0b5eb0f19 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -224,6 +224,7 @@ struct ceph_mon_subscribe_ack {
* mdsmap flags
*/
#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
+#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */
/*
* mds states
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index 8e22c624636..49d68474d68 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -1,8 +1,7 @@
#ifndef CEPH_CRC32C_H
#define CEPH_CRC32C_H
-#include "include/int_types.h"
-
+#include <inttypes.h>
#include <string.h>
typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
diff --git a/src/include/rados.h b/src/include/rados.h
index 178c171c445..e7a32b5afef 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -219,6 +219,8 @@ enum {
CEPH_OSD_OP_COPY_FROM = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 26,
CEPH_OSD_OP_COPY_GET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 27,
+ CEPH_OSD_OP_UNDIRTY = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 28,
+ CEPH_OSD_OP_ISDIRTY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 29,
/** multi **/
CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index a85ef3057bc..515663c2335 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -1,8 +1,6 @@
#ifndef CEPH_LIBRADOS_H
#define CEPH_LIBRADOS_H
-#include "include/int_types.h"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -566,7 +564,7 @@ int rados_pool_create_with_auid(rados_t cluster, const char *pool_name, uint64_t
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Create a pool with a specific CRUSH rule and auid
@@ -581,7 +579,7 @@ int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_all(rados_t cluster, const char *pool_name, uint64_t auid,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Delete a pool and all data inside it
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index 94d3d23a824..3f6d025ff41 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -133,11 +133,16 @@ namespace librados
* BALANCE_READS and LOCALIZE_READS should only be used
* when reading from data you're certain won't change,
* like a snapshot, or where eventual consistency is ok.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * will preserve the ordering of a write followed by a read.
*/
enum ObjectOperationGlobalFlags {
OPERATION_NOFLAG = 0,
OPERATION_BALANCE_READS = 1,
OPERATION_LOCALIZE_READS = 2,
+ OPERATION_ORDER_READS_WRITES = 4,
};
/*
@@ -278,6 +283,13 @@ namespace librados
*/
void copy_from(const std::string& src, const IoCtx& src_ioctx, uint64_t src_version);
+ /**
+ * undirty an object
+ *
+ * Clear an object's dirty flag
+ */
+ void undirty();
+
friend class IoCtx;
};
@@ -396,6 +408,14 @@ namespace librados
*/
void list_snaps(snap_set_t *out_snaps, int *prval);
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
};
/* IoCtx : This is a context in which we can perform I/O.
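
A hedged sketch of how the two librados additions might be used together: an ObjectReadOperation carrying the new is_dirty() call, submitted with OPERATION_ORDER_READS_WRITES so the read queues behind in-flight writes. Not compiled against this tree; the AioCompletion calls are assumed from the public API of this era.

    #include <rados/librados.hpp>
    #include <string>

    // Assumes an open, connected IoCtx.
    int read_dirty_flag(librados::IoCtx& ioctx, const std::string& oid,
                        bool* dirty) {
      librados::ObjectReadOperation op;
      int rval = 0;
      op.is_dirty(dirty, &rval);                 // new in this commit
      librados::AioCompletion* c =
        librados::Rados::aio_create_completion();
      librados::bufferlist bl;                   // unused payload buffer
      int r = ioctx.aio_operate(oid, c, &op,
                                librados::OPERATION_ORDER_READS_WRITES, &bl);
      if (r == 0) {
        c->wait_for_complete();
        r = c->get_return_value();
      }
      c->release();
      return r ? r : rval;
    }
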
diff --git a/src/include/types.h b/src/include/types.h
index 1f9756b22c7..5a9e6f6d4c9 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -379,7 +379,7 @@ inline ostream& operator<<(ostream& out, const prettybyte_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << " MB";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " KB";
+ return out << (b.v >> 10) << " kB";
return out << b.v << " bytes";
}
@@ -402,7 +402,7 @@ inline ostream& operator<<(ostream& out, const si_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << "M";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << "K";
+ return out << (b.v >> 10) << "k";
return out << b.v;
}
@@ -425,7 +425,7 @@ inline ostream& operator<<(ostream& out, const pretty_si_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << " M";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " K";
+ return out << (b.v >> 10) << " k";
return out << b.v << " ";
}
@@ -445,7 +445,7 @@ inline ostream& operator<<(ostream& out, const kb_t& kb)
return out << (kb.v >> 20) << " GB";
if (kb.v > bump_after << 10)
return out << (kb.v >> 10) << " MB";
- return out << kb.v << " KB";
+ return out << kb.v << " kB";
}
inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i)
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 3a404a46c6f..46877d75558 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -80,7 +80,7 @@ stop_daemon() {
action=$5
[ -z "$action" ] && action="Stopping"
echo -n "$action Ceph $name on $host..."
- do_cmd "while [ 1 ]; do
+ do_cmd "while [ 1 ]; do
[ -e $pidfile ] || break
pid=\`cat $pidfile\`
while [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; do
@@ -172,6 +172,14 @@ command=$1
get_local_name_list
get_name_list "$@"
+# Reverse the order if we are stopping
+if [ "$command" = "stop" ]; then
+ for f in $what; do
+ new_order="$f $new_order"
+ done
+ what="$new_order"
+fi
+
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
id=`echo $name | cut -c 4- | sed 's/^\\.//'`
@@ -251,18 +259,18 @@ for name in $what; do
wrap=""
runmode=""
runarg=""
-
+
[ -z "$docrun" ] && get_conf_bool docrun "0" "restart on core dump"
[ "$docrun" -eq 1 ] && wrap="$BINDIR/ceph-run"
-
+
[ -z "$dovalgrind" ] && get_conf_bool valgrind "" "valgrind"
[ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind"
-
+
[ -n "$wrap" ] && runmode="-f &" && runarg="-f"
[ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"
cmd="$files $wrap $cmd $runmode"
-
+
if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
get_conf pre_mount "true" "pre mount command"
get_conf fs_type "" "osd mkfs type"
@@ -361,7 +369,7 @@ for name in $what; do
[ -n "$post_start" ] && do_cmd "$post_start"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
;;
-
+
stop)
get_conf pre_stop "" "pre stop command"
get_conf post_stop "" "post stop command"
@@ -402,13 +410,13 @@ for name in $what; do
[ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;
-
+
killall)
echo "killall ceph-$type on $host"
do_cmd "pkill ^ceph-$type || true"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;
-
+
force-reload | reload)
signal_daemon $name ceph-$type $pid_file -1 "Reloading"
;;
diff --git a/src/java/Makefile.am b/src/java/Makefile.am
index ac7e86cf9fa..8b28f839e46 100644
--- a/src/java/Makefile.am
+++ b/src/java/Makefile.am
@@ -64,7 +64,6 @@ BUILT_SOURCES = $(JAVA_H)
if HAVE_JUNIT4
JAVA_TEST_CLASSES = $(JAVA_TEST_SRC:test/%.java=%.class)
-ESCAPED_JAVA_TEST_CLASSES = com/ceph/fs/CephAllTests\$$1.class
CEPH_TEST_PROXY=test/com/ceph/fs/CephMountTest.class
@@ -73,7 +72,7 @@ $(CEPH_TEST_PROXY): $(JAVA_TEST_SRC) $(CEPH_PROXY)
$(JAVAC) -source 1.5 -target 1.5 -Xlint:-options test/com/ceph/fs/*.java
libcephfs-test.jar: $(CEPH_TEST_PROXY)
- $(JAR) cf $@ $(JAVA_TEST_CLASSES:%=-C test %) $(ESCAPED_JAVA_TEST_CLASSES:%=-C test %)
+ $(JAR) cf $@ $(JAVA_TEST_CLASSES:%=-C test %)
java_DATA += libcephfs-test.jar
diff --git a/src/java/test/com/ceph/fs/CephAllTests.java b/src/java/test/com/ceph/fs/CephAllTests.java
index 71c2ddfee96..039ad6da3b7 100644
--- a/src/java/test/com/ceph/fs/CephAllTests.java
+++ b/src/java/test/com/ceph/fs/CephAllTests.java
@@ -23,7 +23,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.UUID;
import org.junit.*;
-import org.junit.rules.ExternalResource;
import org.junit.runners.Suite;
import org.junit.runner.RunWith;
import static org.junit.Assert.*;
@@ -42,16 +41,4 @@ import static org.junit.Assert.*;
*/
public class CephAllTests{
- @Rule
- public static ExternalResource testRule = new ExternalResource(){
- @Override
- protected void before() throws Throwable{
- // Add debugging messages or setup code here
- };
-
- @Override
- protected void after(){
- // Add debugging messages or cleanup code here
- };
- };
}
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 8a5f499ec15..1be3ebd10f9 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -459,7 +459,6 @@ int librados::RadosClient::pool_create_async(string& name, PoolAsyncCompletionIm
Context *onfinish = new C_PoolAsync_Safe(c);
int r = objecter->create_pool(name, onfinish, auid, crush_rule);
if (r < 0) {
- delete c;
delete onfinish;
}
return r;
@@ -505,7 +504,6 @@ int librados::RadosClient::pool_delete_async(const char *name, PoolAsyncCompleti
Context *onfinish = new C_PoolAsync_Safe(c);
int r = objecter->delete_pool(tmp_pool_id, onfinish);
if (r < 0) {
- delete c;
delete onfinish;
}
return r;
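
The two RadosClient hunks fix a double free: C_PoolAsync_Safe wraps the completion, and, judging by this fix, destroying the wrapper already releases it, so the extra delete c freed the completion twice on the error path. The ownership mistake in miniature (names hypothetical):

    struct Completion {};

    struct SafeWrapper {                 // plays the role of C_PoolAsync_Safe
      Completion* c;
      explicit SafeWrapper(Completion* c_) : c(c_) {}
      ~SafeWrapper() { delete c; }       // the wrapper owns the completion
    };

    int main() {
      Completion* c = new Completion;
      SafeWrapper* onfinish = new SafeWrapper(c);
      bool submit_failed = true;         // pretend create_pool() returned < 0
      if (submit_failed) {
        // delete c;                     // wrong: ~SafeWrapper frees c again
        delete onfinish;                 // right: frees the completion once
      }
      return 0;
    }
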
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 852228ed383..217a0a7bfb2 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -269,6 +269,14 @@ void librados::ObjectReadOperation::list_snaps(
o->list_snaps(out_snaps, prval);
}
+void librados::ObjectReadOperation::is_dirty(bool *is_dirty, int *prval)
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->is_dirty(is_dirty, prval);
+}
+
+
+
int librados::IoCtx::omap_get_vals(const std::string& oid,
const std::string& start_after,
const std::string& filter_prefix,
@@ -390,6 +398,12 @@ void librados::ObjectWriteOperation::copy_from(const std::string& src,
o->copy_from(object_t(src), src_ioctx.io_ctx_impl->snap_seq, src_ioctx.io_ctx_impl->oloc, src_version);
}
+void librados::ObjectWriteOperation::undirty()
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->undirty();
+}
+
void librados::ObjectWriteOperation::tmap_put(const bufferlist &bl)
{
::ObjectOperation *o = (::ObjectOperation *)impl;
@@ -958,6 +972,8 @@ int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
op_flags |= CEPH_OSD_FLAG_BALANCE_READS;
if (flags & OPERATION_LOCALIZE_READS)
op_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+ if (flags & OPERATION_ORDER_READS_WRITES)
+ op_flags |= CEPH_OSD_FLAG_RWORDERED;
return io_ctx_impl->aio_operate_read(obj, (::ObjectOperation*)o->impl, c->pc,
op_flags, pbl);
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index 5ff6e61fbe0..05766587930 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -567,4 +567,14 @@ void CDentry::remove_client_lease(ClientLease *l, Locker *locker)
locker->eval_gather(&lock);
}
-
+void CDentry::_put()
+{
+ if (get_num_ref() <= (int)is_dirty() + 1) {
+ CDentry::linkage_t *dnl = get_projected_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (get_num_ref() == (int)is_dirty() + !!in->get_num_ref())
+ in->mdcache->maybe_eval_stray(in, true);
+ }
+ }
+}
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 0d2445a525f..e40854adfaa 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -76,6 +76,8 @@ public:
static const int STATE_FRAGMENTING = (1<<1);
static const int STATE_PURGING = (1<<2);
static const int STATE_BADREMOTEINO = (1<<3);
+ // stray dentry needs notification of releasing reference
+ static const int STATE_STRAY = STATE_NOTIFYREF;
// -- pins --
static const int PIN_INODEPIN = 1; // linked inode is pinned
@@ -146,6 +148,7 @@ protected:
public:
elist<CDentry*>::item item_dirty;
+ elist<CDentry*>::item item_stray;
protected:
int auth_pins, nested_auth_pins;
@@ -254,6 +257,7 @@ public:
void last_put() {
lru_unpin();
}
+ void _put();
// auth pins
bool can_auth_pin();
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 2b991d78fde..4a5e636d9a6 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -27,7 +27,7 @@
#include "MDLog.h"
#include "LogSegment.h"
-#include "include/bloom_filter.hpp"
+#include "common/bloom_filter.hpp"
#include "include/Context.h"
#include "common/Clock.h"
@@ -655,6 +655,14 @@ void CDir::remove_null_dentries() {
assert(get_num_any() == items.size());
}
+void CDir::touch_dentries_bottom() {
+ dout(12) << "touch_dentries_bottom " << *this << dendl;
+
+ for (CDir::map_t::iterator p = items.begin();
+ p != items.end();
+ ++p)
+ inode->mdcache->touch_dentry_bottom(p->second);
+}
bool CDir::try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps)
{
@@ -1461,6 +1469,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
}
bool purged_any = false;
+ bool stray = inode->is_stray();
//int num_new_inodes_loaded = 0;
loff_t baseoff = p.get_off();
@@ -1605,6 +1614,12 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
if (in->inode.is_dirty_rstat())
in->mark_dirty_rstat();
+ if (stray) {
+ dn->state_set(CDentry::STATE_STRAY);
+ if (in->inode.nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
+ }
+
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now(g_ceph_context);
//num_new_inodes_loaded++;
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 7cf2b6a43d7..86da4e5dfd3 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -357,6 +357,7 @@ private:
void remove_null_dentries();
void purge_stale_snap_data(const set<snapid_t>& snaps);
public:
+ void touch_dentries_bottom();
bool try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 46f8d33cfd8..7accc5a4dba 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -682,6 +682,12 @@ void CInode::last_put()
parent->put(CDentry::PIN_INODEPIN);
}
+void CInode::_put()
+{
+ if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
+ mdcache->maybe_eval_stray(this, true);
+}
+
void CInode::add_remote_parent(CDentry *p)
{
if (remote_parents.empty())
@@ -1073,7 +1079,6 @@ void CInode::_stored_backtrace(version_t v, Context *fin)
clear_dirty_parent();
if (fin)
fin->complete(0);
- mdcache->maybe_eval_stray(this);
}
void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 8e760220c14..1c2a9339c1c 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -156,6 +156,8 @@ public:
static const int STATE_STRAYPINNED = (1<<16);
static const int STATE_FROZENAUTHPIN = (1<<17);
static const int STATE_DIRTYPOOL = (1<<18);
+  // orphan inode needs to be notified when a reference is released
+ static const int STATE_ORPHAN = STATE_NOTIFYREF;
static const int MASK_STATE_EXPORTED =
(STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
@@ -812,6 +814,7 @@ public:
}
void first_get();
void last_put();
+ void _put();
// -- hierarchy stuff --
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 99bd761e0f7..19c9176f414 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1640,9 +1640,6 @@ void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t
share_inode_max_size(in);
}
issue_caps_set(need_issue);
-
- // unlinked stray? may need to purge (e.g., after all caps are released)
- mdcache->maybe_eval_stray(in);
}
Capability* Locker::issue_new_caps(CInode *in,
@@ -3011,8 +3008,6 @@ void Locker::remove_client_cap(CInode *in, client_t client)
}
try_eval(in, CEPH_CAP_LOCKS);
-
- mds->mdcache->maybe_eval_stray(in);
}
diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc
index b775b6d9501..16e7f803196 100644
--- a/src/mds/LogEvent.cc
+++ b/src/mds/LogEvent.cc
@@ -46,10 +46,16 @@ LogEvent *LogEvent::decode(bufferlist& bl)
::decode(type, p);
if (EVENT_NEW_ENCODING == type) {
- DECODE_START(1, p);
- ::decode(type, p);
- event = decode_event(bl, p, type);
- DECODE_FINISH(p);
+ try {
+ DECODE_START(1, p);
+ ::decode(type, p);
+ event = decode_event(bl, p, type);
+ DECODE_FINISH(p);
+ }
+ catch (const buffer::error &e) {
+ generic_dout(0) << "failed to decode LogEvent (type maybe " << type << ")" << dendl;
+ return NULL;
+ }
} else { // we are using classic encoding
event = decode_event(bl, p, type);
}
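
Wrapping the new-style decode in try/catch means a truncated or future-versioned event now logs and returns NULL (so replay can stop cleanly) instead of letting buffer::error escape and abort the MDS. The shape of the pattern, modeled standalone with a standard exception standing in for buffer::error:

    #include <stdexcept>
    #include <iostream>

    struct decode_error : std::runtime_error {    // stands in for buffer::error
      decode_error() : std::runtime_error("decode") {}
    };

    struct Event { int type; };

    Event *decode_event_or_null(bool corrupt, int type)
    {
      try {
        if (corrupt)
          throw decode_error();   // DECODE_START/::decode throw on bad input
        return new Event{type};
      } catch (const decode_error &) {
        // log and degrade instead of crashing the caller
        std::cerr << "failed to decode LogEvent (type may be " << type << ")\n";
        return nullptr;
      }
    }
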
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 86b380f2827..9dc1229fbb9 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -127,7 +127,8 @@ long g_num_caps = 0;
set<int> SimpleLock::empty_gather_set;
-MDCache::MDCache(MDS *m)
+MDCache::MDCache(MDS *m) :
+ delayed_eval_stray(member_offset(CDentry, item_stray))
{
mds = m;
migrator = new Migrator(mds, this);
@@ -676,6 +677,7 @@ CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
} else
assert(straydn->get_projected_linkage()->is_null());
+ straydn->state_set(CDentry::STATE_STRAY);
return straydn;
}
@@ -5934,8 +5936,9 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
mds->mdlog->start_entry(le);
- le->metablob.add_dir_context(in->get_parent_dir());
- le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
+ CDentry *dn = in->get_projected_parent_dn();
+ le->metablob.add_dir_context(dn->get_dir());
+ le->metablob.add_primary_dentry(dn, in, true);
le->metablob.add_truncate_finish(in->ino(), ls->offset);
journal_dirty_inode(mut, &le->metablob, in);
@@ -6017,8 +6020,15 @@ bool MDCache::trim(int max)
}
dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl;
- map<int, MCacheExpire*> expiremap;
+ // process delayed eval_stray()
+ for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
+ CDentry *dn = *p;
+ ++p;
+ dn->item_stray.remove_myself();
+ eval_stray(dn);
+ }
+ map<int, MCacheExpire*> expiremap;
bool is_standby_replay = mds->is_standby_replay();
int unexpirable = 0;
list<CDentry*> unexpirables;
@@ -6026,13 +6036,12 @@ bool MDCache::trim(int max)
while (lru.lru_get_size() + unexpirable > (unsigned)max) {
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) break;
- if (is_standby_replay && dn->get_linkage() &&
- dn->get_linkage()->inode->item_open_file.is_on_list()) {
+ if ((is_standby_replay && dn->get_linkage() &&
+ dn->get_linkage()->inode->item_open_file.is_on_list()) ||
+ trim_dentry(dn, expiremap)) {
unexpirables.push_back(dn);
++unexpirable;
- continue;
}
- trim_dentry(dn, expiremap);
}
for(list<CDentry*>::iterator i = unexpirables.begin();
i != unexpirables.end();
@@ -6087,7 +6096,7 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
}
-void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
+bool MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
{
dout(12) << "trim_dentry " << *dn << dendl;
@@ -6142,6 +6151,9 @@ void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
CInode *in = dnl->get_inode();
assert(in);
trim_inode(dn, in, con, expiremap);
+    // purging stray instead of trimming?
+ if (dn->get_num_ref() > 0)
+ return true;
}
else {
assert(dnl->is_null());
@@ -6160,6 +6172,7 @@ void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
migrator->export_empty_import(dir);
if (mds->logger) mds->logger->inc(l_mds_iex);
+ return false;
}
@@ -6222,7 +6235,14 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<int, MCacheExpi
trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p
// INODE
- if (!in->is_auth()) {
+ if (in->is_auth()) {
+ // eval stray after closing dirfrags
+ if (dn) {
+ maybe_eval_stray(in);
+ if (dn->get_num_ref() > 0)
+ return;
+ }
+ } else {
pair<int,int> auth = in->authority();
dirfrag_t df;
@@ -6305,6 +6325,12 @@ void MDCache::trim_non_auth()
// add back into lru (at the top)
lru.lru_insert_top(dn);
+ if (dn->get_dir()->get_inode()->is_stray()) {
+ dn->state_set(CDentry::STATE_STRAY);
+ if (dnl->is_primary() && dnl->get_inode()->inode.nlink == 0)
+ dnl->get_inode()->state_set(CInode::STATE_ORPHAN);
+ }
+
if (!first_auth) {
first_auth = dn;
} else {
@@ -6725,9 +6751,6 @@ void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gath
if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
-
- // trim?
- maybe_eval_stray(in);
}
void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
@@ -6737,10 +6760,6 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& ga
// fix lock
if (dn->lock.remove_replica(from))
gather_locks.insert(&dn->lock);
-
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- if (dnl->is_primary())
- maybe_eval_stray(dnl->get_inode());
}
void MDCache::trim_client_leases()
@@ -9147,7 +9166,7 @@ struct C_MDC_EvalStray : public Context {
}
};
-void MDCache::eval_stray(CDentry *dn)
+void MDCache::eval_stray(CDentry *dn, bool delay)
{
dout(10) << "eval_stray " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9211,9 +9230,13 @@ void MDCache::eval_stray(CDentry *dn)
dout(20) << " too many dn refs" << dendl;
return;
}
- purge_stray(dn);
+ if (delay) {
+ if (!dn->item_stray.is_on_list())
+ delayed_eval_stray.push_back(&dn->item_stray);
+ } else
+ purge_stray(dn);
}
- else if (in->inode.nlink == 1) {
+ else if (in->inode.nlink >= 1) {
// trivial reintegrate?
if (!in->remote_parents.empty()) {
CDentry *rlink = *in->remote_parents.begin();
@@ -9257,14 +9280,6 @@ void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Conte
mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
}
-void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin)
-{
- SnapContext snapc;
- object_t oid = CInode::get_object_name(ino, frag_t(), "");
- mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc,
- ceph_clock_now(g_ceph_context), 0, NULL, fin);
-}
-
class C_MDC_PurgeStrayPurged : public Context {
MDCache *cache;
CDentry *dn;
@@ -9276,94 +9291,6 @@ public:
}
};
-class C_MDC_PurgeForwardingPointers : public Context {
- MDCache *cache;
- CDentry *dn;
-public:
- bufferlist bl;
- C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) :
- cache(c), dn(d) {}
- void finish(int r) {
- cache->_purge_forwarding_pointers(bl, dn, r);
- }
-};
-
-class C_MDC_PurgeStray : public Context {
- MDCache *cache;
- CDentry *dn;
-public:
- C_MDC_PurgeStray(MDCache *c, CDentry *d) :
- cache(c), dn(d) {}
- void finish(int r) {
- cache->_purge_stray(dn, r);
- }
-};
-
-void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r)
-{
- assert(r == 0 || r == -ENOENT || r == -ENODATA);
- inode_backtrace_t backtrace;
- if (r == 0)
- ::decode(backtrace, bl);
-
- // setup gathering context
- C_GatherBuilder gather_bld(g_ceph_context);
-
- // remove all the objects with forwarding pointer backtraces (aka sentinels)
- for (set<int64_t>::const_iterator i = backtrace.old_pools.begin();
- i != backtrace.old_pools.end();
- ++i) {
- SnapContext snapc;
- object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), "");
- object_locator_t oloc(*i);
-
- mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0,
- NULL, gather_bld.new_sub());
- }
-
- if (gather_bld.has_subs()) {
- gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn));
- gather_bld.activate();
- } else {
- _purge_stray(dn, r);
- }
-}
-
-void MDCache::_purge_stray(CDentry *dn, int r)
-{
- // purge the strays
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- CInode *in = dnl->get_inode();
- dout(10) << "_purge_stray " << *dn << " " << *in << dendl;
-
- SnapRealm *realm = in->find_snaprealm();
- SnapContext nullsnap;
- const SnapContext *snapc;
- if (realm) {
- dout(10) << " realm " << *realm << dendl;
- snapc = &realm->get_snap_context();
- } else {
- dout(10) << " NO realm, using null context" << dendl;
- snapc = &nullsnap;
- assert(in->last == CEPH_NOSNAP);
- }
-
- uint64_t period = (uint64_t)in->inode.layout.fl_object_size * (uint64_t)in->inode.layout.fl_stripe_count;
- uint64_t cur_max_size = in->inode.get_max_size();
- uint64_t to = MAX(in->inode.size, cur_max_size);
- if (to && period) {
- uint64_t num = (to + period - 1) / period;
- dout(10) << "purge_stray 0~" << to << " objects 0~" << num << " snapc " << snapc << " on " << *in << dendl;
- mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
- 0, num, ceph_clock_now(g_ceph_context), 0,
- new C_MDC_PurgeStrayPurged(this, dn));
-
- } else {
- dout(10) << "purge_stray 0 objects snapc " << snapc << " on " << *in << dendl;
- _purge_stray_purged(dn);
- }
-}
-
void MDCache::purge_stray(CDentry *dn)
{
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9381,24 +9308,86 @@ void MDCache::purge_stray(CDentry *dn)
dn->get(CDentry::PIN_PURGING);
in->state_set(CInode::STATE_PURGING);
-
+ if (dn->item_stray.is_on_list())
+ dn->item_stray.remove_myself();
+
+ if (in->is_dirty_parent())
+ in->clear_dirty_parent();
+
// CHEAT. there's no real need to journal our intent to purge, since
// that is implicit in the dentry's presence and non-use in the stray
// dir. on recovery, we'll need to re-eval all strays anyway.
+ SnapContext nullsnapc;
+ C_GatherBuilder gather(g_ceph_context, new C_MDC_PurgeStrayPurged(this, dn));
+
if (in->is_dir()) {
- dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX
- // remove the backtrace
- remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(),
- new C_MDC_PurgeStrayPurged(this, dn));
- } else if (in->is_file()) {
- // get the backtrace before blowing away the object
- C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn);
- fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin);
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ list<frag_t> ls;
+ if (!in->dirfragtree.is_leaf(frag_t()))
+ in->dirfragtree.get_leaves(ls);
+ ls.push_back(frag_t());
+ for (list<frag_t>::iterator p = ls.begin();
+ p != ls.end();
+ ++p) {
+ object_t oid = CInode::get_object_name(in->inode.ino, *p, "");
+ dout(10) << "purge_stray remove dirfrag " << oid << dendl;
+ mds->objecter->remove(oid, oloc, nullsnapc, ceph_clock_now(g_ceph_context),
+ 0, NULL, gather.new_sub());
+ }
+ assert(gather.has_subs());
+ gather.activate();
+ return;
+ }
+
+ const SnapContext *snapc;
+ SnapRealm *realm = in->find_snaprealm();
+ if (realm) {
+ dout(10) << " realm " << *realm << dendl;
+ snapc = &realm->get_snap_context();
} else {
- // not a dir or file; purged!
- _purge_stray_purged(dn);
+ dout(10) << " NO realm, using null context" << dendl;
+ snapc = &nullsnapc;
+ assert(in->last == CEPH_NOSNAP);
+ }
+
+ if (in->is_file()) {
+ uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
+ (uint64_t)in->inode.layout.fl_stripe_count;
+ uint64_t cur_max_size = in->inode.get_max_size();
+ uint64_t to = MAX(in->inode.size, cur_max_size);
+ if (to && period) {
+ uint64_t num = (to + period - 1) / period;
+ dout(10) << "purge_stray 0~" << to << " objects 0~" << num
+ << " snapc " << snapc << " on " << *in << dendl;
+ mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
+ 0, num, ceph_clock_now(g_ceph_context), 0,
+ gather.new_sub());
+ }
}
+
+ inode_t *pi = in->get_projected_inode();
+ object_t oid = CInode::get_object_name(pi->ino, frag_t(), "");
+ // remove the backtrace object if it was not purged
+ if (!gather.has_subs()) {
+ object_locator_t oloc(pi->layout.fl_pg_pool);
+ dout(10) << "purge_stray remove backtrace object " << oid
+ << " pool " << oloc.pool << " snapc " << snapc << dendl;
+ mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+ NULL, gather.new_sub());
+ }
+ // remove old backtrace objects
+ for (vector<int64_t>::iterator p = pi->old_pools.begin();
+ p != pi->old_pools.end();
+ ++p) {
+ object_locator_t oloc(*p);
+ dout(10) << "purge_stray remove backtrace object " << oid
+ << " old pool " << *p << " snapc " << snapc << dendl;
+ mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+ NULL, gather.new_sub());
+ }
+ assert(gather.has_subs());
+ gather.activate();
}
class C_MDC_PurgeStrayLogged : public Context {
@@ -9480,9 +9469,6 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
CInode *in = dn->get_linkage()->get_inode();
dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
- dn->state_clear(CDentry::STATE_PURGING);
- dn->put(CDentry::PIN_PURGING);
-
assert(!in->state_test(CInode::STATE_RECOVERING));
// unlink
@@ -9493,11 +9479,13 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
dn->dir->pop_and_dirty_projected_fnode(ls);
+ in->state_clear(CInode::STATE_ORPHAN);
+ dn->state_clear(CDentry::STATE_PURGING);
+ dn->put(CDentry::PIN_PURGING);
+
// drop inode
if (in->is_dirty())
in->mark_clean();
- if (in->is_dirty_parent())
- in->clear_dirty_parent();
remove_inode(in);
@@ -10639,7 +10627,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m)
!in->state_test(CInode::STATE_EXPORTINGCAPS))
migrator->export_caps(in);
- lru.lru_bottouch(straydn); // move stray to end of lru
+ touch_dentry_bottom(straydn); // move stray to end of lru
straydn = NULL;
} else {
assert(!straydn);
@@ -10649,7 +10637,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m)
assert(dnl->is_null());
// move to bottom of lru
- lru.lru_bottouch(dn);
+ touch_dentry_bottom(dn);
}
}
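
The delayed_eval_stray list introduced in this file lets reference-drop paths defer stray evaluation: eval_stray(dn, true) only queues the dentry, and trim() drains the queue up front, where purging is safe to start. The drain-then-process idiom, sketched with a plain std::list standing in for Ceph's intrusive elist:

    #include <list>

    struct Dentry { bool queued = false; };

    struct Cache {
      std::list<Dentry*> delayed;            // stands in for elist<CDentry*>

      void eval_stray(Dentry *dn, bool delay) {
        if (delay) {
          if (!dn->queued) {                 // mirrors item_stray.is_on_list()
            dn->queued = true;
            delayed.push_back(dn);
          }
          return;
        }
        // ... decide whether to purge or reintegrate dn ...
      }

      void trim() {
        // process delayed eval_stray() before normal LRU trimming
        while (!delayed.empty()) {
          Dentry *dn = delayed.front();
          delayed.pop_front();
          dn->queued = false;
          eval_stray(dn, false);
        }
        // ... expire from the LRU as before ...
      }
    };
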
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index b4b57da84b2..d8f2a9486fb 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -19,6 +19,7 @@
#include "include/types.h"
#include "include/filepath.h"
+#include "include/elist.h"
#include "CInode.h"
#include "CDentry.h"
@@ -564,7 +565,7 @@ public:
// trimming
bool trim(int max = -1); // trim cache
- void trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap);
+ bool trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap);
void trim_dirfrag(CDir *dir, CDir *con,
map<int, MCacheExpire*>& expiremap);
void trim_inode(CDentry *dn, CInode *in, CDir *con,
@@ -646,6 +647,15 @@ public:
}
void touch_dentry_bottom(CDentry *dn) {
lru.lru_bottouch(dn);
+ if (dn->get_projected_linkage()->is_primary()) {
+ CInode *in = dn->get_projected_linkage()->get_inode();
+ if (in->has_dirfrags()) {
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
+ (*p)->touch_dentries_bottom();
+ }
+ }
}
protected:
@@ -858,31 +868,28 @@ public:
// -- stray --
public:
+ elist<CDentry*> delayed_eval_stray;
+
void scan_stray_dir();
- void eval_stray(CDentry *dn);
+ void eval_stray(CDentry *dn, bool delay=false);
void eval_remote(CDentry *dn);
- void maybe_eval_stray(CInode *in) {
+ void maybe_eval_stray(CInode *in, bool delay=false) {
if (in->inode.nlink > 0 || in->is_base())
return;
CDentry *dn = in->get_projected_parent_dn();
- if (dn->get_projected_linkage()->is_primary() &&
- dn->get_dir()->get_inode()->is_stray() &&
- !dn->is_replicated())
- eval_stray(dn);
+ if (!dn->state_test(CDentry::STATE_PURGING) &&
+ dn->get_projected_linkage()->is_primary() &&
+ dn->get_dir()->get_inode()->is_stray())
+ eval_stray(dn, delay);
}
protected:
void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
- void remove_backtrace(inodeno_t ino, int64_t pool, Context *fin);
- void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r);
- void _purge_stray(CDentry *dn, int r);
void purge_stray(CDentry *dn);
void _purge_stray_purged(CDentry *dn, int r=0);
void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls);
friend class C_MDC_FetchedBacktrace;
- friend class C_MDC_PurgeForwardingPointers;
- friend class C_MDC_PurgeStray;
friend class C_MDC_PurgeStrayLogged;
friend class C_MDC_PurgeStrayLoggedTruncate;
friend class C_MDC_PurgeStrayPurged;
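
delayed_eval_stray is intrusive: each CDentry embeds its own item_stray link, and the MDCache constructor change above passes member_offset(CDentry, item_stray) so the list head knows where the link lives inside each object. That gives O(1) insert/remove and an is-on-list test with no allocation. A rough model of the offset arithmetic (Ceph's real implementation is include/elist.h; these types are stand-ins):

    #include <cstddef>

    struct item { item *prev = nullptr, *next = nullptr; };

    struct Dentry {
      int value = 0;
      item item_stray;        // embedded link, like CDentry::item_stray
    };

    // Recover the enclosing object from its embedded link via the offset.
    inline Dentry *owner_of(item *i, std::size_t off) {
      return reinterpret_cast<Dentry *>(reinterpret_cast<char *>(i) - off);
    }

    int main() {
      std::size_t off = offsetof(Dentry, item_stray);  // what member_offset() yields
      Dentry d;
      return owner_of(&d.item_stray, off) == &d ? 0 : 1;
    }
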
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 1ace72e0ac3..cacbebfd3f6 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -499,7 +499,11 @@ void MDLog::_replay_thread()
if (journaler->get_error()) {
r = journaler->get_error();
dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
- if (r == -EINVAL) {
+ if (r == -ENOENT) {
+ // journal has been trimmed by somebody else?
+ assert(journaler->is_readonly());
+ r = -EAGAIN;
+ } else if (r == -EINVAL) {
if (journaler->get_read_pos() < journaler->get_expire_pos()) {
// this should only happen if you're following somebody else
assert(journaler->is_readonly());
@@ -605,7 +609,7 @@ void MDLog::_replay_thread()
}
dout(10) << "_replay_thread kicking waiters" << dendl;
- finish_contexts(g_ceph_context, waitfor_replay, 0);
+ finish_contexts(g_ceph_context, waitfor_replay, r);
dout(10) << "_replay_thread finish" << dendl;
mds->mds_lock.Unlock();
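
Two replay fixes here: an -ENOENT from the journaler (the journal was trimmed while a read-only standby was following it) is remapped to -EAGAIN, and the waiters are now finished with the real result r instead of a hard-coded 0, so callers can distinguish retry from success. A minimal waiter following that convention (a simplified Context, not the real Ceph class):

    #include <cerrno>
    #include <iostream>

    struct Context {
      virtual ~Context() {}
      virtual void finish(int r) = 0;
      void complete(int r) { finish(r); delete this; }
    };

    struct C_ReplayDone : Context {
      void finish(int r) override {
        if (r == -EAGAIN)
          std::cout << "journal moved under us; retry replay\n";
        else if (r < 0)
          std::cout << "replay failed: " << r << "\n";
        else
          std::cout << "replay ok\n";
      }
    };

    int main() {
      // finish_contexts(..., waitfor_replay, r) does roughly this per waiter:
      (new C_ReplayDone)->complete(-EAGAIN);
      return 0;
    }
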
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index fc05ca0ecb7..c2e0bbbe369 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -800,7 +800,9 @@ void MDS::handle_command(MMonCommand *m)
clog.info() << "tcmalloc not enabled, can't use heap profiler commands\n";
else {
ostringstream ss;
- ceph_heap_profiler_handle_command(m->cmd, ss);
+ vector<std::string> cmdargs;
+ cmdargs.insert(cmdargs.begin(), m->cmd.begin()+1, m->cmd.end());
+ ceph_heap_profiler_handle_command(cmdargs, ss);
clog.info() << ss.str();
}
} else dout(0) << "unrecognized command! " << m->cmd << dendl;
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 1646a134ad5..f1ab9b112d8 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -470,7 +470,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(cas_pool, bl);
// kclient ignores everything from here
- __u16 ev = 5;
+ __u16 ev = 6;
::encode(ev, bl);
::encode(compat, bl);
::encode(metadata_pool, bl);
@@ -483,6 +483,8 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(failed, bl);
::encode(stopped, bl);
::encode(last_failure_osd_epoch, bl);
+ ::encode(ever_allowed_snaps, bl);
+ ::encode(explicitly_allowed_snaps, bl);
ENCODE_FINISH(bl);
}
}
@@ -540,5 +542,12 @@ void MDSMap::decode(bufferlist::iterator& p)
::decode(stopped, p);
if (ev >= 4)
::decode(last_failure_osd_epoch, p);
+ if (ev >= 6) {
+ ::decode(ever_allowed_snaps, p);
+ ::decode(explicitly_allowed_snaps, p);
+ } else {
+ ever_allowed_snaps = true;
+ explicitly_allowed_snaps = false;
+ }
DECODE_FINISH(p);
}
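
The encoding version bumps from 5 to 6 to carry the two snap flags, and decode supplies defaults when reading an older map: ever_allowed_snaps = true (snapshots were never gated before) and explicitly_allowed_snaps = false. A toy model of the ev-guard pattern, with a byte vector standing in for the bufferlist machinery:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Stream {
      std::vector<uint8_t> buf;
      std::size_t off = 0;
      void put8(uint8_t v) { buf.push_back(v); }
      uint8_t get8()       { return buf[off++]; }
    };

    struct Map {
      bool ever_allowed_snaps = false;
      bool explicitly_allowed_snaps = false;

      void encode(Stream &s) const {
        s.put8(6);                          // ev bumped from 5 by this patch
        s.put8(ever_allowed_snaps);
        s.put8(explicitly_allowed_snaps);
      }
      void decode(Stream &s) {
        uint8_t ev = s.get8();
        if (ev >= 6) {                      // new fields are guarded by ev
          ever_allowed_snaps = s.get8();
          explicitly_allowed_snaps = s.get8();
        } else {                            // old encoder: legacy defaults
          ever_allowed_snaps = true;        // snaps were always allowed before
          explicitly_allowed_snaps = false;
        }
      }
    };
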
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 5bfc7cc20d5..5eadf156a95 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -175,6 +175,9 @@ protected:
map<int32_t,uint64_t> up; // who is in those roles
map<uint64_t,mds_info_t> mds_info;
+  bool ever_allowed_snaps; ///< the cluster has ever allowed snap creation
+  bool explicitly_allowed_snaps; ///< the user has explicitly enabled snap creation
+
public:
CompatSet compat;
@@ -188,7 +191,9 @@ public:
max_file_size(0),
cas_pool(-1),
metadata_pool(0),
- max_mds(0)
+ max_mds(0),
+ ever_allowed_snaps(false),
+ explicitly_allowed_snaps(false)
{ }
utime_t get_session_timeout() {
@@ -201,6 +206,14 @@ public:
void set_flag(int f) { flags |= f; }
void clear_flag(int f) { flags &= ~f; }
+ void set_snaps_allowed() {
+ set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
+ ever_allowed_snaps = true;
+ explicitly_allowed_snaps = true;
+ }
+ bool allows_snaps() { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+ void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 466d4818456..869f3773441 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1167,10 +1167,11 @@ void Server::dispatch_client_request(MDRequest *mdr)
// inodes ops.
case CEPH_MDS_OP_LOOKUP:
- case CEPH_MDS_OP_LOOKUPSNAP:
handle_client_getattr(mdr, true);
break;
+ case CEPH_MDS_OP_LOOKUPSNAP:
+ // lookupsnap does not reference a CDentry; treat it as a getattr
case CEPH_MDS_OP_GETATTR:
handle_client_getattr(mdr, false);
break;
@@ -4909,8 +4910,10 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
inode_t *pi = in->project_inode();
mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
pi->version = in->pre_dirty();
- pi->nlink--;
pi->ctime = mdr->now;
+ pi->nlink--;
+ if (pi->nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
if (dnl->is_primary()) {
// primary link. add stray dentry.
@@ -6054,8 +6057,10 @@ void Server::_rename_prepare(MDRequest *mdr,
pi->nlink--;
}
if (tpi) {
- tpi->nlink--;
tpi->ctime = mdr->now;
+ tpi->nlink--;
+ if (tpi->nlink == 0)
+ oldin->state_set(CInode::STATE_ORPHAN);
}
}
@@ -7157,6 +7162,12 @@ struct C_MDS_mksnap_finish : public Context {
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mksnap(MDRequest *mdr)
{
+ if (!mds->mdsmap->allows_snaps()) {
+    // snapshots can't be created until allow_new_snaps is explicitly set
+ reply_request(mdr, -EPERM);
+ return;
+ }
+
MClientRequest *req = mdr->client_request;
CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
if (!diri || diri->state_test(CInode::STATE_PURGING)) {
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 902e3104aa8..2a3874818b7 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -1134,8 +1134,9 @@ class MDSCacheObject {
// -- state --
const static int STATE_AUTH = (1<<30);
const static int STATE_DIRTY = (1<<29);
- const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy
- const static int STATE_REJOINUNDEF = (1<<27); // contents undefined.
+  const static int STATE_NOTIFYREF  = (1<<28); // notify of each ref drop via _put()
+ const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy
+ const static int STATE_REJOINUNDEF = (1<<26); // contents undefined.
// -- wait --
@@ -1221,6 +1222,7 @@ protected:
#endif
assert(ref > 0);
}
+ virtual void _put() {}
void put(int by) {
#ifdef MDS_REF_SET
if (ref == 0 || ref_map[by] == 0) {
@@ -1236,6 +1238,8 @@ protected:
#endif
if (ref == 0)
last_put();
+ if (state_test(STATE_NOTIFYREF))
+ _put();
}
}
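
put() now gives flagged objects a per-drop callback: anything carrying STATE_NOTIFYREF (stray dentries, orphan inodes) has its virtual _put() invoked on every reference drop, so the cache can schedule stray evaluation exactly when the last interesting pin goes away, while unflagged objects pay only a bit test. A compact standalone model of the hook:

    #include <iostream>

    class CacheObject {
      int ref = 0;
      unsigned state = 0;
    protected:
      virtual void last_put() {}
      virtual void _put() {}               // opt-in per-drop notification
    public:
      virtual ~CacheObject() {}
      static const unsigned STATE_NOTIFYREF = 1u << 28;
      void state_set(unsigned s) { state |= s; }
      void get() { ++ref; }
      void put() {
        if (--ref == 0)
          last_put();
        if (state & STATE_NOTIFYREF)       // cheap test on the common path
          _put();
      }
      int get_num_ref() const { return ref; }
    };

    struct StrayDentry : CacheObject {
      void _put() override {
        // mirrors CDentry::_put(): if only bookkeeping pins remain,
        // ask the cache to (re)evaluate this stray
        std::cout << "refs now " << get_num_ref() << ", maybe eval stray\n";
      }
    };
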
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index b2273274521..48c1c99d584 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -920,6 +920,36 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
r = 0;
}
+ } else if (prefix == "mds set") {
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ if (key == "allow_new_snaps") {
+ if (sure != "--yes-i-really-mean-it") {
+ ss << "Snapshots are unstable and will probably break your FS! Add --yes-i-really-mean-it if you are sure";
+ r = -EPERM;
+ } else {
+ pending_mdsmap.set_snaps_allowed();
+ ss << "turned on snaps";
+ r = 0;
+ }
+ }
+ } else if (prefix == "mds unset") {
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ if (key == "allow_new_snaps") {
+ if (sure != "--yes-i-really-mean-it") {
+ ss << "this won't get rid of snapshots or restore the cluster if it's broken. Add --yes-i-really-mean-it if you are sure";
+ r = -EPERM;
+ } else {
+ pending_mdsmap.clear_snaps_allowed();
+ ss << "disabled new snapshots";
+ r = 0;
+ }
+ }
} else if (prefix == "mds add_data_pool") {
int64_t poolid;
cmd_getval(g_ceph_context, cmdmap, "poolid", poolid);
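
Combined with the mds set/unset commands declared in MonCommands.h below, snapshot creation becomes an explicit, confirmed opt-in. A hypothetical session (responses paraphrased from the strings above; the CLI's exact error framing may differ):

    $ ceph mds set allow_new_snaps
    Error EPERM: Snapshots are unstable and will probably break your FS! Add --yes-i-really-mean-it if you are sure
    $ ceph mds set allow_new_snaps --yes-i-really-mean-it
    turned on snaps
    $ ceph mds unset allow_new_snaps --yes-i-really-mean-it
    disabled new snapshots
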
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 365fd28b64e..33e00a98d30 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -59,7 +59,7 @@
* CephString: optional badchars
* CephSocketpath: validation involves "is it S_ISSOCK"
* CephIPAddr: v4 or v6 addr with optional port, syntax validated
- * CephEntityAddr: CephIPAddr + '/nonce'
+ * CephEntityAddr: CephIPAddr + optional '/nonce'
 * CephPoolname: Plain old string
 * CephObjectname: Another plain old string
* CephPgid: n.xxx where n is an int > 0, xxx is a hex number > 0
@@ -210,8 +210,8 @@ COMMAND("quorum_status", "report status of monitor quorum", \
"mon", "r", "cli,rest")
COMMAND("mon_status", "report status of monitors", "mon", "r", "cli,rest")
COMMAND("sync force " \
- "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it " \
- "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing", \
+ "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it,req=false " \
+ "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing,req=false", \
"force sync of and clear monitor store", "mon", "rw", "cli,rest")
COMMAND("heap " \
"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
@@ -274,6 +274,15 @@ COMMAND("mds compat rm_compat " \
COMMAND("mds compat rm_incompat " \
"name=feature,type=CephInt,range=0", \
"remove incompatible feature", "mds", "rw", "cli,rest")
+COMMAND("mds set " \
+ "name=key,type=CephChoices,strings=allow_new_snaps " \
+ "name=sure,type=CephString,req=false", \
+ "set <key>", \
+ "mds", "w", "cli,rest")
+COMMAND("mds unset " \
+ "name=key,type=CephChoices,strings=allow_new_snaps " \
+ "name=sure,type=CephString,req=false", \
+ "unset <key>", "mds", "w", "cli,rest")
COMMAND("mds add_data_pool " \
"name=poolid,type=CephInt,range=0", \
"add data pool <poolid>", "mds", "rw", "cli,rest")
@@ -283,14 +292,14 @@ COMMAND("mds remove_data_pool " \
COMMAND("mds newfs " \
"name=metadata,type=CephInt,range=0 " \
"name=data,type=CephInt,range=0 " \
- "name=sure,type=CephChoices,strings=--yes-i-really-mean-it", \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"make new filesystom using pools <metadata> and <data>", \
"mds", "rw", "cli,rest")
/*
* Monmap commands
*/
COMMAND("mon dump " \
- "name=epoch,type=CephInt,req=false", \
+ "name=epoch,type=CephInt,range=0,req=false", \
"dump formatted monmap (optionally from epoch)", \
"mon", "r", "cli,rest")
COMMAND("mon stat", "summarize monitor status", "mon", "r", "cli,rest")
@@ -456,7 +465,7 @@ COMMAND("osd reweight " \
"reweight osd to 0.0 < <weight> < 1.0", "osd", "rw", "cli,rest")
COMMAND("osd lost " \
"name=id,type=CephInt,range=0 " \
- "name=sure,type=CephChoices,strings=--yes-i-really-mean-it", \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"mark osd as permanently lost. THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL", \
"osd", "rw", "cli,rest")
COMMAND("osd create " \
@@ -484,9 +493,9 @@ COMMAND("osd pool create " \
"create pool", "osd", "rw", "cli,rest")
COMMAND("osd pool delete " \
"name=pool,type=CephPoolname " \
- "name=pool2,type=CephPoolname " \
- "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it", \
- "delete pool (say pool twice, add --yes-i-really-really-mean-it)", \
+ "name=pool2,type=CephPoolname,req=false " \
+ "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it,req=false", \
+ "delete pool", \
"osd", "rw", "cli,rest")
COMMAND("osd pool rename " \
"name=srcpool,type=CephPoolname " \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 10f5bfb149c..2c64a8f2ef2 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -1854,13 +1854,7 @@ void Monitor::get_status(stringstream &ss, Formatter *f)
}
#undef COMMAND
-struct MonCommand {
- string cmdstring;
- string helpstring;
- string module;
- string req_perms;
- string availability;
-} mon_commands[] = {
+MonCommand mon_commands[] = {
#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
{parsesig, helptext, modulename, req_perms, avail},
#include <mon/MonCommands.h>
@@ -1909,6 +1903,26 @@ bool Monitor::_allowed_command(MonSession *s, string &module, string &prefix,
return capable;
}
+void get_command_descriptions(const MonCommand *commands,
+ unsigned commands_size,
+ Formatter *f,
+ bufferlist *rdata) {
+ int cmdnum = 0;
+ f->open_object_section("command_descriptions");
+ for (const MonCommand *cp = commands;
+ cp < &commands[commands_size]; cp++) {
+
+ ostringstream secname;
+ secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+ dump_cmddesc_to_json(f, secname.str(),
+ cp->cmdstring, cp->helpstring, cp->module,
+ cp->req_perms, cp->availability);
+ cmdnum++;
+ }
+ f->close_section(); // command_descriptions
+
+ f->flush(*rdata);
+}
void Monitor::handle_command(MMonCommand *m)
{
@@ -1953,23 +1967,9 @@ void Monitor::handle_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
if (prefix == "get_command_descriptions") {
- int cmdnum = 0;
- Formatter *f = new_formatter("json");
- f->open_object_section("command_descriptions");
- for (MonCommand *cp = mon_commands;
- cp < &mon_commands[ARRAY_SIZE(mon_commands)]; cp++) {
-
- ostringstream secname;
- secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
- dump_cmddesc_to_json(f, secname.str(),
- cp->cmdstring, cp->helpstring, cp->module,
- cp->req_perms, cp->availability);
- cmdnum++;
- }
- f->close_section(); // command_descriptions
-
bufferlist rdata;
- f->flush(rdata);
+ Formatter *f = new_formatter("json");
+ get_command_descriptions(mon_commands, ARRAY_SIZE(mon_commands), f, &rdata);
delete f;
reply_command(m, 0, "", rdata, 0);
return;
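
Factoring the JSON dump into a free get_command_descriptions() taking the table, its size, a Formatter, and an output bufferlist makes the command-table serialization exercisable without a live Monitor. A sketch of calling it, say from a unit test, using only names visible in this patch (dump_commands itself is illustrative):

    #include "mon/Monitor.h"
    #include "common/Formatter.h"

    // Serialize a command table to JSON the way handle_command() now does.
    void dump_commands(const MonCommand *table, unsigned n)
    {
      bufferlist rdata;
      Formatter *f = new_formatter("json");
      get_command_descriptions(table, n, f, &rdata);
      delete f;
      // rdata now holds {"command_descriptions": {"cmd000": ..., ...}}
    }
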
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index df4a751361a..9b304428732 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -844,5 +844,17 @@ public:
long parse_pos_long(const char *s, ostream *pss = NULL);
+struct MonCommand {
+ string cmdstring;
+ string helpstring;
+ string module;
+ string req_perms;
+ string availability;
+};
+
+void get_command_descriptions(const MonCommand *commands,
+ unsigned commands_size,
+ Formatter *f,
+ bufferlist *rdata);
#endif
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 36fe6d345f2..9144736d801 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -120,7 +120,12 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
* We will possibly have a stashed latest that *we* wrote, and we will
* always be sure to have the oldest full map in the first..last range
* due to encode_trim_extra(), which includes the oldest full map in the trim
- * transaction. Start with whichever is newer.
+ * transaction.
+ *
+ * encode_trim_extra() does not however write the full map's
+ * version to 'full_latest'. This is only done when we are building the
+ * full maps from the incremental versions. But don't panic! We make sure
+ * that the following conditions find whichever full map version is newer.
*/
version_t latest_full = get_version_latest_full();
if (latest_full == 0 && get_first_committed() > 1)
@@ -179,32 +184,49 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
}
// walk through incrementals
- MonitorDBStore::Transaction t;
+ MonitorDBStore::Transaction *t = NULL;
+ size_t tx_size = 0;
while (version > osdmap.epoch) {
bufferlist inc_bl;
int err = get_version(osdmap.epoch+1, inc_bl);
assert(err == 0);
assert(inc_bl.length());
-
+
dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl;
OSDMap::Incremental inc(inc_bl);
err = osdmap.apply_incremental(inc);
assert(err == 0);
+ if (t == NULL)
+ t = new MonitorDBStore::Transaction;
+
// write out the full map for all past epochs
bufferlist full_bl;
osdmap.encode(full_bl);
- put_version_full(&t, osdmap.epoch, full_bl);
+ tx_size += full_bl.length();
+
+ put_version_full(t, osdmap.epoch, full_bl);
+ put_version_latest_full(t, osdmap.epoch);
// share
dout(1) << osdmap << dendl;
if (osdmap.epoch == 1) {
- t.erase("mkfs", "osdmap");
+ t->erase("mkfs", "osdmap");
+ }
+
+ if (tx_size > g_conf->mon_sync_max_payload_size*2) {
+ mon->store->apply_transaction(*t);
+ delete t;
+ t = NULL;
+ tx_size = 0;
}
}
- if (!t.empty())
- mon->store->apply_transaction(t);
+
+ if (t != NULL) {
+ mon->store->apply_transaction(*t);
+ delete t;
+ }
for (int o = 0; o < osdmap.get_max_osd(); o++) {
if (osdmap.is_down(o)) {
@@ -620,7 +642,6 @@ void OSDMonitor::encode_trim_extra(MonitorDBStore::Transaction *tx, version_t fi
bufferlist bl;
get_version_full(first, bl);
put_version_full(tx, first, bl);
- put_version_latest_full(tx, first);
}
// -------------
@@ -2154,7 +2175,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
osdmap.get_inst(i));
}
r = 0;
- ss << " instructed to " << whostr;
+ ss << " instructed to " << pvec.back();
} else {
long osd = parse_osd_id(whostr.c_str(), &ss);
if (osd < 0) {
@@ -2526,7 +2547,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
i != properties.end();
i++) {
size_t equal = i->find('=');
- if (equal != string::npos)
+ if (equal == string::npos)
pi->properties[*i] = string();
else {
const string key = i->substr(0, equal);
@@ -3001,7 +3022,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "weight", w);
err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
- if (err == 0) {
+ if (err >= 0) {
pending_inc.crush.clear();
newcrush.encode(pending_inc.crush);
ss << "reweighted item id " << id << " name '" << name << "' to " << w
@@ -3599,7 +3620,7 @@ done:
ss << "specified pg_num " << n << " <= current " << p->get_pg_num();
err = -EINVAL;
} else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
- ss << "currently creating pgs, wait";
+ ss << "busy creating pgs; try again later";
err = -EAGAIN;
} else {
pending_inc.get_new_pool(pool, p)->set_pg_num(n);
@@ -3609,7 +3630,7 @@ done:
if (n > p->get_pg_num()) {
ss << "specified pgp_num " << n << " > pg_num " << p->get_pg_num();
} else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
- ss << "still creating pgs, wait";
+ ss << "busy creating pgs; try again later";
err = -EAGAIN;
} else {
pending_inc.get_new_pool(pool, p)->set_pgp_num(n);
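
The update_from_paxos() hunk above stops accumulating every rebuilt full map in a single giant transaction: it flushes whenever the pending payload exceeds mon_sync_max_payload_size*2, bounding memory while catching up across many epochs. The batching shape, with toy stand-in types:

    #include <cstddef>
    #include <memory>
    #include <string>
    #include <vector>

    struct Txn { std::vector<std::string> ops; };
    void apply(const Txn &) { /* commit to the store */ }

    // Apply encoded full maps in bounded batches rather than all at once.
    void apply_batched(const std::vector<std::string> &full_maps,
                       std::size_t max_tx_bytes)
    {
      std::unique_ptr<Txn> t;
      std::size_t tx_size = 0;
      for (const std::string &bl : full_maps) {
        if (!t) t.reset(new Txn);
        t->ops.push_back(bl);
        tx_size += bl.size();
        if (tx_size > max_tx_bytes) {   // flush before the txn gets huge
          apply(*t);
          t.reset();
          tx_size = 0;
        }
      }
      if (t) apply(*t);                 // final partial batch
    }
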
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 2a677be61d9..0f495052747 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1847,6 +1847,54 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
detail->push_back(make_pair(HEALTH_ERR, ss.str()));
}
}
+
+ // pg skew
+ int num_in = mon->osdmon()->osdmap.get_num_in_osds();
+ if (num_in && g_conf->mon_pg_warn_min_per_osd > 0) {
+ int per = pg_map.pg_stat.size() / num_in;
+ if (per < g_conf->mon_pg_warn_min_per_osd) {
+ ostringstream ss;
+ ss << "too few pgs per osd (" << per << " < min " << g_conf->mon_pg_warn_min_per_osd << ")";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail)
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ if (!pg_map.pg_stat.empty()) {
+ for (hash_map<int,pool_stat_t>::const_iterator p = pg_map.pg_pool_sum.begin();
+ p != pg_map.pg_pool_sum.end();
+ ++p) {
+ const pg_pool_t *pi = mon->osdmon()->osdmap.get_pg_pool(p->first);
+ if (!pi)
+ continue; // in case osdmap changes haven't propagated to PGMap yet
+ if (pi->get_pg_num() > pi->get_pgp_num()) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " pg_num "
+ << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail)
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
+ if (average_objects_per_pg > 0) {
+ int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
+ float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
+ if (g_conf->mon_pg_warn_max_object_skew > 0 &&
+ ratio > g_conf->mon_pg_warn_max_object_skew) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " has too few pgs";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " objects per pg ("
+ << objects_per_pg << ") is more than " << ratio << " times cluster average ("
+ << average_objects_per_pg << ")";
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ }
+ }
+ }
}
void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summary,
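
The added health checks warn when there are too few PGs per in-OSD and when a pool's objects-per-PG exceeds mon_pg_warn_max_object_skew times the cluster average. A worked example of the skew math with made-up numbers (10.0 is assumed as the option's default):

    #include <iostream>

    int main() {
      long total_objects = 10000, total_pgs = 1000;   // cluster-wide
      long pool_objects  = 8000,  pool_pgs  = 64;     // one busy pool
      double max_skew    = 10.0;  // mon_pg_warn_max_object_skew (assumed default)

      long avg   = total_objects / total_pgs;         // 10 objects per pg
      long perpg = pool_objects / pool_pgs;           // 125 objects per pg
      double ratio = double(perpg) / double(avg);     // 12.5

      if (max_skew > 0 && ratio > max_skew)
        std::cout << "HEALTH_WARN: pool has too few pgs (ratio " << ratio << ")\n";
      return 0;
    }
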
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index 50656fee53b..66b64d0097a 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -1136,6 +1136,19 @@ void Pipe::unregister_pipe()
}
}
+void Pipe::join()
+{
+ ldout(msgr->cct, 20) << "join" << dendl;
+ if (writer_thread.is_started())
+ writer_thread.join();
+ if (reader_thread.is_started())
+ reader_thread.join();
+ if (delay_thread) {
+ ldout(msgr->cct, 20) << "joining delay_thread" << dendl;
+ delay_thread->stop();
+ delay_thread->join();
+ }
+}
void Pipe::requeue_sent()
{
diff --git a/src/msg/Pipe.h b/src/msg/Pipe.h
index 5f94305350c..6c91395a352 100644
--- a/src/msg/Pipe.h
+++ b/src/msg/Pipe.h
@@ -234,16 +234,7 @@ class DispatchQueue;
void register_pipe();
void unregister_pipe();
- void join() {
- if (writer_thread.is_started())
- writer_thread.join();
- if (reader_thread.is_started())
- reader_thread.join();
- if (delay_thread) {
- delay_thread->stop();
- delay_thread->join();
- }
- }
+ void join();
void stop();
void _send(Message *m) {
diff --git a/src/msg/msg_types.cc b/src/msg/msg_types.cc
index 38416abd4f2..b02db768bfb 100644
--- a/src/msg/msg_types.cc
+++ b/src/msg/msg_types.cc
@@ -135,7 +135,7 @@ bool entity_addr_t::parse(const char *s, const char **end)
ostream& operator<<(ostream& out, const sockaddr_storage &ss)
{
char buf[NI_MAXHOST] = { 0 };
- char serv[20] = { 0 };
+ char serv[NI_MAXSERV] = { 0 };
size_t hostlen;
if (ss.ss_family == AF_INET)
diff --git a/src/objsync/boto_del.py b/src/objsync/boto_del.py
index 14e790544ec..ba512e1ca33 100755
--- a/src/objsync/boto_del.py
+++ b/src/objsync/boto_del.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
#
# Ceph - scalable distributed file system
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h
index 9b1ceae8c46..89b7b862632 100644
--- a/src/os/CollectionIndex.h
+++ b/src/os/CollectionIndex.h
@@ -23,7 +23,7 @@
#include "include/object.h"
/**
- * CollectionIndex provides an interface for manipulating indexed colelctions
+ * CollectionIndex provides an interface for manipulating indexed collections
*/
class CollectionIndex {
protected:
@@ -127,26 +127,26 @@ protected:
* @return Error Code, 0 for success
*/
virtual int created(
- const hobject_t &hoid, ///< [in] Created object.
+ const ghobject_t &oid, ///< [in] Created object.
const char *path ///< [in] Path to created object.
) = 0;
/**
- * Removes hoid from the collection
+ * Removes oid from the collection
*
* @return Error Code, 0 for success
*/
virtual int unlink(
- const hobject_t &hoid ///< [in] Object to remove
+ const ghobject_t &oid ///< [in] Object to remove
) = 0;
/**
- * Gets the IndexedPath for hoid.
+ * Gets the IndexedPath for oid.
*
* @return Error Code, 0 for success
*/
virtual int lookup(
- const hobject_t &hoid, ///< [in] Object to lookup
+ const ghobject_t &oid, ///< [in] Object to lookup
IndexedPath *path, ///< [out] Path to object
int *exist ///< [out] True if the object exists, else false
) = 0;
@@ -167,17 +167,17 @@ protected:
/// List contents of collection by hash
virtual int collection_list_partial(
- const hobject_t &start, ///< [in] object at which to start
+ const ghobject_t &start, ///< [in] object at which to start
int min_count, ///< [in] get at least min_count objects
int max_count, ///< [in] return at most max_count objects
snapid_t seq, ///< [in] list only objects with snap >= seq
- vector<hobject_t> *ls, ///< [out] Listed objects
- hobject_t *next ///< [out] Next object to list
+ vector<ghobject_t> *ls, ///< [out] Listed objects
+ ghobject_t *next ///< [out] Next object to list
) = 0;
/// List contents of collection.
virtual int collection_list(
- vector<hobject_t> *ls ///< [out] Listed Objects
+ vector<ghobject_t> *ls ///< [out] Listed Objects
) = 0;
/// Call prior to removing directory
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index 90c840bbe9c..635870b0db5 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -130,61 +130,68 @@ bool DBObjectMap::check(std::ostream &out)
return retval;
}
-string DBObjectMap::hobject_key(const hobject_t &hoid)
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
{
string out;
- append_escaped(hoid.oid.name, &out);
+ append_escaped(oid.hobj.oid.name, &out);
out.push_back('.');
- append_escaped(hoid.get_key(), &out);
+ append_escaped(oid.hobj.get_key(), &out);
out.push_back('.');
- append_escaped(hoid.nspace, &out);
+ append_escaped(oid.hobj.nspace, &out);
out.push_back('.');
char snap_with_hash[1000];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
- if (hoid.pool == -1)
+ if (oid.hobj.pool == -1)
t += snprintf(t, end - t, ".none");
else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)hoid.pool);
- snprintf(t, end - t, ".%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
+ snprintf(t, end - t, ".%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
+
+ if (oid.generation != ghobject_t::NO_GEN) {
+ assert(oid.shard_id != ghobject_t::NO_SHARD);
+
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
+ t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
+ }
out += string(snap_with_hash);
return out;
}
-string DBObjectMap::hobject_key_v0(coll_t c, const hobject_t &hoid)
+string DBObjectMap::ghobject_key_v0(coll_t c, const ghobject_t &oid)
{
string out;
append_escaped(c.to_str(), &out);
out.push_back('.');
- append_escaped(hoid.oid.name, &out);
+ append_escaped(oid.hobj.oid.name, &out);
out.push_back('.');
- append_escaped(hoid.get_key(), &out);
+ append_escaped(oid.hobj.get_key(), &out);
out.push_back('.');
char snap_with_hash[1000];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, ".head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, ".snapdir");
else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, ".%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, ".%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
out += string(snap_with_hash);
return out;
}
-bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c,
- hobject_t *hoid)
+bool DBObjectMap::parse_ghobject_key_v0(const string &in, coll_t *c,
+ ghobject_t *oid)
{
string coll;
string name;
@@ -244,13 +251,13 @@ bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c,
pg_t pg;
if (c->is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- (*hoid) = hobject_t(name, key, snap, hash, pool, "");
+ (*oid) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
-string DBObjectMap::map_header_key(const hobject_t &hoid)
+string DBObjectMap::map_header_key(const ghobject_t &oid)
{
- return hobject_key(hoid);
+ return ghobject_key(oid);
}
string DBObjectMap::header_key(uint64_t seq)
@@ -311,9 +318,9 @@ int DBObjectMap::DBObjectMapIteratorImpl::init()
}
ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
- const hobject_t &hoid)
+ const ghobject_t &oid)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return ObjectMapIterator(new EmptyIteratorImpl());
return _get_iterator(header);
@@ -496,15 +503,15 @@ int DBObjectMap::DBObjectMapIteratorImpl::status()
return r;
}
-int DBObjectMap::set_keys(const hobject_t &hoid,
+int DBObjectMap::set_keys(const ghobject_t &oid,
const map<string, bufferlist> &set,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->set(user_prefix(header), set);
@@ -512,15 +519,15 @@ int DBObjectMap::set_keys(const hobject_t &hoid,
return db->submit_transaction(t);
}
-int DBObjectMap::set_header(const hobject_t &hoid,
+int DBObjectMap::set_header(const ghobject_t &oid,
const bufferlist &bl,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
_set_header(header, bl, t);
return db->submit_transaction(t);
@@ -534,10 +541,10 @@ void DBObjectMap::_set_header(Header header, const bufferlist &bl,
t->set(sys_prefix(header), to_set);
}
-int DBObjectMap::get_header(const hobject_t &hoid,
+int DBObjectMap::get_header(const ghobject_t &oid,
bufferlist *bl)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header) {
return 0;
}
@@ -568,16 +575,16 @@ int DBObjectMap::_get_header(Header header,
return 0;
}
-int DBObjectMap::clear(const hobject_t &hoid,
+int DBObjectMap::clear(const ghobject_t &oid,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
- remove_map_header(hoid, header, t);
+ remove_map_header(oid, header, t);
assert(header->num_children > 0);
header->num_children--;
int r = _clear(header, t);
@@ -688,15 +695,15 @@ int DBObjectMap::need_parent(DBObjectMapIterator iter)
return 1;
}
-int DBObjectMap::rm_keys(const hobject_t &hoid,
+int DBObjectMap::rm_keys(const ghobject_t &oid,
const set<string> &to_clear,
const SequencerPosition *spos)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
KeyValueDB::Transaction t = db->get_transaction();
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->rmkeys(user_prefix(header), to_clear);
if (!header->parent) {
@@ -756,17 +763,17 @@ int DBObjectMap::rm_keys(const hobject_t &hoid,
parent->num_children--;
_clear(parent, t);
header->parent = 0;
- set_map_header(hoid, *header, t);
+ set_map_header(oid, *header, t);
t->rmkeys_by_prefix(complete_prefix(header));
}
return db->submit_transaction(t);
}
-int DBObjectMap::get(const hobject_t &hoid,
+int DBObjectMap::get(const ghobject_t &oid,
bufferlist *_header,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
_get_header(header, _header);
@@ -779,13 +786,13 @@ int DBObjectMap::get(const hobject_t &hoid,
return 0;
}
-int DBObjectMap::get_keys(const hobject_t &hoid,
+int DBObjectMap::get_keys(const ghobject_t &oid,
set<string> *keys)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- ObjectMapIterator iter = get_iterator(hoid);
+ ObjectMapIterator iter = get_iterator(oid);
for (; iter->valid(); iter->next()) {
if (iter->status())
return iter->status();
@@ -816,40 +823,40 @@ int DBObjectMap::scan(Header header,
return 0;
}
-int DBObjectMap::get_values(const hobject_t &hoid,
+int DBObjectMap::get_values(const ghobject_t &oid,
const set<string> &keys,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return scan(header, keys, 0, out);
}
-int DBObjectMap::check_keys(const hobject_t &hoid,
+int DBObjectMap::check_keys(const ghobject_t &oid,
const set<string> &keys,
set<string> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return scan(header, keys, out, 0);
}
-int DBObjectMap::get_xattrs(const hobject_t &hoid,
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
const set<string> &to_get,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return db->get(xattr_prefix(header), to_get, out);
}
-int DBObjectMap::get_all_xattrs(const hobject_t &hoid,
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
set<string> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
@@ -860,39 +867,39 @@ int DBObjectMap::get_all_xattrs(const hobject_t &hoid,
return iter->status();
}
-int DBObjectMap::set_xattrs(const hobject_t &hoid,
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
const map<string, bufferlist> &to_set,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->set(xattr_prefix(header), to_set);
return db->submit_transaction(t);
}
-int DBObjectMap::remove_xattrs(const hobject_t &hoid,
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
const set<string> &to_remove,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->rmkeys(xattr_prefix(header), to_remove);
return db->submit_transaction(t);
}
-int DBObjectMap::clone(const hobject_t &hoid,
- const hobject_t &target,
+int DBObjectMap::clone(const ghobject_t &oid,
+ const ghobject_t &target,
const SequencerPosition *spos)
{
- if (hoid == target)
+ if (oid == target)
return 0;
KeyValueDB::Transaction t = db->get_transaction();
@@ -907,18 +914,18 @@ int DBObjectMap::clone(const hobject_t &hoid,
}
}
- Header parent = lookup_map_header(hoid);
+ Header parent = lookup_map_header(oid);
if (!parent)
return db->submit_transaction(t);
- Header source = generate_new_header(hoid, parent);
+ Header source = generate_new_header(oid, parent);
Header destination = generate_new_header(target, parent);
if (spos)
destination->spos = *spos;
parent->num_children = 2;
set_header(parent, t);
- set_map_header(hoid, *source, t);
+ set_map_header(oid, *source, t);
set_map_header(target, *destination, t);
map<string, bufferlist> to_set;
@@ -973,9 +980,9 @@ int DBObjectMap::upgrade()
to_get);
coll_t coll;
- hobject_t hoid;
- assert(parse_hobject_key_v0(iter->key(), &coll, &hoid));
- new_map_headers[hobject_key(hoid)] = got.begin()->second;
+ ghobject_t oid;
+ assert(parse_ghobject_key_v0(iter->key(), &coll, &oid));
+ new_map_headers[ghobject_key(oid)] = got.begin()->second;
}
t->rmkeys(LEAF_PREFIX, legacy_to_remove);
@@ -1038,18 +1045,18 @@ int DBObjectMap::init(bool do_upgrade)
return 0;
}
-int DBObjectMap::sync(const hobject_t *hoid,
+int DBObjectMap::sync(const ghobject_t *oid,
const SequencerPosition *spos) {
KeyValueDB::Transaction t = db->get_transaction();
write_state(t);
- if (hoid) {
+ if (oid) {
assert(spos);
- Header header = lookup_map_header(*hoid);
+ Header header = lookup_map_header(*oid);
if (header) {
- dout(10) << "hoid: " << *hoid << " setting spos to "
+ dout(10) << "oid: " << *oid << " setting spos to "
<< *spos << dendl;
header->spos = *spos;
- set_map_header(*hoid, *header, t);
+ set_map_header(*oid, *header, t);
}
}
return db->submit_transaction_sync(t);
@@ -1067,27 +1074,27 @@ int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
}
-DBObjectMap::Header DBObjectMap::_lookup_map_header(const hobject_t &hoid)
+DBObjectMap::Header DBObjectMap::_lookup_map_header(const ghobject_t &oid)
{
- while (map_header_in_use.count(hoid))
+ while (map_header_in_use.count(oid))
header_cond.Wait(header_lock);
map<string, bufferlist> out;
set<string> to_get;
- to_get.insert(map_header_key(hoid));
+ to_get.insert(map_header_key(oid));
int r = db->get(HOBJECT_TO_SEQ, to_get, &out);
if (r < 0)
return Header();
if (out.empty())
return Header();
- Header ret(new _Header(), RemoveMapHeaderOnDelete(this, hoid));
+ Header ret(new _Header(), RemoveMapHeaderOnDelete(this, oid));
bufferlist::iterator iter = out.begin()->second.begin();
ret->decode(iter);
return ret;
}
-DBObjectMap::Header DBObjectMap::_generate_new_header(const hobject_t &hoid,
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
Header parent)
{
Header header = Header(new _Header(), RemoveOnDelete(this));
@@ -1097,7 +1104,7 @@ DBObjectMap::Header DBObjectMap::_generate_new_header(const hobject_t &hoid,
header->spos = parent->spos;
}
header->num_children = 1;
- header->hoid = hoid;
+ header->oid = oid;
assert(!in_use.count(header->seq));
in_use.insert(header->seq);
@@ -1137,14 +1144,14 @@ DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
}
DBObjectMap::Header DBObjectMap::lookup_create_map_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
KeyValueDB::Transaction t)
{
Mutex::Locker l(header_lock);
- Header header = _lookup_map_header(hoid);
+ Header header = _lookup_map_header(oid);
if (!header) {
- header = _generate_new_header(hoid, Header());
- set_map_header(hoid, *header, t);
+ header = _generate_new_header(oid, Header());
+ set_map_header(oid, *header, t);
}
return header;
}
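
The lookup_create_map_header() change above keeps the whole find-or-allocate sequence under header_lock, so two writers racing on the same oid cannot each generate a header. A minimal sketch of that shape, with generic stand-ins for ghobject_t, Header, and the KeyValueDB transaction (not the real Ceph types):

#include <map>
#include <memory>
#include <mutex>

template <typename K, typename V>
class HeaderCache {
  std::mutex lock;
  std::map<K, std::shared_ptr<V>> headers;
public:
  // Find the header for k, or create and register one, atomically:
  // the same pattern as lookup_create_map_header() under header_lock.
  std::shared_ptr<V> lookup_create(const K& k) {
    std::lock_guard<std::mutex> l(lock);
    auto it = headers.find(k);
    if (it != headers.end())
      return it->second;
    auto h = std::make_shared<V>();
    headers.emplace(k, h);
    return h;
  }
};
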
@@ -1169,50 +1176,50 @@ void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
t->set(sys_prefix(header), to_write);
}
-void DBObjectMap::remove_map_header(const hobject_t &hoid,
+void DBObjectMap::remove_map_header(const ghobject_t &oid,
Header header,
KeyValueDB::Transaction t)
{
dout(20) << "remove_map_header: removing " << header->seq
- << " hoid " << hoid << dendl;
+ << " oid " << oid << dendl;
set<string> to_remove;
- to_remove.insert(map_header_key(hoid));
+ to_remove.insert(map_header_key(oid));
t->rmkeys(HOBJECT_TO_SEQ, to_remove);
}
-void DBObjectMap::set_map_header(const hobject_t &hoid, _Header header,
+void DBObjectMap::set_map_header(const ghobject_t &oid, _Header header,
KeyValueDB::Transaction t)
{
dout(20) << "set_map_header: setting " << header.seq
- << " hoid " << hoid << " parent seq "
+ << " oid " << oid << " parent seq "
<< header.parent << dendl;
map<string, bufferlist> to_set;
- header.encode(to_set[map_header_key(hoid)]);
+ header.encode(to_set[map_header_key(oid)]);
t->set(HOBJECT_TO_SEQ, to_set);
}
-bool DBObjectMap::check_spos(const hobject_t &hoid,
+bool DBObjectMap::check_spos(const ghobject_t &oid,
Header header,
const SequencerPosition *spos)
{
if (!spos || *spos > header->spos) {
stringstream out;
if (spos)
- dout(10) << "hoid: " << hoid << " not skipping op, *spos "
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
<< *spos << dendl;
else
- dout(10) << "hoid: " << hoid << " not skipping op, *spos "
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
<< "empty" << dendl;
dout(10) << " > header.spos " << header->spos << dendl;
return false;
} else {
- dout(10) << "hoid: " << hoid << " skipping op, *spos " << *spos
+ dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
<< " <= header.spos " << header->spos << dendl;
return true;
}
}
-int DBObjectMap::list_objects(vector<hobject_t> *out)
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
{
KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
for (iter->seek_to_first(); iter->valid(); iter->next()) {
@@ -1220,7 +1227,7 @@ int DBObjectMap::list_objects(vector<hobject_t> *out)
bufferlist::iterator bliter = bl.begin();
_Header header;
header.decode(bliter);
- out->push_back(header.hoid);
+ out->push_back(header.oid);
}
return 0;
}
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index ba05dff6c6f..459447f9c97 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -26,7 +26,7 @@
* @see user_prefix
* @see sys_prefix
*
- * - HOBJECT_TO_SEQ: Contains leaf mapping from hobject_t->seq and
+ * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->seq and
* corresponding omap header
* - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
* @see State
@@ -66,89 +66,89 @@ public:
* Set of headers currently in use
*/
set<uint64_t> in_use;
- set<hobject_t> map_header_in_use;
+ set<ghobject_t> map_header_in_use;
DBObjectMap(KeyValueDB *db) : db(db),
header_lock("DBOBjectMap")
{}
int set_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const map<string, bufferlist> &set,
const SequencerPosition *spos=0
);
int set_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const bufferlist &bl,
const SequencerPosition *spos=0
);
int get_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *bl
);
int clear(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const SequencerPosition *spos=0
);
int rm_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_clear,
const SequencerPosition *spos=0
);
int get(
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *header,
map<string, bufferlist> *out
);
int get_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
set<string> *keys
);
int get_values(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &keys,
map<string, bufferlist> *out
);
int check_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &keys,
set<string> *out
);
int get_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_get,
map<string, bufferlist> *out
);
int get_all_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
set<string> *out
);
int set_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const map<string, bufferlist> &to_set,
const SequencerPosition *spos=0
);
int remove_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_remove,
const SequencerPosition *spos=0
);
int clone(
- const hobject_t &hoid,
- const hobject_t &target,
+ const ghobject_t &oid,
+ const ghobject_t &target,
const SequencerPosition *spos=0
);
@@ -162,13 +162,13 @@ public:
bool check(std::ostream &out);
/// Ensure that all previous operations are durable
- int sync(const hobject_t *hoid=0, const SequencerPosition *spos=0);
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0);
/// Util, list all objects, there must be no other concurrent access
- int list_objects(vector<hobject_t> *objs ///< [out] objects
+ int list_objects(vector<ghobject_t> *objs ///< [out] objects
);
- ObjectMapIterator get_iterator(const hobject_t &hoid);
+ ObjectMapIterator get_iterator(const ghobject_t &oid);
static const string USER_PREFIX;
static const string XATTR_PREFIX;
@@ -223,7 +223,7 @@ public:
uint64_t num_children;
coll_t c;
- hobject_t hoid;
+ ghobject_t oid;
SequencerPosition spos;
@@ -233,7 +233,7 @@ public:
::encode(parent, bl);
::encode(num_children, bl);
::encode(c, bl);
- ::encode(hoid, bl);
+ ::encode(oid, bl);
::encode(spos, bl);
ENCODE_FINISH(bl);
}
@@ -244,7 +244,7 @@ public:
::decode(parent, bl);
::decode(num_children, bl);
::decode(c, bl);
- ::decode(hoid, bl);
+ ::decode(oid, bl);
if (struct_v >= 2)
::decode(spos, bl);
DECODE_FINISH(bl);
@@ -255,7 +255,7 @@ public:
f->dump_unsigned("parent", parent);
f->dump_unsigned("num_children", num_children);
f->dump_stream("coll") << c;
- f->dump_stream("oid") << hoid;
+ f->dump_stream("oid") << oid;
}
static void generate_test_instances(list<_Header*> &o) {
@@ -269,15 +269,15 @@ public:
};
/// String munging (public for testing)
- static string hobject_key(const hobject_t &hoid);
- static string hobject_key_v0(coll_t c, const hobject_t &hoid);
- static bool parse_hobject_key_v0(const string &in,
- coll_t *c, hobject_t *hoid);
+ static string ghobject_key(const ghobject_t &oid);
+ static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static bool parse_ghobject_key_v0(const string &in,
+ coll_t *c, ghobject_t *oid);
private:
/// Implicit lock on Header->seq
typedef std::tr1::shared_ptr<_Header> Header;
- string map_header_key(const hobject_t &hoid);
+ string map_header_key(const ghobject_t &oid);
string header_key(uint64_t seq);
string complete_prefix(Header header);
string user_prefix(Header header);
@@ -368,40 +368,40 @@ private:
/// Set node containing input to new contents
void set_header(Header input, KeyValueDB::Transaction t);
- /// Remove leaf node corresponding to hoid in c
- void remove_map_header(const hobject_t &hoid,
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(const ghobject_t &oid,
Header header,
KeyValueDB::Transaction t);
- /// Set leaf node for c and hoid to the value of header
- void set_map_header(const hobject_t &hoid, _Header header,
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(const ghobject_t &oid, _Header header,
KeyValueDB::Transaction t);
- /// Set leaf node for c and hoid to the value of header
- bool check_spos(const hobject_t &hoid,
+ /// Set leaf node for c and oid to the value of header
+ bool check_spos(const ghobject_t &oid,
Header header,
const SequencerPosition *spos);
- /// Lookup or create header for c hoid
- Header lookup_create_map_header(const hobject_t &hoid,
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(const ghobject_t &oid,
KeyValueDB::Transaction t);
/**
- * Generate new header for c hoid with new seq number
+ * Generate new header for c oid with new seq number
*
* Has the side effect of synchronously saving the new DBObjectMap state
*/
- Header _generate_new_header(const hobject_t &hoid, Header parent);
- Header generate_new_header(const hobject_t &hoid, Header parent) {
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
Mutex::Locker l(header_lock);
- return _generate_new_header(hoid, parent);
+ return _generate_new_header(oid, parent);
}
- /// Lookup leaf header for c hoid
- Header _lookup_map_header(const hobject_t &hoid);
- Header lookup_map_header(const hobject_t &hoid) {
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(const ghobject_t &oid);
+ Header lookup_map_header(const ghobject_t &oid) {
Mutex::Locker l(header_lock);
- return _lookup_map_header(hoid);
+ return _lookup_map_header(oid);
}
/// Lookup header node for input
@@ -448,12 +448,12 @@ private:
class RemoveMapHeaderOnDelete {
public:
DBObjectMap *db;
- hobject_t obj;
- RemoveMapHeaderOnDelete(DBObjectMap *db, const hobject_t &obj) :
- db(db), obj(obj) {}
+ ghobject_t oid;
+ RemoveMapHeaderOnDelete(DBObjectMap *db, const ghobject_t &oid) :
+ db(db), oid(oid) {}
void operator() (_Header *header) {
Mutex::Locker l(db->header_lock);
- db->map_header_in_use.erase(obj);
+ db->map_header_in_use.erase(oid);
db->map_header_cond.Signal();
delete header;
}
diff --git a/src/os/FDCache.h b/src/os/FDCache.h
index 00e632f3e0f..93557d43c47 100644
--- a/src/os/FDCache.h
+++ b/src/os/FDCache.h
@@ -49,7 +49,7 @@ public:
};
private:
- SharedLRU<hobject_t, FD> registry;
+ SharedLRU<ghobject_t, FD> registry;
CephContext *cct;
public:
@@ -63,16 +63,16 @@ public:
}
typedef std::tr1::shared_ptr<FD> FDRef;
- FDRef lookup(const hobject_t &hoid) {
+ FDRef lookup(const ghobject_t &hoid) {
return registry.lookup(hoid);
}
- FDRef add(const hobject_t &hoid, int fd) {
+ FDRef add(const ghobject_t &hoid, int fd) {
return registry.add(hoid, new FD(fd));
}
/// clear cached fd for hoid, subsequent lookups will get an empty FD
- void clear(const hobject_t &hoid) {
+ void clear(const ghobject_t &hoid) {
registry.clear(hoid);
assert(!registry.lookup(hoid));
}
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 5d9e9d1482d..cd8a8e50658 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -86,6 +86,23 @@ using ceph::crypto::SHA1;
#define REPLAY_GUARD_XATTR "user.cephos.seq"
#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+//Features that this FileStore supports are added here.
+static CompatSet get_fs_supported_compat_set() {
+ CompatSet compat = get_fs_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
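
The two helpers just added draw the line between what a fresh store records on disk (an empty feature set) and what this binary can handle (the SHARDS incompat bit); mount later refuses superblocks whose incompat features this binary lacks. A simplified stand-in for the gating logic, using std::set<std::string> instead of the real CompatSet:

#include <cassert>
#include <set>
#include <string>

struct SimpleCompat {
  std::set<std::string> incompat;  // features a reader must understand

  // True if this (supported) set covers every incompat feature the
  // on-disk superblock recorded, i.e. mounting is safe.
  bool can_mount(const SimpleCompat& on_disk) const {
    for (const std::string& f : on_disk.incompat)
      if (!incompat.count(f))
        return false;
    return true;
  }
};

int main() {
  SimpleCompat supported;
  supported.incompat.insert("sharded objects");
  SimpleCompat on_disk;                   // fresh stores start empty
  assert(supported.can_mount(on_disk));   // old store, new binary: ok
  on_disk.incompat.insert("some future feature");
  assert(!supported.can_mount(on_disk));  // unknown feature: refuse
}
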
void FileStore::FSPerfTracker::update_from_perfcounters(
PerfCounters &logger)
@@ -124,12 +141,12 @@ int FileStore::init_index(coll_t cid)
{
char path[PATH_MAX];
get_cdir(cid, path, sizeof(path));
- int r = index_manager.init_index(cid, path, on_disk_version);
+ int r = index_manager.init_index(cid, path, target_version);
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
-int FileStore::lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path)
+int FileStore::lfn_find(coll_t cid, const ghobject_t& oid, IndexedPath *path)
{
Index index;
int r, exist;
@@ -147,20 +164,25 @@ int FileStore::lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path)
return 0;
}
-int FileStore::lfn_truncate(coll_t cid, const hobject_t& oid, off_t length)
+int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length)
{
IndexedPath path;
- int r = lfn_find(cid, oid, &path);
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd, &path);
if (r < 0)
return r;
- r = ::truncate(path->path(), length);
+ r = ::ftruncate(**fd, length);
if (r < 0)
r = -errno;
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_truncate(**fd, length);
+ assert(rc >= 0);
+ }
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
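
lfn_truncate() now opens the object once and works through the returned fd: ::ftruncate replaces the path-based ::truncate, and the same descriptor feeds the optional sloppy-CRC bookkeeping, so there is no second long-filename resolution between the truncate and the checksum update. The fd-first shape, sketched without the Ceph plumbing:

#include <cerrno>
#include <unistd.h>

// Truncate through an already-open descriptor and return 0 or -errno;
// any per-file checksum state would be updated against this same fd.
int truncate_via_fd(int fd, off_t length) {
  if (::ftruncate(fd, length) < 0)
    return -errno;
  // ... a _crc_update_truncate-style hook would go here ...
  return 0;
}
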
-int FileStore::lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf)
+int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf)
{
IndexedPath path;
int r = lfn_find(cid, oid, &path);
@@ -173,12 +195,13 @@ int FileStore::lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf)
}
int FileStore::lfn_open(coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
bool create,
FDRef *outfd,
IndexedPath *path,
Index *index)
{
+ assert(get_allow_sharded_objects() || oid.shard_id == ghobject_t::NO_SHARD);
assert(outfd);
int flags = O_RDWR;
if (create)
@@ -246,7 +269,7 @@ void FileStore::lfn_close(FDRef fd)
{
}
-int FileStore::lfn_link(coll_t c, coll_t newcid, const hobject_t& o, const hobject_t& newoid)
+int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid)
{
Index index_new, index_old;
IndexedPath path_new, path_old;
@@ -298,7 +321,7 @@ int FileStore::lfn_link(coll_t c, coll_t newcid, const hobject_t& o, const hobje
return 0;
}
-int FileStore::lfn_unlink(coll_t cid, const hobject_t& o,
+int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o,
const SequencerPosition &spos,
bool force_clear_omap)
{
@@ -324,7 +347,8 @@ int FileStore::lfn_unlink(coll_t cid, const hobject_t& o,
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
- force_clear_omap = true;
+ if (st.st_nlink == 1)
+ force_clear_omap = true;
}
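
The guard just added narrows the omap wipe: during unlink, the object's key/value metadata is cleared only when st_nlink shows this is the last remaining hard link, since a clone sharing the inode still needs that omap. The check in isolation (hypothetical helper name):

#include <sys/stat.h>

// True when removing this link should also drop the object's omap:
// only if no other hard link still references the inode.
bool last_link(int fd) {
  struct stat st;
  if (::fstat(fd, &st) < 0)
    return false;          // caller deals with the error separately
  return st.st_nlink == 1;
}
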
if (force_clear_omap) {
dout(20) << __func__ << ": clearing omap on " << o
@@ -396,7 +420,9 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops),
m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes),
m_filestore_do_dump(false),
- m_filestore_dump_fmt(true)
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size)
{
m_filestore_kill_at.set(g_conf->filestore_kill_at);
@@ -446,6 +472,8 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
generic_backend = new GenericFileStoreBackend(this);
backend = generic_backend;
+
+ superblock.compat_features = get_fs_initial_compat_set();
}
FileStore::~FileStore()
@@ -591,6 +619,13 @@ int FileStore::mkfs()
goto close_fsid_fd;
}
+ ret = write_superblock();
+ if (ret < 0) {
+ derr << "mkfs: write_superblock() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
struct statfs basefs;
ret = ::fstatfs(basedir_fd, &basefs);
if (ret < 0) {
@@ -916,6 +951,49 @@ int FileStore::_sanity_check_fs()
return 0;
}
+int FileStore::write_superblock()
+{
+ bufferlist bl;
+ ::encode(superblock, bl);
+ return safe_write_file(basedir.c_str(), "superblock",
+ bl.c_str(), bl.length());
+}
+
+int FileStore::read_superblock()
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "superblock",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ // If the file doesn't exist write initial CompatSet
+ return write_superblock();
+ }
+ return ret;
+ }
+
+ bufferlist bl;
+ bl.push_back(bp);
+ bufferlist::iterator i = bl.begin();
+ ::decode(superblock, i);
+ return 0;
+}
+
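
read_superblock() treats ENOENT as "store predates superblocks" and lazily writes the initial, empty CompatSet instead of failing, which is what lets existing deployments mount after upgrading. A sketch of that read-or-initialize pattern with plain file I/O (assumed names, not the real safe_read_file signature):

#include <cerrno>
#include <fstream>
#include <sstream>
#include <string>

// Read path into *out; if the file cannot be opened (treated as
// missing, a simplification of the ENOENT branch above), create it
// with a default payload and succeed.
int read_or_init(const std::string& path, std::string* out,
                 const std::string& initial) {
  std::ifstream in(path, std::ios::binary);
  if (!in) {
    std::ofstream o(path, std::ios::binary);
    if (!o)
      return -EIO;
    o << initial;
    *out = initial;
    return 0;
  }
  std::ostringstream ss;
  ss << in.rdbuf();
  *out = ss.str();
  return 0;
}
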
+void FileStore::set_allow_sharded_objects()
+{
+ if (!get_allow_sharded_objects()) {
+ superblock.compat_features.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ int ret = write_superblock();
+ assert(ret == 0); //Should we return error and make caller handle it?
+ }
+ return;
+}
+
+bool FileStore::get_allow_sharded_objects()
+{
+ return superblock.compat_features.incompat.contains(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+}
+
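
set_allow_sharded_objects() is a one-way latch with durable-first ordering: the SHARDS incompat bit is persisted via write_superblock() before any sharded object can be written, so a crash can never leave sharded data on disk without the flag that keeps older releases from mounting it. The ordering, reduced to its core (hypothetical names):

#include <functional>

bool sharded_enabled = false;

// Enable a feature only after its on-disk flag is durable; on a
// persist failure the feature stays off.
int enable_feature(const std::function<int()>& persist_flag) {
  if (sharded_enabled)
    return 0;               // already latched, nothing to do
  int r = persist_flag();   // e.g. write_superblock()
  if (r < 0)
    return r;
  sharded_enabled = true;
  return 0;
}
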
int FileStore::update_version_stamp()
{
return write_version_stamp();
@@ -923,25 +1001,19 @@ int FileStore::update_version_stamp()
int FileStore::version_stamp_is_valid(uint32_t *version)
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_RDONLY, 0644);
- if (fd < 0) {
- if (errno == ENOENT)
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT)
return 0;
- else
- return -errno;
+ return ret;
}
- bufferptr bp(PATH_MAX);
- int ret = safe_read(fd, bp.c_str(), bp.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
- return -errno;
bufferlist bl;
bl.push_back(bp);
bufferlist::iterator i = bl.begin();
::decode(*version, i);
- if (*version == on_disk_version)
+ if (*version == target_version)
return 1;
else
return 0;
@@ -949,19 +1021,11 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
int FileStore::write_version_stamp()
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0)
- return -errno;
bufferlist bl;
- ::encode(on_disk_version, bl);
-
- int ret = safe_write(fd, bl.c_str(), bl.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
- return -errno;
- return 0;
+ ::encode(target_version, bl);
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length());
}
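
Both version-stamp paths now delegate to safe_read_file()/safe_write_file() rather than hand-rolled open/safe_read/close pairs, removing the duplicated errno translation (this commit also grows src/common/safe_io). A sketch of what a whole-file write helper of that kind has to guarantee, assuming it returns 0 or -errno and retries short and interrupted writes:

#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>

int write_whole_file(const char* base, const char* name,
                     const char* buf, size_t len) {
  char fn[4096];
  snprintf(fn, sizeof(fn), "%s/%s", base, name);
  int fd = ::open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644);
  if (fd < 0)
    return -errno;
  while (len > 0) {
    ssize_t r = ::write(fd, buf, len);
    if (r < 0) {
      if (errno == EINTR)
        continue;           // retry interrupted writes
      int err = -errno;
      ::close(fd);
      return err;
    }
    buf += r;
    len -= static_cast<size_t>(r);
  }
  ::close(fd);
  return 0;
}
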
int FileStore::read_op_seq(uint64_t *seq)
@@ -1003,6 +1067,7 @@ int FileStore::mount()
char buf[PATH_MAX];
uint64_t initial_op_seq;
set<string> cluster_snaps;
+ CompatSet supported_compat_set = get_fs_supported_compat_set();
dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
@@ -1057,12 +1122,26 @@ int FileStore::mount()
ret = -EINVAL;
derr << "FileStore::mount : stale version stamp " << version_stamp
<< ". Please run the FileStore update script before starting the "
- << "OSD, or set filestore_update_to to " << on_disk_version
+ << "OSD, or set filestore_update_to to " << target_version
<< dendl;
goto close_fsid_fd;
}
}
+ ret = read_superblock();
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // Check if this FileStore supports all the necessary features to mount
+ if (supported_compat_set.compare(superblock.compat_features) == -1) {
+ derr << "FileStore::mount : Incompatible features set "
+ << superblock.compat_features << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
// open some dir handles
basedir_fd = ::open(basedir.c_str(), O_RDONLY);
if (basedir_fd < 0) {
@@ -1710,7 +1789,7 @@ int FileStore::_do_transactions(
for (list<Transaction*>::iterator p = tls.begin();
p != tls.end();
++p, trans_num++) {
- r = _do_transaction(**p, op_seq, trans_num);
+ r = _do_transaction(**p, op_seq, trans_num, handle);
if (r < 0)
break;
if (handle)
@@ -1812,7 +1891,7 @@ void FileStore::_set_replay_guard(coll_t cid,
void FileStore::_set_replay_guard(int fd,
const SequencerPosition& spos,
- const hobject_t *hoid,
+ const ghobject_t *hoid,
bool in_progress)
{
if (backend->can_checkpoint())
@@ -1893,7 +1972,7 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
dout(10) << "_close_replay_guard " << spos << " done" << dendl;
}
-int FileStore::_check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& spos)
+int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos)
{
if (!replaying || backend->can_checkpoint())
return 1;
@@ -1972,7 +2051,9 @@ int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
}
}
-unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_num)
+unsigned FileStore::_do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle)
{
dout(10) << "_do_transaction on " << &t << dendl;
@@ -1980,6 +2061,9 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
SequencerPosition spos(op_seq, trans_num, 0);
while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+
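
Threading the TPHandle into _do_transaction() lets the op loop ping the worker's heartbeat once per operation, so a legitimately long transaction no longer trips the thread pool's timeout checks. The loop shape, abstracted:

struct Heartbeat {
  // Stand-in for TPHandle::reset_tp_timeout(): record progress so a
  // watchdog thread sees the worker is alive.
  void reset() { /* e.g. store the current time for the watchdog */ }
};

template <typename Iter, typename Fn>
void run_ops(Iter first, Iter last, Fn op, Heartbeat* hb) {
  for (; first != last; ++first) {
    if (hb)
      hb->reset();  // one heartbeat per op, as in the loop above
    op(*first);
  }
}
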
int op = i.get_op();
int r = 0;
@@ -1991,7 +2075,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_TOUCH:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _touch(cid, oid);
}
@@ -2000,7 +2084,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_WRITE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
bool replica = i.get_replica();
@@ -2014,7 +2098,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_ZERO:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2035,7 +2119,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_TRUNCATE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _truncate(cid, oid, off);
@@ -2045,7 +2129,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _remove(cid, oid, spos);
}
@@ -2054,7 +2138,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_SETATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
bufferlist bl;
i.get_bl(bl);
@@ -2072,7 +2156,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_SETATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferptr> aset;
i.get_attrset(aset);
if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2085,7 +2169,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_RMATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _rmattr(cid, oid, name.c_str(), spos);
@@ -2095,7 +2179,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_RMATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _rmattrs(cid, oid, spos);
}
@@ -2104,8 +2188,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
r = _clone(cid, oid, noid, spos);
}
break;
@@ -2113,8 +2197,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONERANGE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
r = _clone_range(cid, oid, noid, off, len, off, spos);
@@ -2124,8 +2208,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONERANGE2:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t srcoff = i.get_length();
uint64_t len = i.get_length();
uint64_t dstoff = i.get_length();
@@ -2153,7 +2237,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
{
coll_t ncid = i.get_cid();
coll_t ocid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _collection_add(ncid, ocid, oid, spos);
}
break;
@@ -2161,7 +2245,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_COLL_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _remove(cid, oid, spos);
}
@@ -2172,7 +2256,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
// WARNING: this is deprecated and buggy; only here to replay old journals.
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _collection_add(ocid, ncid, oid, spos);
if (r == 0 &&
(_check_replay_guard(ocid, oid, spos) > 0))
@@ -2183,9 +2267,9 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_COLL_MOVE_RENAME:
{
coll_t oldcid = i.get_cid();
- hobject_t oldoid = i.get_oid();
+ ghobject_t oldoid = i.get_oid();
coll_t newcid = i.get_cid();
- hobject_t newoid = i.get_oid();
+ ghobject_t newoid = i.get_oid();
r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
}
break;
@@ -2225,14 +2309,14 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_CLEAR:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _omap_clear(cid, oid, spos);
}
break;
case Transaction::OP_OMAP_SETKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferlist> aset;
i.get_attrset(aset);
r = _omap_setkeys(cid, oid, aset, spos);
@@ -2241,7 +2325,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_RMKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
set<string> keys;
i.get_keyset(keys);
r = _omap_rmkeys(cid, oid, keys, spos);
@@ -2250,7 +2334,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_RMKEYRANGE:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string first, last;
first = i.get_key();
last = i.get_key();
@@ -2260,7 +2344,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_SETHEADER:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
bufferlist bl;
i.get_bl(bl);
r = _omap_setheader(cid, oid, bl, spos);
@@ -2380,7 +2464,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
// --------------------
// objects
-bool FileStore::exists(coll_t cid, const hobject_t& oid)
+bool FileStore::exists(coll_t cid, const ghobject_t& oid)
{
struct stat st;
if (stat(cid, oid, &st) == 0)
@@ -2390,7 +2474,7 @@ bool FileStore::exists(coll_t cid, const hobject_t& oid)
}
int FileStore::stat(
- coll_t cid, const hobject_t& oid, struct stat *st, bool allow_eio)
+ coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio)
{
int r = lfn_stat(cid, oid, st);
assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
@@ -2412,7 +2496,7 @@ int FileStore::stat(
int FileStore::read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
@@ -2448,6 +2532,17 @@ int FileStore::read(
}
bptr.set_length(got); // properly size the buffer
bl.push_back(bptr); // put it in the target bufferlist
+
+ if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+ ostringstream ss;
+ int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+ if (errors > 0) {
+ dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
+ << got << " ... BAD CRC:\n" << ss.str() << dendl;
+ assert(0 == "bad crc on read");
+ }
+ }
+
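
With sloppy CRC enabled, read() now recomputes checksums over the bytes just returned and asserts on any mismatch, surfacing silent corruption at read time (skipped during journal replay unless the backend can checkpoint). The verify-on-read flow, with a toy checksum standing in for the real block-wise CRC map:

#include <cassert>
#include <cstdint>
#include <string>

// FNV-1a as a stand-in for crc32c; any stable checksum illustrates
// the flow.
uint32_t toy_crc(const std::string& buf) {
  uint32_t h = 2166136261u;
  for (unsigned char c : buf) {
    h ^= c;
    h *= 16777619u;
  }
  return h;
}

void verify_read(const std::string& buf, uint32_t expected) {
  // Mirrors the block above: a mismatch is corruption, fail loudly.
  assert(toy_crc(buf) == expected && "bad crc on read");
}
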
lfn_close(fd);
dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
@@ -2460,7 +2555,7 @@ int FileStore::read(
}
}
-int FileStore::fiemap(coll_t cid, const hobject_t& oid,
+int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
uint64_t offset, size_t len,
bufferlist& bl)
{
@@ -2538,7 +2633,7 @@ done:
}
-int FileStore::_remove(coll_t cid, const hobject_t& oid,
+int FileStore::_remove(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos)
{
dout(15) << "remove " << cid << "/" << oid << dendl;
@@ -2547,7 +2642,7 @@ int FileStore::_remove(coll_t cid, const hobject_t& oid,
return r;
}
-int FileStore::_truncate(coll_t cid, const hobject_t& oid, uint64_t size)
+int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size)
{
dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl;
int r = lfn_truncate(cid, oid, size);
@@ -2556,7 +2651,7 @@ int FileStore::_truncate(coll_t cid, const hobject_t& oid, uint64_t size)
}
-int FileStore::_touch(coll_t cid, const hobject_t& oid)
+int FileStore::_touch(coll_t cid, const ghobject_t& oid)
{
dout(15) << "touch " << cid << "/" << oid << dendl;
@@ -2571,7 +2666,7 @@ int FileStore::_touch(coll_t cid, const hobject_t& oid)
return r;
}
-int FileStore::_write(coll_t cid, const hobject_t& oid,
+int FileStore::_write(coll_t cid, const ghobject_t& oid,
uint64_t offset, size_t len,
const bufferlist& bl, bool replica)
{
@@ -2609,6 +2704,11 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
if (r == 0)
r = bl.length();
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ assert(rc >= 0);
+ }
+
// flush?
if (!replaying &&
g_conf->filestore_wbthrottle_enable)
@@ -2620,7 +2720,7 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
return r;
}
-int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len)
+int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len)
{
dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl;
int ret = 0;
@@ -2640,6 +2740,11 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
ret = -errno;
lfn_close(fd);
+ if (ret >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_zero(**fd, offset, len);
+ assert(rc >= 0);
+ }
+
if (ret == 0)
goto out; // yay!
if (ret != -EOPNOTSUPP)
@@ -2663,7 +2768,7 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
return ret;
}
-int FileStore::_clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
const SequencerPosition& spos)
{
dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
@@ -2793,11 +2898,15 @@ int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, u
break;
pos += r;
}
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ assert(rc >= 0);
+ }
dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
return r;
}
-int FileStore::_clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
uint64_t srcoff, uint64_t len, uint64_t dstoff,
const SequencerPosition& spos)
{
@@ -3238,23 +3347,23 @@ int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
}
// debug EIO injection
-void FileStore::inject_data_error(const hobject_t &oid) {
+void FileStore::inject_data_error(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": init error on " << oid << dendl;
data_error_set.insert(oid);
}
-void FileStore::inject_mdata_error(const hobject_t &oid) {
+void FileStore::inject_mdata_error(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": init error on " << oid << dendl;
mdata_error_set.insert(oid);
}
-void FileStore::debug_obj_on_delete(const hobject_t &oid) {
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": clear error on " << oid << dendl;
data_error_set.erase(oid);
mdata_error_set.erase(oid);
}
-bool FileStore::debug_data_eio(const hobject_t &oid) {
+bool FileStore::debug_data_eio(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
if (data_error_set.count(oid)) {
dout(10) << __func__ << ": inject error on " << oid << dendl;
@@ -3263,7 +3372,7 @@ bool FileStore::debug_data_eio(const hobject_t &oid) {
return false;
}
}
-bool FileStore::debug_mdata_eio(const hobject_t &oid) {
+bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
if (mdata_error_set.count(oid)) {
dout(10) << __func__ << ": inject error on " << oid << dendl;
@@ -3276,7 +3385,7 @@ bool FileStore::debug_mdata_eio(const hobject_t &oid) {
// objects
-int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr &bp)
+int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp)
{
dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl;
FDRef fd;
@@ -3322,7 +3431,7 @@ int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, buffe
}
}
-int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only)
+int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only)
{
dout(15) << "getattrs " << cid << "/" << oid << dendl;
FDRef fd;
@@ -3381,7 +3490,7 @@ int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>&
}
}
-int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset,
+int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
const SequencerPosition &spos)
{
map<string, bufferlist> omap_set;
@@ -3466,7 +3575,7 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
-int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
+int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
const SequencerPosition &spos)
{
dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl;
@@ -3501,7 +3610,7 @@ int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
return r;
}
-int FileStore::_rmattrs(coll_t cid, const hobject_t& oid,
+int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos)
{
dout(15) << "rmattrs " << cid << "/" << oid << dendl;
@@ -3697,14 +3806,14 @@ int FileStore::_collection_remove_recursive(const coll_t &cid,
return r;
}
- vector<hobject_t> objects;
- hobject_t max;
+ vector<ghobject_t> objects;
+ ghobject_t max;
r = 0;
while (!max.is_max()) {
r = collection_list_partial(cid, max, 200, 300, 0, &objects, &max);
if (r < 0)
return r;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
assert(_check_replay_guard(cid, *i, spos));
@@ -3776,7 +3885,7 @@ int FileStore::collection_version_current(coll_t c, uint32_t *version)
if (r < 0)
return r;
*version = index->collection_version();
- if (*version == on_disk_version)
+ if (*version == target_version)
return 1;
else
return 0;
@@ -3869,9 +3978,9 @@ bool FileStore::collection_empty(coll_t c)
int r = get_index(c, &index);
if (r < 0)
return false;
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
collection_list_handle_t handle;
- r = index->collection_list_partial(hobject_t(), 1, 1, 0, &ls, NULL);
+ r = index->collection_list_partial(ghobject_t(), 1, 1, 0, &ls, NULL);
if (r < 0) {
assert(!m_filestore_fail_eio || r != -EIO);
return false;
@@ -3879,14 +3988,14 @@ bool FileStore::collection_empty(coll_t c)
return ls.empty();
}
-int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls)
+int FileStore::collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls)
{
bool done = false;
- hobject_t next = start;
+ ghobject_t next = start;
while (!done) {
- vector<hobject_t> next_objects;
+ vector<ghobject_t> next_objects;
int r = collection_list_partial(c, next,
get_ideal_list_min(), get_ideal_list_max(),
seq, &next_objects, &next);
@@ -3913,10 +4022,11 @@ int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
return 0;
}
-int FileStore::collection_list_partial(coll_t c, hobject_t start,
+int FileStore::collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t seq,
- vector<hobject_t> *ls, hobject_t *next)
+ vector<ghobject_t> *ls, ghobject_t *next)
{
+ dout(10) << "collection_list_partial: " << c << dendl;
Index index;
int r = get_index(c, &index);
if (r < 0)
@@ -3928,10 +4038,12 @@ int FileStore::collection_list_partial(coll_t c, hobject_t start,
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
+ if (ls)
+ dout(20) << "objects: " << *ls << dendl;
return 0;
}
-int FileStore::collection_list(coll_t c, vector<hobject_t>& ls)
+int FileStore::collection_list(coll_t c, vector<ghobject_t>& ls)
{
Index index;
int r = get_index(c, &index);
@@ -3942,7 +4054,7 @@ int FileStore::collection_list(coll_t c, vector<hobject_t>& ls)
return r;
}
-int FileStore::omap_get(coll_t c, const hobject_t &hoid,
+int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
bufferlist *header,
map<string, bufferlist> *out)
{
@@ -3961,7 +4073,7 @@ int FileStore::omap_get(coll_t c, const hobject_t &hoid,
int FileStore::omap_get_header(
coll_t c,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
bufferlist *bl,
bool allow_eio)
{
@@ -3978,7 +4090,7 @@ int FileStore::omap_get_header(
return 0;
}
-int FileStore::omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys)
+int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
IndexedPath path;
@@ -3993,7 +4105,7 @@ int FileStore::omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys)
return 0;
}
-int FileStore::omap_get_values(coll_t c, const hobject_t &hoid,
+int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid,
const set<string> &keys,
map<string, bufferlist> *out)
{
@@ -4010,7 +4122,7 @@ int FileStore::omap_get_values(coll_t c, const hobject_t &hoid,
return 0;
}
-int FileStore::omap_check_keys(coll_t c, const hobject_t &hoid,
+int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
const set<string> &keys,
set<string> *out)
{
@@ -4028,7 +4140,7 @@ int FileStore::omap_check_keys(coll_t c, const hobject_t &hoid,
}
ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
- const hobject_t &hoid)
+ const ghobject_t &hoid)
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
IndexedPath path;
@@ -4099,7 +4211,7 @@ int FileStore::_destroy_collection(coll_t c)
}
-int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
+int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o,
const SequencerPosition& spos)
{
dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
@@ -4147,8 +4259,8 @@ int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
return r;
}
-int FileStore::_collection_move_rename(coll_t oldcid, const hobject_t& oldoid,
- coll_t c, const hobject_t& o,
+int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
const SequencerPosition& spos)
{
dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
@@ -4227,7 +4339,7 @@ void FileStore::_inject_failure()
}
}
-int FileStore::_omap_clear(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
IndexedPath path;
@@ -4240,7 +4352,7 @@ int FileStore::_omap_clear(coll_t cid, const hobject_t &hoid,
return 0;
}
-int FileStore::_omap_setkeys(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
const map<string, bufferlist> &aset,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
@@ -4251,7 +4363,7 @@ int FileStore::_omap_setkeys(coll_t cid, const hobject_t &hoid,
return object_map->set_keys(hoid, aset, &spos);
}
-int FileStore::_omap_rmkeys(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
const set<string> &keys,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
@@ -4265,7 +4377,7 @@ int FileStore::_omap_rmkeys(coll_t cid, const hobject_t &hoid,
return 0;
}
-int FileStore::_omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid,
const string& first, const string& last,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
@@ -4282,7 +4394,7 @@ int FileStore::_omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
return _omap_rmkeys(cid, hoid, keys, spos);
}
-int FileStore::_omap_setheader(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
const bufferlist &bl,
const SequencerPosition &spos)
{
@@ -4342,8 +4454,8 @@ int FileStore::_split_collection(coll_t cid,
_close_replay_guard(dest, spos);
}
if (g_conf->filestore_debug_verify_split) {
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
while (1) {
collection_list_partial(
cid,
@@ -4353,7 +4465,7 @@ int FileStore::_split_collection(coll_t cid,
&next);
if (objects.empty())
break;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
dout(20) << __func__ << ": " << *i << " still in source "
@@ -4362,7 +4474,7 @@ int FileStore::_split_collection(coll_t cid,
}
objects.clear();
}
- next = hobject_t();
+ next = ghobject_t();
while (1) {
collection_list_partial(
dest,
@@ -4372,7 +4484,7 @@ int FileStore::_split_collection(coll_t cid,
&next);
if (objects.empty())
break;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
dout(20) << __func__ << ": " << *i << " now in dest "
@@ -4438,6 +4550,8 @@ const char** FileStore::get_tracked_conf_keys() const
"filestore_kill_at",
"filestore_fail_eio",
"filestore_replica_fadvise",
+ "filestore_sloppy_crc",
+ "filestore_sloppy_crc_block_size",
NULL
};
return KEYS;
@@ -4454,6 +4568,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
changed.count("filestore_queue_committing_max_bytes") ||
changed.count("filestore_kill_at") ||
changed.count("filestore_fail_eio") ||
+ changed.count("filestore_sloppy_crc") ||
+ changed.count("filestore_sloppy_crc_block_size") ||
changed.count("filestore_replica_fadvise")) {
Mutex::Locker l(lock);
m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
@@ -4465,6 +4581,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
m_filestore_kill_at.set(conf->filestore_kill_at);
m_filestore_fail_eio = conf->filestore_fail_eio;
m_filestore_replica_fadvise = conf->filestore_replica_fadvise;
+ m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+ m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
}
if (changed.count("filestore_commit_timeout")) {
Mutex::Locker l(sync_entry_timeo_lock);
@@ -4520,3 +4638,39 @@ void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t
m_filestore_dump_fmt.flush(m_filestore_dump);
m_filestore_dump.flush();
}
+
+// -- FSSuperblock --
+
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ compat_features.encode(bl);
+ ENCODE_FINISH(bl);
+}
+
+void FSSuperblock::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ compat_features.decode(bl);
+ DECODE_FINISH(bl);
+}
+
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->close_section();
+}
+
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+ FSSuperblock z;
+ o.push_back(new FSSuperblock(z));
+ CompatSet::FeatureSet feature_compat;
+ CompatSet::FeatureSet feature_ro_compat;
+ CompatSet::FeatureSet feature_incompat;
+ feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ z.compat_features = CompatSet(feature_compat, feature_ro_compat,
+ feature_incompat);
+ o.push_back(new FSSuperblock(z));
+}
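
FSSuperblock uses the standard ENCODE_START/DECODE_START envelope, so the structure carries a version byte, a compat version, and a length up front; a future version-2 superblock with extra fields can still be skipped or partially decoded by version-1 readers. The envelope idea with toy little-endian encoders (not the real bufferlist API):

#include <cstdint>
#include <string>

void put_u8(std::string& out, uint8_t v) {
  out.push_back(static_cast<char>(v));
}
void put_u32(std::string& out, uint32_t v) {
  out.append(reinterpret_cast<const char*>(&v), sizeof(v));  // host LE assumed
}

// Wrap a payload the way ENCODE_START(1, 1, bl)/ENCODE_FINISH do:
// (version, oldest compatible version, payload length) then payload,
// so an old decoder can skip fields it does not understand.
void encode_with_envelope(std::string& out, const std::string& payload) {
  put_u8(out, 1);                                       // struct version
  put_u8(out, 1);                                       // compat version
  put_u32(out, static_cast<uint32_t>(payload.size()));  // skippable length
  out += payload;
}
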
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index 4f58df4d698..fdab0ece34f 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -66,6 +66,26 @@ static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1);
class FileStoreBackend;
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+class FSSuperblock {
+public:
+ CompatSet compat_features;
+
+ FSSuperblock() { }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{
+ return out << "sb(" << sb.compat_features << ")";
+}
+
class FileStore : public JournalingObjectStore,
public md_config_obs_t
{
@@ -89,7 +109,7 @@ public:
return perf_tracker.get_cur_stats();
}
- static const uint32_t on_disk_version = 3;
+ static const uint32_t target_version = 3;
private:
string internal_name; ///< internal name, used to name the perfcounter instance
string basedir, journalpath;
@@ -281,25 +301,26 @@ private:
void op_queue_release_throttle(Op *o);
void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
friend struct C_JournaledAhead;
+ int write_version_stamp();
int open_journal();
PerfCounters *logger;
public:
- int lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path);
- int lfn_truncate(coll_t cid, const hobject_t& oid, off_t length);
- int lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf);
+ int lfn_find(coll_t cid, const ghobject_t& oid, IndexedPath *path);
+ int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf);
int lfn_open(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
bool create,
FDRef *outfd,
IndexedPath *path = 0,
Index *index = 0);
void lfn_close(FDRef fd);
- int lfn_link(coll_t c, coll_t newcid, const hobject_t& o, const hobject_t& newoid) ;
- int lfn_unlink(coll_t cid, const hobject_t& o, const SequencerPosition &spos,
+ int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid);
+ int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos,
bool force_clear_omap=false);
public:
@@ -310,7 +331,6 @@ public:
int _sanity_check_fs();
bool test_mount_in_use();
- int write_version_stamp();
int version_stamp_is_valid(uint32_t *version);
int update_version_stamp();
int read_op_seq(uint64_t *seq);
@@ -321,6 +341,22 @@ public:
int mkfs();
int mkjournal();
+ /**
+ * set_allow_sharded_objects()
+ *
+ * This function must be called before a sharded ghobject_t can be specified.
+ *
+ * Once it has been called, the FileStore is no longer mountable by prior releases.
+ */
+ void set_allow_sharded_objects();
+
+ /**
+ * get_allow_sharded_objects()
+ *
+ * return value: true if set_allow_sharded_objects() has been called, otherwise false
+ */
+ bool get_allow_sharded_objects();
+
int statfs(struct statfs *buf);
int _do_transactions(
@@ -329,7 +365,9 @@ public:
int do_transactions(list<Transaction*> &tls, uint64_t op_seq) {
return _do_transactions(tls, op_seq, 0);
}
- unsigned _do_transaction(Transaction& t, uint64_t op_seq, int trans_num);
+ unsigned _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle);
int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
TrackedOpRef op = TrackedOpRef());
@@ -345,7 +383,7 @@ public:
*/
void _set_replay_guard(int fd,
const SequencerPosition& spos,
- const hobject_t *hoid=0,
+ const ghobject_t *oid=0,
bool in_progress=false);
void _set_replay_guard(coll_t cid,
const SequencerPosition& spos,
@@ -375,42 +413,42 @@ public:
*/
int _check_replay_guard(int fd, const SequencerPosition& spos);
int _check_replay_guard(coll_t cid, const SequencerPosition& spos);
- int _check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& pos);
+ int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos);
int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos);
// ------------------
// objects
- int pick_object_revision_lt(hobject_t& oid) {
+ int pick_object_revision_lt(ghobject_t& oid) {
return 0;
}
- bool exists(coll_t cid, const hobject_t& oid);
+ bool exists(coll_t cid, const ghobject_t& oid);
int stat(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
struct stat *st,
bool allow_eio = false);
int read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
bool allow_eio = false);
- int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
+ int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
- int _touch(coll_t cid, const hobject_t& oid);
- int _write(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
+ int _touch(coll_t cid, const ghobject_t& oid);
+ int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
bool replica = false);
- int _zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len);
- int _truncate(coll_t cid, const hobject_t& oid, uint64_t size);
- int _clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+ int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size);
+ int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
const SequencerPosition& spos);
- int _clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+ int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
uint64_t srcoff, uint64_t len, uint64_t dstoff,
const SequencerPosition& spos);
int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
- int _remove(coll_t cid, const hobject_t& oid, const SequencerPosition &spos);
+ int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos);
int _fgetattr(int fd, const char *name, bufferptr& bp);
int _fgetattrs(int fd, map<string,bufferptr>& aset, bool user_only);
@@ -434,25 +472,25 @@ public:
// DEBUG read error injection, an object is removed from both on delete()
Mutex read_error_lock;
- set<hobject_t> data_error_set; // read() will return -EIO
- set<hobject_t> mdata_error_set; // getattr(),stat() will return -EIO
- void inject_data_error(const hobject_t &oid);
- void inject_mdata_error(const hobject_t &oid);
- void debug_obj_on_delete(const hobject_t &oid);
- bool debug_data_eio(const hobject_t &oid);
- bool debug_mdata_eio(const hobject_t &oid);
+ set<ghobject_t> data_error_set; // read() will return -EIO
+ set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid);
+ void inject_mdata_error(const ghobject_t &oid);
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
int snapshot(const string& name);
// attrs
- int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr &bp);
- int getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only = false);
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp);
+ int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false);
- int _setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset,
+ int _setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
const SequencerPosition &spos);
- int _rmattr(coll_t cid, const hobject_t& oid, const char *name,
+ int _rmattr(coll_t cid, const ghobject_t& oid, const char *name,
const SequencerPosition &spos);
- int _rmattrs(coll_t cid, const hobject_t& oid,
+ int _rmattrs(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos);
int collection_getattr(coll_t c, const char *name, void *value, size_t size);
@@ -473,35 +511,35 @@ public:
int collection_stat(coll_t c, struct stat *st);
bool collection_exists(coll_t c);
bool collection_empty(coll_t c);
- int collection_list(coll_t c, vector<hobject_t>& o);
- int collection_list_partial(coll_t c, hobject_t start,
+ int collection_list(coll_t c, vector<ghobject_t>& oid);
+ int collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t snap,
- vector<hobject_t> *ls, hobject_t *next);
- int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls);
+ vector<ghobject_t> *ls, ghobject_t *next);
+ int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls);
// omap (see ObjectStore.h for documentation)
- int omap_get(coll_t c, const hobject_t &hoid, bufferlist *header,
+ int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header,
map<string, bufferlist> *out);
int omap_get_header(
coll_t c,
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *out,
bool allow_eio = false);
- int omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys);
- int omap_get_values(coll_t c, const hobject_t &hoid, const set<string> &keys,
+ int omap_get_keys(coll_t c, const ghobject_t &oid, set<string> *keys);
+ int omap_get_values(coll_t c, const ghobject_t &oid, const set<string> &keys,
map<string, bufferlist> *out);
- int omap_check_keys(coll_t c, const hobject_t &hoid, const set<string> &keys,
+ int omap_check_keys(coll_t c, const ghobject_t &oid, const set<string> &keys,
set<string> *out);
- ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const hobject_t &hoid);
+ ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid);
int _create_collection(coll_t c);
int _create_collection(coll_t c, const SequencerPosition &spos);
int _destroy_collection(coll_t c);
- int _collection_add(coll_t c, coll_t ocid, const hobject_t& o,
+ int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
const SequencerPosition& spos);
- int _collection_move_rename(coll_t oldcid, const hobject_t& oldoid,
- coll_t c, const hobject_t& o,
+ int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
const SequencerPosition& spos);
void dump_start(const std::string& file);
void dump_stop();
@@ -511,17 +549,17 @@ private:
void _inject_failure();
// omap
- int _omap_clear(coll_t cid, const hobject_t &hoid,
+ int _omap_clear(coll_t cid, const ghobject_t &oid,
const SequencerPosition &spos);
- int _omap_setkeys(coll_t cid, const hobject_t &hoid,
+ int _omap_setkeys(coll_t cid, const ghobject_t &oid,
const map<string, bufferlist> &aset,
const SequencerPosition &spos);
- int _omap_rmkeys(coll_t cid, const hobject_t &hoid, const set<string> &keys,
+ int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set<string> &keys,
const SequencerPosition &spos);
- int _omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
+ int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
const string& first, const string& last,
const SequencerPosition &spos);
- int _omap_setheader(coll_t cid, const hobject_t &hoid, const bufferlist &bl,
+ int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl,
const SequencerPosition &spos);
int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest,
const SequencerPosition &spos);
@@ -553,6 +591,27 @@ private:
std::ofstream m_filestore_dump;
JSONFormatter m_filestore_dump_fmt;
atomic_t m_filestore_kill_at;
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+ * Write superblock to persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
friend class FileStoreBackend;
};
@@ -586,6 +645,9 @@ protected:
int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
}
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
public:
FileStoreBackend(FileStore *fs) : filestore(fs) {}
virtual ~FileStoreBackend() {};
@@ -601,6 +663,15 @@ public:
virtual bool has_fiemap() = 0;
virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) = 0;
};
#endif
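The _crc_update_*/_crc_verify_read hooks declared above route every backend write, truncate, zero, clone_range, and read through a per-file SloppyCRCMap (added elsewhere in this merge). A minimal in-tree sketch of that flow, using only the SloppyCRCMap calls that appear in this patch; the 64 KB block size and the payload are made-up values:

  // Sketch: per-extent CRC tracking, mirroring GenericFileStoreBackend below.
  // Assumes the in-tree bufferlist typedefs from include/buffer.h.
  #include "common/SloppyCRCMap.h"
  #include "include/buffer.h"
  #include <iostream>
  #include <sstream>

  void sloppy_crc_sketch()
  {
    SloppyCRCMap scm(65536);             // cf. m_filestore_sloppy_crc_block_size
    bufferlist bl;
    bl.append("payload");
    std::ostringstream ss;
    scm.write(0, bl.length(), bl, &ss);  // record CRCs for the written extent
    // Read-back verification; a return of 0 is taken here to mean no CRC
    // mismatches were found (assumption, based on _crc_verify_read's use).
    int errors = scm.read(0, bl.length(), bl, &std::cout);
    (void)errors;
  }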
diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc
index db46750e411..d4644abc627 100644
--- a/src/os/FlatIndex.cc
+++ b/src/os/FlatIndex.cc
@@ -134,18 +134,18 @@ static void lfn_translate(const char *path, const char *name, char *new_name, in
return;
}
-static int append_oname(const hobject_t &oid, char *s, int len)
+static int append_oname(const ghobject_t &oid, char *s, int len)
{
//assert(sizeof(oid) == 28);
char *end = s + len;
char *t = s + strlen(s);
- const char *i = oid.oid.name.c_str();
+ const char *i = oid.hobj.oid.name.c_str();
while (*i && t < end) {
if (*i == '\\') {
*t++ = '\\';
*t++ = '\\';
- } else if (*i == '.' && i == oid.oid.name.c_str()) { // only escape leading .
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
*t++ = '\\';
*t++ = '.';
} else if (*i == '/') {
@@ -158,17 +158,17 @@ static int append_oname(const hobject_t &oid, char *s, int len)
int size = t - s;
- if (oid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
size += snprintf(t, end - t, "_head");
- else if (oid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
size += snprintf(t, end - t, "_snapdir");
else
- size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.snap);
+ size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
return size;
}
-static bool parse_object(char *s, hobject_t& oid)
+static bool parse_object(char *s, ghobject_t& oid)
{
sobject_t o;
char *bar = s + strlen(s) - 1;
@@ -201,13 +201,13 @@ static bool parse_object(char *s, hobject_t& oid)
o.snap = CEPH_SNAPDIR;
else
o.snap = strtoull(bar+1, &s, 16);
- oid = hobject_t(o);
+ oid = ghobject_t(hobject_t(o));
return true;
}
return false;
}
-static int lfn_get(const char *coll_path, const hobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
+static int lfn_get(const char *coll_path, const ghobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
{
int i = 0;
strncpy(pathname, coll_path, len);
@@ -277,7 +277,7 @@ int FlatIndex::init() {
return 0;
}
-int FlatIndex::created(const hobject_t &hoid, const char *path) {
+int FlatIndex::created(const ghobject_t &hoid, const char *path) {
char long_name[PATH_MAX];
long_name[0] = '\0';
int actual_len = append_oname(hoid, long_name, sizeof(long_name));
@@ -292,7 +292,7 @@ int FlatIndex::created(const hobject_t &hoid, const char *path) {
return 0;
}
-int FlatIndex::unlink(const hobject_t &o) {
+int FlatIndex::unlink(const ghobject_t &o) {
char long_fn[PATH_MAX];
char short_fn[PATH_MAX];
char short_fn2[PATH_MAX];
@@ -346,7 +346,7 @@ int FlatIndex::unlink(const hobject_t &o) {
return 0;
}
-int FlatIndex::lookup(const hobject_t &hoid, IndexedPath *path, int *exist) {
+int FlatIndex::lookup(const ghobject_t &hoid, IndexedPath *path, int *exist) {
char long_fn[PATH_MAX];
char short_fn[PATH_MAX];
int r;
@@ -361,7 +361,7 @@ int FlatIndex::lookup(const hobject_t &hoid, IndexedPath *path, int *exist) {
}
static int get_hobject_from_oinfo(const char *dir, const char *file,
- hobject_t *o) {
+ ghobject_t *o) {
char path[PATH_MAX];
bufferptr bp(PATH_MAX);
snprintf(path, sizeof(path), "%s/%s", dir, file);
@@ -376,17 +376,17 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
return 0;
}
-int FlatIndex::collection_list_partial(const hobject_t &start,
+int FlatIndex::collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next) {
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
assert(0); // Should not be called
return 0;
}
-int FlatIndex::collection_list(vector<hobject_t> *ls) {
+int FlatIndex::collection_list(vector<ghobject_t> *ls) {
char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
char dir_name[PATH_MAX], new_name[PATH_MAX];
strncpy(dir_name, base_path.c_str(), sizeof(dir_name));
@@ -397,7 +397,7 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
return -errno;
// first, build (ino, object) list
- vector< pair<ino_t,hobject_t> > inolist;
+ vector< pair<ino_t,ghobject_t> > inolist;
struct dirent *de;
while (::readdir_r(dir, (struct dirent *)buf, &de) == 0) {
@@ -407,11 +407,11 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
if (de->d_name[0] == '.')
continue;
//cout << " got object " << de->d_name << std::endl;
- hobject_t o;
+ ghobject_t o;
lfn_translate(dir_name, de->d_name, new_name, sizeof(new_name));
if (parse_object(new_name, o)) {
get_hobject_from_oinfo(dir_name, de->d_name, &o);
- inolist.push_back(pair<ino_t,hobject_t>(de->d_ino, o));
+ inolist.push_back(pair<ino_t,ghobject_t>(de->d_ino, o));
ls->push_back(o);
}
}
@@ -422,7 +422,7 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
// build final list
ls->resize(inolist.size());
int i = 0;
- for (vector< pair<ino_t,hobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
+ for (vector< pair<ino_t,ghobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
(*ls)[i++].swap(p->second);
::closedir(dir);
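For orientation, the name mangling in append_oname/parse_object above is small: backslashes are doubled, only a leading '.' is escaped, and the snapshot id becomes a suffix. Hand-derived examples from the code above (not captured from a running system):

  "foo"  at CEPH_NOSNAP   ->  foo_head
  "foo"  at CEPH_SNAPDIR  ->  foo_snapdir
  "foo"  at snap 0x12     ->  foo_12
  ".bar" at CEPH_NOSNAP   ->  \.bar_head     (leading '.' escaped)
  "a\b"  at CEPH_NOSNAP   ->  a\\b_head      (backslash doubled)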
diff --git a/src/os/FlatIndex.h b/src/os/FlatIndex.h
index 7a10912dc28..657c273468b 100644
--- a/src/os/FlatIndex.h
+++ b/src/os/FlatIndex.h
@@ -52,35 +52,35 @@ public:
/// @see CollectionIndex
int created(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const char *path
);
/// @see CollectionIndex
int unlink(
- const hobject_t &hoid
+ const ghobject_t &oid
);
/// @see CollectionIndex
int lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
IndexedPath *path,
int *exist
);
/// @see CollectionIndex
int collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
/// @see CollectionIndex
int collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
};
diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc
index 461158fdfab..81d896a0943 100644
--- a/src/os/GenericFileStoreBackend.cc
+++ b/src/os/GenericFileStoreBackend.cc
@@ -40,6 +40,12 @@
#include "common/config.h"
#include "common/sync_filesystem.h"
+#include "common/SloppyCRCMap.h"
+#include "os/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
#define dout_subsys ceph_subsys_filestore
#undef dout_prefix
#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
@@ -251,3 +257,110 @@ done_err:
free(fiemap);
return ret;
}
+
+
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+ char buf[100];
+ bufferptr bp;
+ int r = 0;
+ int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+ if (l == -ENODATA) {
+ return 0;
+ }
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), buf, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+ }
+ }
+ bufferlist bl;
+ bl.append(bp);
+ bufferlist::iterator p = bl.begin();
+ try {
+ ::decode(*cm, p);
+ }
+ catch (buffer::error &e) {
+ r = -EIO;
+ }
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+ bufferlist bl;
+ ::encode(*cm, bl);
+ int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm.write(off, len, bl, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.truncate(off);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.zero(off, len);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff)
+{
+ SloppyCRCMap scm_src(get_crc_block_size());
+ SloppyCRCMap scm_dst(get_crc_block_size());
+ int r = _crc_load_or_init(srcfd, &scm_src);
+ if (r < 0)
+ return r;
+ r = _crc_load_or_init(destfd, &scm_dst);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(destfd, &scm_dst);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ return scm.read(off, len, bl, out);
+}
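_crc_load_or_init above follows the usual two-phase xattr read: try a small stack buffer first, and on -ERANGE probe for the attribute's real size and retry. The same idiom in isolation, as a sketch built on the chain_fgetxattr helper from os/chain_xattr.h included at the top of this file:

  #include "include/buffer.h"
  #include "os/chain_xattr.h"
  #include <cerrno>
  #include <cstring>

  // Sketch: read an xattr of unknown size into a freshly sized bufferptr.
  int read_whole_xattr(int fd, const char *name, bufferptr *out)
  {
    char buf[100];
    int l = chain_fgetxattr(fd, name, buf, sizeof(buf));
    if (l >= 0) {                        // value fit in the small buffer
      *out = buffer::create(l);
      memcpy(out->c_str(), buf, l);
      return l;
    }
    if (l == -ERANGE) {                  // too large: ask for the size, retry
      l = chain_fgetxattr(fd, name, 0, 0);
      if (l > 0) {
        *out = buffer::create(l);
        l = chain_fgetxattr(fd, name, out->c_str(), l);
      }
    }
    return l;                            // e.g. -ENODATA propagates unchanged
  }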
diff --git a/src/os/GenericFileStoreBackend.h b/src/os/GenericFileStoreBackend.h
index 95aca971708..5a09c2497a8 100644
--- a/src/os/GenericFileStoreBackend.h
+++ b/src/os/GenericFileStoreBackend.h
@@ -17,6 +17,8 @@
#include "FileStore.h"
+class SloppyCRCMap;
+
class GenericFileStoreBackend : public FileStoreBackend {
private:
bool ioctl_fiemap;
@@ -25,6 +27,7 @@ private:
public:
GenericFileStoreBackend(FileStore *fs);
virtual ~GenericFileStoreBackend() {};
+
virtual int detect_features();
virtual int create_current();
virtual bool can_checkpoint() { return false; };
@@ -39,5 +42,17 @@ public:
virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
return _copy_range(from, to, srcoff, len, dstoff);
}
+
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+ int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl);
+ virtual int _crc_update_truncate(int fd, loff_t off);
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len);
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff);
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out);
};
#endif
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index c279bab3a60..ea50cd038ca 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -66,7 +66,7 @@ int HashIndex::reset_attr(
return r;
if (!exists)
return 0;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
set<string> subdirs;
r = list_objects(path, 0, 0, &objects);
if (r < 0)
@@ -98,7 +98,7 @@ int HashIndex::col_split_level(
int r = from.list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
r = from.list_objects(path, 0, 0, &objects);
if (r < 0)
return r;
@@ -134,8 +134,8 @@ int HashIndex::col_split_level(
}
/* Then, do the same for each object */
- map<string, hobject_t> objs_to_move;
- for (map<string, hobject_t>::iterator i = objects.begin();
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
if (i->second.match(inbits, match)) {
@@ -199,7 +199,7 @@ int HashIndex::col_split_level(
return r;
}
- for (map<string, hobject_t>::iterator i = objs_to_move.begin();
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
i != objs_to_move.end();
++i) {
from_info.objs--;
@@ -244,7 +244,7 @@ int HashIndex::_init() {
/* LFNIndex virtual method implementations */
int HashIndex::_created(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name) {
subdir_info_s info;
int r;
@@ -267,10 +267,10 @@ int HashIndex::_created(const vector<string> &path,
}
int HashIndex::_remove(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name) {
int r;
- r = remove_object(path, hoid);
+ r = remove_object(path, oid);
if (r < 0)
return r;
subdir_info_s info;
@@ -291,12 +291,12 @@ int HashIndex::_remove(const vector<string> &path,
}
}
-int HashIndex::_lookup(const hobject_t &hoid,
+int HashIndex::_lookup(const ghobject_t &oid,
vector<string> *path,
string *mangled_name,
int *exists_out) {
vector<string> path_comp;
- get_path_components(hoid, &path_comp);
+ get_path_components(oid, &path_comp);
vector<string>::iterator next = path_comp.begin();
int exists;
while (1) {
@@ -313,22 +313,22 @@ int HashIndex::_lookup(const hobject_t &hoid,
break;
path->push_back(*(next++));
}
- return get_mangled_name(*path, hoid, mangled_name, exists_out);
+ return get_mangled_name(*path, oid, mangled_name, exists_out);
}
-int HashIndex::_collection_list(vector<hobject_t> *ls) {
+int HashIndex::_collection_list(vector<ghobject_t> *ls) {
vector<string> path;
return list_by_hash(path, 0, 0, 0, 0, ls);
}
-int HashIndex::_collection_list_partial(const hobject_t &start,
+int HashIndex::_collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next) {
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
vector<string> path;
- hobject_t _next;
+ ghobject_t _next;
if (!next)
next = &_next;
*next = start;
@@ -345,7 +345,7 @@ int HashIndex::recursive_remove(const vector<string> &path) {
int r = list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
r = list_objects(path, 0, 0, &objects);
if (r < 0)
return r;
@@ -475,7 +475,7 @@ int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
int level = info.hash_level;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
vector<string> dst = path;
int r;
dst.push_back("");
@@ -486,17 +486,17 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
r = list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, map<string, hobject_t> > mapped;
- map<string, hobject_t> moved;
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
int num_moved = 0;
- for (map<string, hobject_t>::iterator i = objects.begin();
+ for (map<string, ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
vector<string> new_path;
get_path_components(i->second, &new_path);
mapped[new_path[level]][i->first] = i->second;
}
- for (map<string, map<string, hobject_t> >::iterator i = mapped.begin();
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
i != mapped.end();
) {
dst[level] = i->first;
@@ -505,7 +505,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
subdir_info_s temp;
// subdir has already been fully copied
if (subdirs.count(i->first) && !get_info(dst, &temp)) {
- for (map<string, hobject_t>::iterator j = i->second.begin();
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
j != i->second.end();
++j) {
moved[j->first] = j->second;
@@ -533,7 +533,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
return r;
} // else subdir has been created but only partially copied
- for (map<string, hobject_t>::iterator j = i->second.begin();
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
j != i->second.end();
++j) {
moved[j->first] = j->second;
@@ -574,12 +574,12 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
return end_split_or_merge(path);
}
-void HashIndex::get_path_components(const hobject_t &hoid,
+void HashIndex::get_path_components(const ghobject_t &oid,
vector<string> *path) {
char buf[MAX_HASH_LEVEL + 1];
- snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)hoid.get_filestore_key());
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_filestore_key());
- // Path components are the hex characters of hoid.hash, least
+ // Path components are the hex characters of oid.hobj.hash, least
// significant first
for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
path->push_back(string(&buf[i], 1));
@@ -596,9 +596,9 @@ string HashIndex::get_hash_str(uint32_t hash) {
return retval;
}
-string HashIndex::get_path_str(const hobject_t &hoid) {
- assert(!hoid.is_max());
- return get_hash_str(hoid.hash);
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ assert(!oid.is_max());
+ return get_hash_str(oid.hobj.hash);
}
uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
@@ -616,12 +616,12 @@ uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
int HashIndex::get_path_contents_by_hash(const vector<string> &path,
const string *lower_bound,
- const hobject_t *next_object,
+ const ghobject_t *next_object,
const snapid_t *seq,
set<string> *hash_prefixes,
- set<pair<string, hobject_t> > *objects) {
+ set<pair<string, ghobject_t> > *objects) {
set<string> subdirs;
- map<string, hobject_t> rev_objects;
+ map<string, ghobject_t> rev_objects;
int r;
string cur_prefix;
for (vector<string>::const_iterator i = path.begin();
@@ -632,7 +632,7 @@ int HashIndex::get_path_contents_by_hash(const vector<string> &path,
r = list_objects(path, 0, 0, &rev_objects);
if (r < 0)
return r;
- for (map<string, hobject_t>::iterator i = rev_objects.begin();
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
i != rev_objects.end();
++i) {
string hash_prefix = get_path_str(i->second);
@@ -640,10 +640,10 @@ int HashIndex::get_path_contents_by_hash(const vector<string> &path,
continue;
if (next_object && i->second < *next_object)
continue;
- if (seq && i->second.snap < *seq)
+ if (seq && i->second.hobj.snap < *seq)
continue;
hash_prefixes->insert(hash_prefix);
- objects->insert(pair<string, hobject_t>(hash_prefix, i->second));
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
}
r = list_subdirs(path, &subdirs);
if (r < 0)
@@ -667,13 +667,13 @@ int HashIndex::list_by_hash(const vector<string> &path,
int min_count,
int max_count,
snapid_t seq,
- hobject_t *next,
- vector<hobject_t> *out) {
+ ghobject_t *next,
+ vector<ghobject_t> *out) {
assert(out);
vector<string> next_path = path;
next_path.push_back("");
set<string> hash_prefixes;
- set<pair<string, hobject_t> > objects;
+ set<pair<string, ghobject_t> > objects;
int r = get_path_contents_by_hash(path,
NULL,
next,
@@ -686,16 +686,16 @@ int HashIndex::list_by_hash(const vector<string> &path,
for (set<string>::iterator i = hash_prefixes.begin();
i != hash_prefixes.end();
++i) {
- set<pair<string, hobject_t> >::iterator j = objects.lower_bound(
- make_pair(*i, hobject_t()));
+ set<pair<string, ghobject_t> >::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
if (j == objects.end() || j->first != *i) {
if (min_count > 0 && out->size() > (unsigned)min_count) {
if (next)
- *next = hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, "");
+ *next = ghobject_t(hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, ""));
return 0;
}
*(next_path.rbegin()) = *(i->rbegin());
- hobject_t next_recurse;
+ ghobject_t next_recurse;
if (next)
next_recurse = *next;
r = list_by_hash(next_path,
@@ -727,6 +727,6 @@ int HashIndex::list_by_hash(const vector<string> &path,
}
}
if (next)
- *next = hobject_t::get_max();
+ *next = ghobject_t(hobject_t::get_max());
return 0;
}
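In list_by_hash above, each hash prefix is probed with objects.lower_bound(make_pair(*i, ghobject_t())): a default-constructed ghobject_t sorts before any real object, so the lookup lands on the first entry of that prefix's bucket. The same idiom on a plain std::set, self-contained, with strings standing in for objects:

  #include <iostream>
  #include <set>
  #include <string>
  #include <utility>

  int main()
  {
    std::set<std::pair<std::string, std::string> > objects;
    objects.insert(std::make_pair("2D0", "obj-a"));
    objects.insert(std::make_pair("2D0", "obj-b"));
    objects.insert(std::make_pair("2D1", "obj-c"));
    // The empty string sorts first, so lower_bound yields the bucket head.
    std::set<std::pair<std::string, std::string> >::iterator j =
      objects.lower_bound(std::make_pair(std::string("2D0"), std::string()));
    if (j != objects.end() && j->first == "2D0")
      std::cout << j->second << std::endl;   // prints obj-a
    return 0;
  }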
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index fcabd9f7198..6f5bca077d4 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -39,7 +39,7 @@
* given by the hex characters in the hash beginning with the least
* significant.
*
- * ex: hobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
* would be located in (root)/2/D/0/
*
* Subdirectories are created when the number of objects in a directory
@@ -163,30 +163,30 @@ protected:
int _created(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name
);
int _remove(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name
);
int _lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
vector<string> *path,
string *mangled_name,
int *exists
);
int _collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
private:
/// Recursively remove path and its subdirs
@@ -262,7 +262,7 @@ private:
/// Determine path components from hoid hash
void get_path_components(
- const hobject_t &hoid, ///< [in] Object for which to get path components
+ const ghobject_t &oid, ///< [in] Object for which to get path components
vector<string> *path ///< [out] Path components for hoid.
);
@@ -278,12 +278,12 @@ private:
/**
- * Get string representation of hobject_t/hash
+ * Get string representation of ghobject_t/hash
*
* e.g: 0x01234567 -> "76543210"
*/
static string get_path_str(
- const hobject_t &hoid ///< [in] Object to get hash string for
+ const ghobject_t &oid ///< [in] Object to get hash string for
); ///< @return Hash string for hoid.
/// Get string from hash, @see get_path_str
@@ -319,20 +319,20 @@ private:
int get_path_contents_by_hash(
const vector<string> &path, /// [in] Path to list
const string *lower_bound, /// [in] list > *lower_bound
- const hobject_t *next_object, /// [in] list > *next_object
+ const ghobject_t *next_object, /// [in] list > *next_object
const snapid_t *seq, /// [in] list >= *seq
set<string> *hash_prefixes, /// [out] prefixes in dir
- set<pair<string, hobject_t> > *objects /// [out] objects
+ set<pair<string, ghobject_t> > *objects /// [out] objects
);
- /// List objects in collection in hobject_t order
+ /// List objects in collection in ghobject_t order
int list_by_hash(
const vector<string> &path, /// [in] Path to list
int min_count, /// [in] List at least min_count
int max_count, /// [in] List at most max_count
snapid_t seq, /// [in] list only objects where snap >= seq
- hobject_t *next, /// [in,out] List objects >= *next
- vector<hobject_t> *out /// [out] Listed objects
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
); ///< @return Error Code, 0 on success
};
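Connecting the header comment to get_path_components above: directory levels come from the object hash, least-significant nibble first (the code formats get_filestore_key, assumed here to be the nibble-reversed hash, and walks the resulting string left to right). For hash 0xA4CEE0D2 the least-significant nibbles are 2, D, 0, E, ..., so with three levels populated the object sits under (root)/2/D/0/, matching the ex: comment near the top of the file; deeper levels only appear as directories split.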
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 412721a04c8..83bbfc9703e 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -75,7 +75,7 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
return r;
HashIndex index(c, path, g_conf->filestore_merge_threshold,
g_conf->filestore_split_multiple,
- CollectionIndex::HASH_INDEX_TAG_2,
+ version,
g_conf->filestore_index_retry_probability);
return index.init();
}
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 029e8ad8197..83e1c144754 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -73,7 +73,7 @@ int LFNIndex::init()
return _init();
}
-int LFNIndex::created(const hobject_t &hoid, const char *path)
+int LFNIndex::created(const ghobject_t &oid, const char *path)
{
WRAP_RETRY(
vector<string> path_comp;
@@ -81,38 +81,39 @@ int LFNIndex::created(const hobject_t &hoid, const char *path)
r = decompose_full_path(path, &path_comp, 0, &short_name);
if (r < 0)
goto out;
- r = lfn_created(path_comp, hoid, short_name);
+ r = lfn_created(path_comp, oid, short_name);
if (r < 0)
goto out;
- r = _created(path_comp, hoid, short_name);
+ r = _created(path_comp, oid, short_name);
if (r < 0)
goto out;
);
}
-int LFNIndex::unlink(const hobject_t &hoid)
+int LFNIndex::unlink(const ghobject_t &oid)
{
WRAP_RETRY(
vector<string> path;
string short_name;
- r = _lookup(hoid, &path, &short_name, NULL);
+ r = _lookup(oid, &path, &short_name, NULL);
if (r < 0) {
goto out;
}
- r = _remove(path, hoid, short_name);
+ r = _remove(path, oid, short_name);
if (r < 0) {
goto out;
}
);
}
-int LFNIndex::lookup(const hobject_t &hoid,
+int LFNIndex::lookup(const ghobject_t &oid,
IndexedPath *out_path,
- int *exist) {
+ int *exist)
+{
WRAP_RETRY(
vector<string> path;
string short_name;
- r = _lookup(hoid, &path, &short_name, exist);
+ r = _lookup(oid, &path, &short_name, exist);
if (r < 0)
goto out;
string full_path = get_full_path(path, short_name);
@@ -135,18 +136,18 @@ int LFNIndex::lookup(const hobject_t &hoid,
);
}
-int LFNIndex::collection_list(vector<hobject_t> *ls)
+int LFNIndex::collection_list(vector<ghobject_t> *ls)
{
return _collection_list(ls);
}
-int LFNIndex::collection_list_partial(const hobject_t &start,
+int LFNIndex::collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next)
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
{
return _collection_list_partial(start, min_count, max_count, seq, ls, next);
}
@@ -171,13 +172,14 @@ int LFNIndex::fsync_dir(const vector<string> &path)
int LFNIndex::link_object(const vector<string> &from,
const vector<string> &to,
- const hobject_t &hoid,
- const string &from_short_name) {
+ const ghobject_t &oid,
+ const string &from_short_name)
+{
int r;
string from_path = get_full_path(from, from_short_name);
string to_path;
maybe_inject_failure();
- r = lfn_get_name(to, hoid, 0, &to_path, 0);
+ r = lfn_get_name(to, oid, 0, &to_path, 0);
if (r < 0)
return r;
maybe_inject_failure();
@@ -190,10 +192,11 @@ int LFNIndex::link_object(const vector<string> &from,
}
int LFNIndex::remove_objects(const vector<string> &dir,
- const map<string, hobject_t> &to_remove,
- map<string, hobject_t> *remaining) {
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
set<string> clean_chains;
- for (map<string, hobject_t>::const_iterator to_clean = to_remove.begin();
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
to_clean != to_remove.end();
++to_clean) {
if (!lfn_is_hashed_filename(to_clean->first)) {
@@ -207,7 +210,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
continue;
set<int> holes;
- map<int, pair<string, hobject_t> > chain;
+ map<int, pair<string, ghobject_t> > chain;
for (int i = 0; ; ++i) {
string short_name = lfn_get_short_name(to_clean->second, i);
if (remaining->count(short_name)) {
@@ -219,7 +222,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
}
}
- map<int, pair<string, hobject_t > >::reverse_iterator candidate = chain.rbegin();
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
for (set<int>::iterator i = holes.begin();
i != holes.end();
++i) {
@@ -241,7 +244,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (r < 0)
return -errno;
remaining->erase(candidate->second.first);
- remaining->insert(pair<string, hobject_t>(
+ remaining->insert(pair<string, ghobject_t>(
lfn_get_short_name(candidate->second.second, *i),
candidate->second.second));
++candidate;
@@ -253,13 +256,14 @@ int LFNIndex::remove_objects(const vector<string> &dir,
}
int LFNIndex::move_objects(const vector<string> &from,
- const vector<string> &to) {
- map<string, hobject_t> to_move;
+ const vector<string> &to)
+{
+ map<string, ghobject_t> to_move;
int r;
r = list_objects(from, 0, NULL, &to_move);
if (r < 0)
return r;
- for (map<string,hobject_t>::iterator i = to_move.begin();
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
string from_path = get_full_path(from, i->first);
@@ -280,7 +284,7 @@ int LFNIndex::move_objects(const vector<string> &from,
r = fsync_dir(to);
if (r < 0)
return r;
- for (map<string,hobject_t>::iterator i = to_move.begin();
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
maybe_inject_failure();
@@ -293,21 +297,23 @@ int LFNIndex::move_objects(const vector<string> &from,
}
int LFNIndex::remove_object(const vector<string> &from,
- const hobject_t &hoid) {
+ const ghobject_t &oid)
+{
string short_name;
int r, exist;
maybe_inject_failure();
- r = get_mangled_name(from, hoid, &short_name, &exist);
+ r = get_mangled_name(from, oid, &short_name, &exist);
maybe_inject_failure();
if (r < 0)
return r;
- return lfn_unlink(from, hoid, short_name);
+ return lfn_unlink(from, oid, short_name);
}
int LFNIndex::get_mangled_name(const vector<string> &from,
- const hobject_t &hoid,
- string *mangled_name, int *exists) {
- return lfn_get_name(from, hoid, mangled_name, 0, exists);
+ const ghobject_t &oid,
+ string *mangled_name, int *exists)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, exists);
}
int LFNIndex::move_subdir(
@@ -315,7 +321,8 @@ int LFNIndex::move_subdir(
LFNIndex &dest,
const vector<string> &path,
string dir
- ) {
+ )
+{
vector<string> sub_path(path.begin(), path.end());
sub_path.push_back(dir);
string from_path(from.get_full_path_subdir(sub_path));
@@ -330,8 +337,9 @@ int LFNIndex::move_object(
LFNIndex &from,
LFNIndex &dest,
const vector<string> &path,
- const pair<string, hobject_t> &obj
- ) {
+ const pair<string, ghobject_t> &obj
+ )
+{
string from_path(from.get_full_path(path, obj.first));
string to_path;
string to_name;
@@ -358,7 +366,8 @@ int LFNIndex::move_object(
static int get_hobject_from_oinfo(const char *dir, const char *file,
- hobject_t *o) {
+ ghobject_t *o)
+{
char path[PATH_MAX];
bufferptr bp(PATH_MAX);
snprintf(path, sizeof(path), "%s/%s", dir, file);
@@ -375,7 +384,8 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
- long *handle, map<string, hobject_t> *out) {
+ long *handle, map<string, ghobject_t> *out)
+{
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
@@ -402,7 +412,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
if (de->d_name[0] == '.')
continue;
string short_name(de->d_name);
- hobject_t obj;
+ ghobject_t obj;
if (lfn_is_object(short_name)) {
r = lfn_translate(to_list, short_name, &obj);
if (r < 0) {
@@ -416,7 +426,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
if (index_version == HASH_INDEX_TAG)
get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
- out->insert(pair<string, hobject_t>(short_name, obj));
+ out->insert(pair<string, ghobject_t>(short_name, obj));
++listed;
} else {
continue;
@@ -435,7 +445,8 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
}
int LFNIndex::list_subdirs(const vector<string> &to_list,
- set<string> *out) {
+ set<string> *out)
+{
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
@@ -449,7 +460,7 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
}
string short_name(de->d_name);
string demangled_name;
- hobject_t obj;
+ ghobject_t obj;
if (lfn_is_subdir(short_name, &demangled_name)) {
out->insert(demangled_name);
}
@@ -501,7 +512,8 @@ int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
int LFNIndex::add_attr_path(const vector<string> &path,
const string &attr_name,
- bufferlist &attr_value) {
+ bufferlist &attr_value)
+{
string full_path = get_full_path_subdir(path);
maybe_inject_failure();
return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
@@ -511,7 +523,8 @@ int LFNIndex::add_attr_path(const vector<string> &path,
int LFNIndex::get_attr_path(const vector<string> &path,
const string &attr_name,
- bufferlist &attr_value) {
+ bufferlist &attr_value)
+{
string full_path = get_full_path_subdir(path);
size_t size = 1024; // Initial
while (1) {
@@ -536,22 +549,24 @@ int LFNIndex::get_attr_path(const vector<string> &path,
}
int LFNIndex::remove_attr_path(const vector<string> &path,
- const string &attr_name) {
+ const string &attr_name)
+{
string full_path = get_full_path_subdir(path);
string mangled_attr_name = mangle_attr_name(attr_name);
maybe_inject_failure();
return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
}
-string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
{
char s[FILENAME_MAX_LEN];
char *end = s + sizeof(s);
char *t = s;
- const char *i = hoid.oid.name.c_str();
+ assert(oid.generation == ghobject_t::NO_GEN);
+ const char *i = oid.hobj.oid.name.c_str();
// Escape subdir prefix
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
*t++ = '\\';
*t++ = 'd';
i += 4;
@@ -560,7 +575,7 @@ string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
if (*i == '\\') {
*t++ = '\\';
*t++ = '\\';
- } else if (*i == '.' && i == hoid.oid.name.c_str()) { // only escape leading .
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
*t++ = '\\';
*t++ = '.';
} else if (*i == '/') {
@@ -571,13 +586,13 @@ string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
i++;
}
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "_head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "_snapdir");
else
- t += snprintf(t, end - t, "_%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
return string(s);
}
@@ -601,94 +616,112 @@ static void append_escaped(string::const_iterator begin,
}
}
-string LFNIndex::lfn_generate_object_name(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name(const ghobject_t &oid)
{
if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(hoid);
+ return lfn_generate_object_name_keyless(oid);
if (index_version == HASH_INDEX_TAG_2)
- return lfn_generate_object_name_poolless(hoid);
+ return lfn_generate_object_name_poolless(oid);
string full_name;
- string::const_iterator i = hoid.oid.name.begin();
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
full_name.append("\\d");
i += 4;
- } else if (hoid.oid.name[0] == '.') {
+ } else if (oid.hobj.oid.name[0] == '.') {
full_name.append("\\.");
++i;
}
- append_escaped(i, hoid.oid.name.end(), &full_name);
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
full_name.append("_");
- append_escaped(hoid.get_key().begin(), hoid.get_key().end(), &full_name);
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
full_name.append("_");
char buf[PATH_MAX];
char *t = buf;
char *end = t + sizeof(buf);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
full_name += string(buf);
full_name.append("_");
- append_escaped(hoid.nspace.begin(), hoid.nspace.end(), &full_name);
+ append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
full_name.append("_");
t = buf;
end = t + sizeof(buf);
- if (hoid.pool == -1)
+ if (oid.hobj.pool == -1)
t += snprintf(t, end - t, "none");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.pool);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
full_name += string(buf);
+ if (oid.generation != ghobject_t::NO_GEN) {
+ assert(oid.shard_id != ghobject_t::NO_SHARD);
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation);
+ full_name += string(buf);
+
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%x", (int)oid.shard_id);
+ full_name += string(buf);
+ }
+
return full_name;
}
-string LFNIndex::lfn_generate_object_name_poolless(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
{
if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(hoid);
+ return lfn_generate_object_name_keyless(oid);
+ assert(oid.generation == ghobject_t::NO_GEN);
string full_name;
- string::const_iterator i = hoid.oid.name.begin();
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
full_name.append("\\d");
i += 4;
- } else if (hoid.oid.name[0] == '.') {
+ } else if (oid.hobj.oid.name[0] == '.') {
full_name.append("\\.");
++i;
}
- append_escaped(i, hoid.oid.name.end(), &full_name);
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
full_name.append("_");
- append_escaped(hoid.get_key().begin(), hoid.get_key().end(), &full_name);
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
full_name.append("_");
char snap_with_hash[PATH_MAX];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
full_name += string(snap_with_hash);
return full_name;
}
int LFNIndex::lfn_get_name(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
string *mangled_name, string *out_path,
int *exists)
{
string subdir_path = get_full_path_subdir(path);
- string full_name = lfn_generate_object_name(hoid);
+ string full_name = lfn_generate_object_name(oid);
int r;
if (!lfn_must_hash(full_name)) {
@@ -718,7 +751,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
string candidate_path;
char buf[FILENAME_MAX_LEN + 1];
for ( ; ; ++i) {
- candidate = lfn_get_short_name(hoid, i);
+ candidate = lfn_get_short_name(oid, i);
candidate_path = get_full_path(path, candidate);
r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
if (r < 0) {
@@ -757,20 +790,20 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
}
int LFNIndex::lfn_created(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name)
{
if (!lfn_is_hashed_filename(mangled_name))
return 0;
string full_path = get_full_path(path, mangled_name);
- string full_name = lfn_generate_object_name(hoid);
+ string full_name = lfn_generate_object_name(oid);
maybe_inject_failure();
return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
full_name.c_str(), full_name.size());
}
int LFNIndex::lfn_unlink(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name)
{
if (!lfn_is_hashed_filename(mangled_name)) {
@@ -787,7 +820,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
int i = 0;
for ( ; ; ++i) {
- string candidate = lfn_get_short_name(hoid, i);
+ string candidate = lfn_get_short_name(oid, i);
if (candidate == mangled_name)
break;
}
@@ -795,7 +828,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
++i;
for ( ; ; ++i) {
struct stat buf;
- string to_check = lfn_get_short_name(hoid, i);
+ string to_check = lfn_get_short_name(oid, i);
string to_check_path = get_full_path(path, to_check);
int r = ::stat(to_check_path.c_str(), &buf);
if (r < 0) {
@@ -817,7 +850,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
return 0;
} else {
string rename_to = get_full_path(path, mangled_name);
- string rename_from = get_full_path(path, lfn_get_short_name(hoid, i - 1));
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
maybe_inject_failure();
int r = ::rename(rename_from.c_str(), rename_to.c_str());
maybe_inject_failure();
@@ -830,7 +863,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
int LFNIndex::lfn_translate(const vector<string> &path,
const string &short_name,
- hobject_t *out)
+ ghobject_t *out)
{
if (!lfn_is_hashed_filename(short_name)) {
return lfn_parse_object_name(short_name, out);
@@ -863,7 +896,7 @@ bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
return 0;
}
-static int parse_object(const char *s, hobject_t& o)
+static int parse_object(const char *s, ghobject_t& o)
{
const char *hash = s + strlen(s) - 1;
while (*hash != '_' &&
@@ -899,28 +932,28 @@ static int parse_object(const char *s, hobject_t& o)
i++;
}
*t = 0;
- o.oid.name = string(buf, t-buf);
+ o.hobj.oid.name = string(buf, t-buf);
if (strncmp(bar+1, "head", 4) == 0)
- o.snap = CEPH_NOSNAP;
+ o.hobj.snap = CEPH_NOSNAP;
else if (strncmp(bar+1, "snapdir", 7) == 0)
- o.snap = CEPH_SNAPDIR;
+ o.hobj.snap = CEPH_SNAPDIR;
else
- o.snap = strtoull(bar+1, NULL, 16);
- sscanf(hash, "_%X", &o.hash);
+ o.hobj.snap = strtoull(bar+1, NULL, 16);
+ sscanf(hash, "_%X", &o.hobj.hash);
return 1;
}
return 0;
}
-bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t *out)
+bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
{
bool r = parse_object(long_name.c_str(), *out);
int64_t pool = -1;
pg_t pg;
if (coll().is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- out->pool = pool;
+ out->hobj.pool = pool;
if (!r) return r;
string temp = lfn_generate_object_name(*out);
return r;
@@ -928,7 +961,8 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t
static bool append_unescaped(string::const_iterator begin,
string::const_iterator end,
- string *out) {
+ string *out)
+{
for (string::const_iterator i = begin; i != end; ++i) {
if (*i == '\\') {
++i;
@@ -950,7 +984,8 @@ static bool append_unescaped(string::const_iterator begin,
}
bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
- hobject_t *out) {
+ ghobject_t *out)
+{
string name;
string key;
uint32_t hash;
@@ -1011,12 +1046,12 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
pg_t pg;
if (coll().is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- (*out) = hobject_t(name, key, snap, hash, pool, "");
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
-bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
+bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
{
string name;
string key;
@@ -1024,6 +1059,8 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
uint32_t hash;
snapid_t snap;
uint64_t pool;
+ gen_t generation = ghobject_t::NO_GEN;
+ shard_t shard_id = ghobject_t::NO_SHARD;
if (index_version == HASH_INDEX_TAG)
return lfn_parse_object_name_keyless(long_name, out);
@@ -1081,10 +1118,28 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
current = ++end;
for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end != long_name.end())
- return false;
string pstring(current, end);
+ // Optional generation/shard_id
+ string genstring, shardstring;
+ if (end != long_name.end()) {
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ genstring = string(current, end);
+
+ generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return false;
+ shardstring = string(current, end);
+
+ shard_id = (shard_t)strtoul(shardstring.c_str(), NULL, 16);
+ }
+
if (snap_str == "head")
snap = CEPH_NOSNAP;
else if (snap_str == "snapdir")
@@ -1098,7 +1153,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
else
pool = strtoull(pstring.c_str(), NULL, 16);
- (*out) = hobject_t(name, key, snap, hash, (int64_t)pool, ns);
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
return true;
}
@@ -1170,9 +1225,9 @@ void LFNIndex::build_filename(const char *old_filename, int i, char *filename, i
}
}
-string LFNIndex::lfn_get_short_name(const hobject_t &hoid, int i)
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
{
- string long_name = lfn_generate_object_name(hoid);
+ string long_name = lfn_generate_object_name(oid);
assert(lfn_must_hash(long_name));
char buf[FILENAME_SHORT_LEN + 4];
build_filename(long_name.c_str(), i, buf, sizeof(buf));
@@ -1212,7 +1267,7 @@ string LFNIndex::demangle_path_component(const string &component)
}
int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
- hobject_t *hoid, string *shortname)
+ ghobject_t *oid, string *shortname)
{
const char *beginning = in + get_base_path().size();
const char *end = beginning;
@@ -1228,8 +1283,8 @@ int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
}
}
*shortname = string(beginning, end - beginning);
- if (hoid) {
- int r = lfn_translate(*out, *shortname, hoid);
+ if (oid) {
+ int r = lfn_translate(*out, *shortname, oid);
if (r < 0)
return r;
}
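Reading lfn_generate_object_name and lfn_parse_object_name above together, the long-name layout for current index versions is underscore-separated: escaped name, key, snapshot ("head", "snapdir", or hex), hash as eight hex digits, namespace, pool (hex, or "none"), plus a trailing generation/shard pair emitted only when generation != NO_GEN. A hand-derived example (not taken from a real store): object "foo" with empty key and namespace, head snapshot, hash 0xA4CEE0D2, pool 1 comes out as

  foo__head_A4CEE0D2__1

and with generation 2 on shard 0 as

  foo__head_A4CEE0D2__1_2_0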
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index b73ff4db268..f436446bf0f 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -165,35 +165,35 @@ public:
/// @see CollectionIndex
int created(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const char *path
);
/// @see CollectionIndex
int unlink(
- const hobject_t &hoid
+ const ghobject_t &oid
);
/// @see CollectionIndex
int lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
IndexedPath *path,
int *exist
);
/// @see CollectionIndex
int collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
/// @see CollectionIndex
int collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
virtual int _split(
@@ -221,20 +221,20 @@ protected:
/// Will be called upon object creation
virtual int _created(
const vector<string> &path, ///< [in] Path to subdir.
- const hobject_t &hoid, ///< [in] Object created.
+ const ghobject_t &oid, ///< [in] Object created.
const string &mangled_name ///< [in] Mangled filename.
) = 0;
/// Will be called to remove an object
virtual int _remove(
const vector<string> &path, ///< [in] Path to subdir.
- const hobject_t &hoid, ///< [in] Object to remove.
+ const ghobject_t &oid, ///< [in] Object to remove.
const string &mangled_name ///< [in] Mangled filename.
) = 0;
- /// Return the path and mangled_name for hoid.
+ /// Return the path and mangled_name for oid.
virtual int _lookup(
- const hobject_t &hoid,///< [in] Object for lookup.
+ const ghobject_t &oid,///< [in] Object for lookup.
vector<string> *path, ///< [out] Path to the object.
string *mangled_name, ///< [out] Mangled filename.
int *exists ///< [out] True if the object exists.
@@ -252,17 +252,17 @@ protected:
*/
/// List contents of collection.
virtual int _collection_list(
- vector<hobject_t> *ls ///< [out] Listed objects.
+ vector<ghobject_t> *ls ///< [out] Listed objects.
) = 0;
/// @see CollectionIndex
virtual int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
) = 0;
protected:
@@ -278,8 +278,8 @@ protected:
int link_object(
const vector<string> &from, ///< [in] Source subdirectory.
const vector<string> &to, ///< [in] Dest subdirectory.
- const hobject_t &hoid, ///< [in] Object to move.
- const string &from_short_name ///< [in] Mangled filename of hoid.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const string &from_short_name ///< [in] Mangled filename of oid.
); ///< @return Error Code, 0 on success
/**
@@ -296,8 +296,8 @@ protected:
*/
int remove_objects(
const vector<string> &dir,
- const map<string, hobject_t> &to_remove,
- map<string, hobject_t> *remaining
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining
);
@@ -322,11 +322,11 @@ protected:
*/
int remove_object(
const vector<string> &from, ///< [in] Directory from which to remove.
- const hobject_t &to_remove ///< [in] Object to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
);
/**
- * Gets the filename corresponding to hoid in from.
+ * Gets the filename corresponding to oid in from.
*
* The filename may differ between subdirectories. Furthermore,
* file creations or removals in from may invalidate the name.
@@ -334,7 +334,7 @@ protected:
*/
int get_mangled_name(
const vector<string> &from, ///< [in] Subdirectory
- const hobject_t &hoid, ///< [in] Object
+ const ghobject_t &oid, ///< [in] Object
string *mangled_name, ///< [out] Filename
int *exists ///< [out] 1 if the file exists, else 0
);
@@ -352,7 +352,7 @@ protected:
LFNIndex &from, ///< [in] from index
LFNIndex &dest, ///< [in] to index
const vector<string> &path, ///< [in] path to split
- const pair<string, hobject_t> &obj ///< [in] obj to move
+ const pair<string, ghobject_t> &obj ///< [in] obj to move
);
/**
@@ -369,7 +369,7 @@ protected:
const vector<string> &to_list,
int max_objects,
long *handle,
- map<string, hobject_t> *out
+ map<string, ghobject_t> *out
);
/// Lists subdirectories.
@@ -425,43 +425,43 @@ private:
}
/**
- * Gets the filename corresponsing to hoid in path.
+ * Gets the filename corresponding to oid in path.
*
- * @param [in] path Path in which to get filename for hoid.
- * @param [in] hoid Object for which to get filename.
- * @param [out] mangled_name Filename for hoid, pass NULL if not needed.
- * @param [out] full_path Fullpath for hoid, pass NULL if not needed.
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
* @param [out] exists 1 if the file exists, 0 otherwise, pass NULL if
* not needed
* @return Error Code, 0 on success.
*/
int lfn_get_name(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
string *mangled_name,
string *full_path,
int *exists
);
- /// Adjusts path contents when hoid is created at name mangled_name.
+ /// Adjusts path contents when oid is created at name mangled_name.
int lfn_created(
const vector<string> &path, ///< [in] Path to adjust.
- const hobject_t &hoid, ///< [in] Object created.
+ const ghobject_t &oid, ///< [in] Object created.
const string &mangled_name ///< [in] Filename of created object.
);
- /// Removes hoid from path while adjusting path contents
+ /// Removes oid from path while adjusting path contents
int lfn_unlink(
- const vector<string> &path, ///< [in] Path containing hoid.
- const hobject_t &hoid, ///< [in] Object to remove.
+ const vector<string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
const string &mangled_name ///< [in] Filename of object to remove.
);
- ///Transate a file into and hobject_t.
+ /// Translate a file into a ghobject_t.
int lfn_translate(
const vector<string> &path, ///< [in] Path containing the file.
const string &short_name, ///< [in] Filename to translate.
- hobject_t *out ///< [out] Object found.
+ ghobject_t *out ///< [out] Object found.
); ///< @return Negative error code on error, 0 if not an object, 1 otherwise
/* manglers/demanglers */
@@ -478,35 +478,35 @@ private:
/// Generate object name
string lfn_generate_object_name_keyless(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Generate object name
string lfn_generate_object_name_poolless(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Generate object name
string lfn_generate_object_name(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Parse object name
bool lfn_parse_object_name_keyless(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successful, False otherwise.
/// Parse object name
bool lfn_parse_object_name_poolless(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successful, False otherwise.
/// Parse object name
bool lfn_parse_object_name(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successful, False otherwise.
/// Checks whether short_name is a hashed filename.
@@ -521,7 +521,7 @@ private:
/// Generate hashed name.
string lfn_get_short_name(
- const hobject_t &hoid, ///< [in] Object for which to generate.
+ const ghobject_t &oid, ///< [in] Object for which to generate.
int i ///< [in] Index of hashed name to generate.
); ///< @return Hashed filename.
@@ -554,7 +554,7 @@ private:
int decompose_full_path(
const char *in, ///< [in] Full path to object.
vector<string> *out, ///< [out] Path to object at in.
- hobject_t *hoid, ///< [out] Object at in.
+ ghobject_t *oid, ///< [out] Object at in.
string *shortname ///< [out] Filename of object at in.
); ///< @return Error Code, 0 on success.
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
index 5cc1e495de1..7717aac7437 100644
--- a/src/os/ObjectMap.h
+++ b/src/os/ObjectMap.h
@@ -30,102 +30,102 @@ class ObjectMap {
public:
/// Set keys and values from specified map
virtual int set_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const map<string, bufferlist> &set, ///< [in] key to value map to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Set header
virtual int set_header(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const bufferlist &bl, ///< [in] header to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Retrieve header
virtual int get_header(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
bufferlist *bl ///< [out] header to set
) = 0;
- /// Clear all map keys and values from hoid
+ /// Clear all map keys and values from oid
virtual int clear(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
- /// Clear all map keys and values from hoid
+ /// Clear all map keys and values from oid
virtual int rm_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &to_clear, ///< [in] Keys to clear
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Get all keys and values
virtual int get(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
bufferlist *header, ///< [out] Returned Header
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
/// Get values for supplied keys
virtual int get_keys(
- const hobject_t &hoid, ///< [in] object containing map
- set<string> *keys ///< [out] Keys defined on hoid
+ const ghobject_t &oid, ///< [in] object containing map
+ set<string> *keys ///< [out] Keys defined on oid
) = 0;
/// Get values for supplied keys
virtual int get_values(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &keys, ///< [in] Keys to get
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
/// Check key existence
virtual int check_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &keys, ///< [in] Keys to check
- set<string> *out ///< [out] Subset of keys defined on hoid
+ set<string> *out ///< [out] Subset of keys defined on oid
) = 0;
/// Get xattrs
virtual int get_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const set<string> &to_get, ///< [in] keys to get
map<string, bufferlist> *out ///< [out] subset of attrs/vals defined
) = 0;
/// Get all xattrs
virtual int get_all_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
set<string> *out ///< [out] attrs and values
) = 0;
/// set xattrs in to_set
virtual int set_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const map<string, bufferlist> &to_set,///< [in] attrs/values to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// remove xattrs in to_remove
virtual int remove_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const set<string> &to_remove, ///< [in] attrs to remove
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
- /// Clone keys efficiently from hoid map to target map
+ /// Clone keys efficiently from oid map to target map
virtual int clone(
- const hobject_t &hoid, ///< [in] object containing map
- const hobject_t &target, ///< [in] target of clone
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
const SequencerPosition *spos=0 ///< [in] sequencer position
) { return 0; }
/// Ensure all previous writes are durable
virtual int sync(
- const hobject_t *hoid=0, ///< [in] object
+ const ghobject_t *oid=0, ///< [in] object
const SequencerPosition *spos=0 ///< [in] Sequencer
) { return 0; }
@@ -144,7 +144,7 @@ public:
virtual ~ObjectMapIteratorImpl() {}
};
typedef std::tr1::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
- virtual ObjectMapIterator get_iterator(const hobject_t &hoid) {
+ virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
return ObjectMapIterator();
}
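For orientation, here is a simplified, self-contained model of the ObjectMap contract above — set_keys() replaces duplicate keys, get_values() filters by the requested key set, rm_keys() clears a subset — keyed by a small stand-in for ghobject_t rather than the real type:

~~~~~~~~~~~~~~~~{.cpp}
#include <cassert>
#include <map>
#include <set>
#include <string>

struct GObj {                            // stand-in for ghobject_t
  std::string oid;
  long long generation;                  // NO_GEN would be -1
  int shard_id;                          // NO_SHARD would be -1
  bool operator<(const GObj &r) const {
    if (oid != r.oid) return oid < r.oid;
    if (generation != r.generation) return generation < r.generation;
    return shard_id < r.shard_id;
  }
};

struct ToyObjectMap {
  std::map<GObj, std::map<std::string, std::string>> omaps;

  int set_keys(const GObj &o, const std::map<std::string, std::string> &set) {
    for (auto &kv : set)
      omaps[o][kv.first] = kv.second;    // replaces duplicate keys
    return 0;
  }
  int get_values(const GObj &o, const std::set<std::string> &keys,
                 std::map<std::string, std::string> *out) {
    auto i = omaps.find(o);
    if (i == omaps.end())
      return -2;                         // stands in for -ENOENT
    for (auto &k : keys) {
      auto j = i->second.find(k);
      if (j != i->second.end())
        (*out)[k] = j->second;           // only keys actually defined
    }
    return 0;
  }
  int rm_keys(const GObj &o, const std::set<std::string> &to_clear) {
    for (auto &k : to_clear)
      omaps[o].erase(k);
    return 0;
  }
};

int main() {
  ToyObjectMap om;
  GObj o{"rbd_header.1012ae", -1, -1};
  om.set_keys(o, {{"size", "1048576"}, {"features", "1"}});
  std::map<std::string, std::string> vals;
  om.get_values(o, {"size", "absent"}, &vals);
  assert(vals.size() == 1 && vals["size"] == "1048576");
  om.rm_keys(o, {"features"});
  return 0;
}
~~~~~~~~~~~~~~~~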
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 9d8b989225b..1a1bbcb0b67 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -15,6 +15,7 @@
#include <tr1/memory>
#include "ObjectStore.h"
#include "common/Formatter.h"
+#include "FileStore.h"
ostream& operator<<(ostream& out, const ObjectStore::Sequencer& s)
{
@@ -77,7 +78,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TOUCH:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "touch");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -87,7 +88,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_WRITE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
bufferlist bl;
@@ -104,7 +105,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_ZERO:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "zero");
@@ -118,7 +119,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TRIMCACHE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "trim_cache");
@@ -132,7 +133,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TRUNCATE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
f->dump_string("op_name", "truncate");
f->dump_stream("collection") << cid;
@@ -144,7 +145,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "remove");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -154,7 +155,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_SETATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
bufferlist bl;
i.get_bl(bl);
@@ -169,7 +170,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_SETATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferptr> aset;
i.get_attrset(aset);
f->dump_string("op_name", "setattrs");
@@ -187,7 +188,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_RMATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
f->dump_string("op_name", "rmattr");
f->dump_stream("collection") << cid;
@@ -199,7 +200,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_RMATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "rmattrs");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -209,8 +210,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
f->dump_string("op_name", "clone");
f->dump_stream("collection") << cid;
f->dump_stream("src_oid") << oid;
@@ -221,8 +222,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONERANGE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "clonerange");
@@ -237,8 +238,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONERANGE2:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t srcoff = i.get_length();
uint64_t len = i.get_length();
uint64_t dstoff = i.get_length();
@@ -272,7 +273,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
{
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "collection_add");
f->dump_stream("src_collection") << ocid;
f->dump_stream("dst_collection") << ncid;
@@ -283,7 +284,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_COLL_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "collection_remove");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -294,7 +295,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
{
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->open_object_section("collection_move");
f->dump_stream("src_collection") << ocid;
f->dump_stream("dst_collection") << ncid;
@@ -344,7 +345,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_CLEAR:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "omap_clear");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -354,7 +355,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_SETKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferlist> aset;
i.get_attrset(aset);
f->dump_string("op_name", "omap_setkeys");
@@ -372,7 +373,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_RMKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
set<string> keys;
i.get_keyset(keys);
f->dump_string("op_name", "omap_rmkeys");
@@ -384,7 +385,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_SETHEADER:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
bufferlist bl;
i.get_bl(bl);
f->dump_string("op_name", "omap_setheader");
@@ -425,7 +426,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_RMKEYRANGE:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string first, last;
first = i.get_key();
last = i.get_key();
@@ -460,9 +461,9 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
t = new Transaction;
coll_t c("foocoll");
coll_t c2("foocoll2");
- hobject_t o1("obj", "", 123, 456, -1, "");
- hobject_t o2("obj2", "", 123, 456, -1, "");
- hobject_t o3("obj3", "", 123, 456, -1, "");
+ ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
+ ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
+ ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, ""));
t->touch(c, o1);
bufferlist bl;
bl.append("some data");
@@ -497,3 +498,44 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
o.push_back(t);
}
+int ObjectStore::collection_list(coll_t c, vector<hobject_t>& o)
+{
+ vector<ghobject_t> go;
+ int ret = collection_list(c, go);
+ if (ret == 0) {
+ o.reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
+ o.push_back(i->hobj);
+ }
+ return ret;
+}
+
+int ObjectStore::collection_list_partial(coll_t c, hobject_t start,
+ int min, int max, snapid_t snap,
+ vector<hobject_t> *ls, hobject_t *next)
+{
+ vector<ghobject_t> go;
+ ghobject_t gnext, gstart(start);
+ int ret = collection_list_partial(c, gstart, min, max, snap, &go, &gnext);
+ if (ret == 0) {
+ *next = gnext.hobj;
+ ls->reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
+ ls->push_back(i->hobj);
+ }
+ return ret;
+}
+
+int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ snapid_t seq, vector<hobject_t> *ls)
+{
+ vector<ghobject_t> go;
+ ghobject_t gstart(start), gend(end);
+ int ret = collection_list_range(c, gstart, gend, seq, &go);
+ if (ret == 0) {
+ ls->reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
+ ls->push_back(i->hobj);
+ }
+ return ret;
+}
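These three new non-virtual overloads keep legacy hobject_t callers working while the virtual interface moves to ghobject_t: list with the new type, then copy out only the embedded hobj field. A stripped-down sketch of the same down-conversion pattern, using stand-in types instead of Ceph's:

~~~~~~~~~~~~~~~~{.cpp}
#include <string>
#include <vector>

struct HObj { std::string oid; };                  // stand-in for hobject_t
struct GObj { HObj hobj; long long generation; };  // stand-in for ghobject_t

// stands in for the virtual, ghobject_t-based listing
int list_g(std::vector<GObj> *out) {
  out->push_back(GObj{HObj{"obj1"}, -1});
  out->push_back(GObj{HObj{"obj2"}, -1});
  return 0;
}

// legacy-style wrapper mirroring ObjectStore::collection_list above
int list_h(std::vector<HObj> *o) {
  std::vector<GObj> go;
  int ret = list_g(&go);
  if (ret == 0) {
    o->reserve(go.size());
    for (std::vector<GObj>::iterator i = go.begin(); i != go.end(); ++i)
      o->push_back(i->hobj);                       // keep only the hobject_t part
  }
  return ret;
}

int main() {
  std::vector<HObj> objects;
  return list_h(&objects);                         // objects now holds obj1, obj2
}
~~~~~~~~~~~~~~~~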
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 7e8f6ce43bf..07473b344f5 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -340,21 +340,23 @@ public:
void get_bl(bufferlist& bl) {
::decode(bl, p);
}
- hobject_t get_oid() {
- hobject_t hoid;
+ ghobject_t get_oid() {
+ ghobject_t oid;
if (sobject_encoding) {
sobject_t soid;
::decode(soid, p);
- hoid.snap = soid.snap;
- hoid.oid = soid.oid;
+ oid.hobj.snap = soid.snap;
+ oid.hobj.oid = soid.oid;
+ oid.generation = ghobject_t::NO_GEN;
+ oid.shard_id = ghobject_t::NO_SHARD;
} else {
- ::decode(hoid, p);
+ ::decode(oid, p);
if (use_pool_override && pool_override != -1 &&
- hoid.pool == -1) {
- hoid.pool = pool_override;
+ oid.hobj.pool == -1) {
+ oid.hobj.pool = pool_override;
}
}
- return hoid;
+ return oid;
}
coll_t get_cid() {
coll_t c;
@@ -408,14 +410,14 @@ public:
::encode(op, tbl);
ops++;
}
- void touch(coll_t cid, const hobject_t& oid) {
+ void touch(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_TOUCH;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void write(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
+ void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
__u32 op = OP_WRITE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -431,7 +433,7 @@ public:
::encode(data, tbl);
ops++;
}
- void zero(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len) {
+ void zero(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
__u32 op = OP_ZERO;
::encode(op, tbl);
::encode(cid, tbl);
@@ -440,7 +442,7 @@ public:
::encode(len, tbl);
ops++;
}
- void truncate(coll_t cid, const hobject_t& oid, uint64_t off) {
+ void truncate(coll_t cid, const ghobject_t& oid, uint64_t off) {
__u32 op = OP_TRUNCATE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -448,18 +450,18 @@ public:
::encode(off, tbl);
ops++;
}
- void remove(coll_t cid, const hobject_t& oid) {
+ void remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_REMOVE;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void setattr(coll_t cid, const hobject_t& oid, const char* name, bufferlist& val) {
+ void setattr(coll_t cid, const ghobject_t& oid, const char* name, bufferlist& val) {
string n(name);
setattr(cid, oid, n, val);
}
- void setattr(coll_t cid, const hobject_t& oid, const string& s, bufferlist& val) {
+ void setattr(coll_t cid, const ghobject_t& oid, const string& s, bufferlist& val) {
__u32 op = OP_SETATTR;
::encode(op, tbl);
::encode(cid, tbl);
@@ -468,7 +470,7 @@ public:
::encode(val, tbl);
ops++;
}
- void setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& attrset) {
+ void setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& attrset) {
__u32 op = OP_SETATTRS;
::encode(op, tbl);
::encode(cid, tbl);
@@ -484,11 +486,11 @@ public:
::encode(attrset, tbl);
ops++;
}
- void rmattr(coll_t cid, const hobject_t& oid, const char *name) {
+ void rmattr(coll_t cid, const ghobject_t& oid, const char *name) {
string n(name);
rmattr(cid, oid, n);
}
- void rmattr(coll_t cid, const hobject_t& oid, const string& s) {
+ void rmattr(coll_t cid, const ghobject_t& oid, const string& s) {
__u32 op = OP_RMATTR;
::encode(op, tbl);
::encode(cid, tbl);
@@ -496,14 +498,14 @@ public:
::encode(s, tbl);
ops++;
}
- void rmattrs(coll_t cid, const hobject_t& oid) {
+ void rmattrs(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_RMATTRS;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void clone(coll_t cid, const hobject_t& oid, hobject_t noid) {
+ void clone(coll_t cid, const ghobject_t& oid, ghobject_t noid) {
__u32 op = OP_CLONE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -511,7 +513,7 @@ public:
::encode(noid, tbl);
ops++;
}
- void clone_range(coll_t cid, const hobject_t& oid, hobject_t noid,
+ void clone_range(coll_t cid, const ghobject_t& oid, ghobject_t noid,
uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
__u32 op = OP_CLONERANGE2;
::encode(op, tbl);
@@ -535,7 +537,7 @@ public:
::encode(cid, tbl);
ops++;
}
- void collection_add(coll_t cid, coll_t ocid, const hobject_t& oid) {
+ void collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid) {
__u32 op = OP_COLL_ADD;
::encode(op, tbl);
::encode(cid, tbl);
@@ -543,20 +545,20 @@ public:
::encode(oid, tbl);
ops++;
}
- void collection_remove(coll_t cid, const hobject_t& oid) {
+ void collection_remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_COLL_REMOVE;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void collection_move(coll_t cid, coll_t oldcid, const hobject_t& oid) {
+ void collection_move(coll_t cid, coll_t oldcid, const ghobject_t& oid) {
collection_add(cid, oldcid, oid);
collection_remove(oldcid, oid);
return;
}
- void collection_move_rename(coll_t oldcid, const hobject_t& oldoid,
- coll_t cid, const hobject_t& oid) {
+ void collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& oid) {
__u32 op = OP_COLL_MOVE_RENAME;
::encode(op, tbl);
::encode(oldcid, tbl);
@@ -611,55 +613,55 @@ public:
ops++;
}
- /// Remove omap from hoid
+ /// Remove omap from oid
void omap_clear(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid ///< [in] Object from which to remove omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid ///< [in] Object from which to remove omap
) {
__u32 op = OP_OMAP_CLEAR;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
ops++;
}
- /// Set keys on hoid omap. Replaces duplicate keys.
+ /// Set keys on oid omap. Replaces duplicate keys.
void omap_setkeys(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object to update
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
const map<string, bufferlist> &attrset ///< [in] Replacement keys and values
) {
__u32 op = OP_OMAP_SETKEYS;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(attrset, tbl);
ops++;
}
- /// Remove keys from hoid omap
+ /// Remove keys from oid omap
void omap_rmkeys(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const set<string> &keys ///< [in] Keys to clear
) {
__u32 op = OP_OMAP_RMKEYS;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(keys, tbl);
ops++;
}
- /// Remove key range from hoid omap
+ /// Remove key range from oid omap
void omap_rmkeyrange(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const string& first, ///< [in] first key in range
const string& last ///< [in] first key past range
) {
__u32 op = OP_OMAP_RMKEYRANGE;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(first, tbl);
::encode(last, tbl);
ops++;
@@ -667,14 +669,14 @@ public:
/// Set omap header
void omap_setheader(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const bufferlist &bl ///< [in] Header value
) {
__u32 op = OP_OMAP_SETHEADER;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(bl, tbl);
ops++;
}
@@ -857,6 +859,8 @@ public:
virtual int get_max_object_name_length() = 0;
virtual int mkfs() = 0; // wipe
virtual int mkjournal() = 0; // journal only
+ virtual void set_allow_sharded_objects() = 0;
+ virtual bool get_allow_sharded_objects() = 0;
virtual int statfs(struct statfs *buf) = 0;
@@ -875,32 +879,32 @@ public:
virtual int get_ideal_list_max() { return 64; }
// objects
- virtual bool exists(coll_t cid, const hobject_t& oid) = 0; // useful?
+ virtual bool exists(coll_t cid, const ghobject_t& oid) = 0; // useful?
virtual int stat(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
struct stat *st,
bool allow_eio = false) = 0; // struct stat?
virtual int read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
bool allow_eio = false) = 0;
- virtual int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
+ virtual int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
- virtual int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr& value) = 0;
- int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferlist& value) {
+ virtual int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value) = 0;
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferlist& value) {
bufferptr bp;
int r = getattr(cid, oid, name, bp);
if (bp.length())
value.push_back(bp);
return r;
}
- virtual int getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) {return 0;};
+ virtual int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) {return 0;};
// collections
@@ -915,7 +919,7 @@ public:
virtual int collection_getattr(coll_t cid, const char *name, bufferlist& bl) = 0;
virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset) = 0;
virtual bool collection_empty(coll_t c) = 0;
- virtual int collection_list(coll_t c, vector<hobject_t>& o) = 0;
+ virtual int collection_list(coll_t c, vector<ghobject_t>& o) = 0;
/**
* list partial contents of collection relative to a hash offset/position
@@ -929,9 +933,9 @@ public:
* @param next [out] next item sorts >= this value
* @return zero on success, or negative error
*/
- virtual int collection_list_partial(coll_t c, hobject_t start,
+ virtual int collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t snap,
- vector<hobject_t> *ls, hobject_t *next) = 0;
+ vector<ghobject_t> *ls, ghobject_t *next) = 0;
/**
* list contents of a collection that fall in the range [start, end)
@@ -943,47 +947,57 @@ public:
* @param ls [out] result
* @return zero on success, or negative error
*/
- virtual int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls) = 0;
+ virtual int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls) = 0;
+
+ //TODO: Remove
+ int collection_list(coll_t c, vector<hobject_t>& o);
+
+ int collection_list_partial(coll_t c, hobject_t start,
+ int min, int max, snapid_t snap,
+ vector<hobject_t> *ls, hobject_t *next);
+
+ int collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ snapid_t seq, vector<hobject_t> *ls);
/// OMAP
/// Get omap contents
virtual int omap_get(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
bufferlist *header, ///< [out] omap header
map<string, bufferlist> *out ///< [out] Key to value map
) = 0;
/// Get omap header
virtual int omap_get_header(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
bufferlist *header, ///< [out] omap header
bool allow_eio = false ///< [in] don't assert on eio
) = 0;
- /// Get keys defined on hoid
+ /// Get keys defined on oid
virtual int omap_get_keys(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
- set<string> *keys ///< [out] Keys defined on hoid
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
) = 0;
/// Get key values
virtual int omap_get_values(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
const set<string> &keys, ///< [in] Keys to get
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
- /// Filters keys into out which are defined on hoid
+ /// Filters keys into out which are defined on oid
virtual int omap_check_keys(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
const set<string> &keys, ///< [in] Keys to check
- set<string> *out ///< [out] Subset of keys defined on hoid
+ set<string> *out ///< [out] Subset of keys defined on oid
) = 0;
/**
@@ -997,7 +1011,7 @@ public:
*/
virtual ObjectMap::ObjectMapIterator get_omap_iterator(
coll_t c, ///< [in] collection
- const hobject_t &hoid ///< [in] object
+ const ghobject_t &oid ///< [in] object
) = 0;
virtual void sync(Context *onsync) {}
@@ -1013,8 +1027,8 @@ public:
virtual uuid_d get_fsid() = 0;
// DEBUG
- virtual void inject_data_error(const hobject_t &oid) {}
- virtual void inject_mdata_error(const hobject_t &oid) {}
+ virtual void inject_data_error(const ghobject_t &oid) {}
+ virtual void inject_mdata_error(const ghobject_t &oid) {}
};
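Note how get_oid() now fills in ghobject_t::NO_GEN and NO_SHARD when decoding the old sobject_t encoding, so objects written before generations and shards existed decode into the new type uniformly. A small sketch of that decode-with-defaults idea, using stand-in types and hypothetical sentinel constants:

~~~~~~~~~~~~~~~~{.cpp}
#include <cassert>
#include <string>

struct GObj {                        // stand-in for ghobject_t
  std::string oid;
  long long pool;
  long long generation;
  int shard_id;
};
static const long long NO_GEN = -1;  // analogues of ghobject_t's sentinels
static const int NO_SHARD = -1;

GObj decode_legacy(const std::string &name, long long pool_override) {
  GObj o;
  o.oid = name;                      // old encodings carry only name/snap
  o.pool = -1;                       // legacy objects have no pool field
  o.generation = NO_GEN;             // predates per-object generations
  o.shard_id = NO_SHARD;             // not an erasure-coded shard
  if (pool_override != -1 && o.pool == -1)
    o.pool = pool_override;          // same fixup as get_oid() above
  return o;
}

int main() {
  GObj o = decode_legacy("rb.0.1234.000000000000", 3);
  assert(o.generation == NO_GEN && o.shard_id == NO_SHARD && o.pool == 3);
  return 0;
}
~~~~~~~~~~~~~~~~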
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 8479b3c878d..e02c17677bb 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -116,7 +116,7 @@ void WBThrottle::handle_conf_change(const md_config_t *conf,
}
bool WBThrottle::get_next_should_flush(
- boost::tuple<hobject_t, FDRef, PendingWB> *next)
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next)
{
assert(lock.is_locked());
assert(next);
@@ -128,9 +128,9 @@ bool WBThrottle::get_next_should_flush(
if (stopping)
return false;
assert(!pending_wbs.empty());
- hobject_t obj(pop_object());
+ ghobject_t obj(pop_object());
- map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.find(obj);
*next = boost::make_tuple(obj, i->second.second, i->second.first);
pending_wbs.erase(i);
@@ -141,7 +141,7 @@ bool WBThrottle::get_next_should_flush(
void *WBThrottle::entry()
{
Mutex::Locker l(lock);
- boost::tuple<hobject_t, FDRef, PendingWB> wb;
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
while (get_next_should_flush(&wb)) {
clearing = wb.get<0>();
lock.Unlock();
@@ -149,24 +149,24 @@ void *WBThrottle::entry()
if (wb.get<2>().nocache)
posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
lock.Lock();
- clearing = hobject_t();
+ clearing = ghobject_t();
cur_ios -= wb.get<2>().ios;
logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
cur_size -= wb.get<2>().size;
logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
logger->dec(l_wbthrottle_inodes_dirtied);
cond.Signal();
- wb = boost::tuple<hobject_t, FDRef, PendingWB>();
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
}
return 0;
}
void WBThrottle::queue_wb(
- FDRef fd, const hobject_t &hoid, uint64_t offset, uint64_t len,
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
bool nocache)
{
Mutex::Locker l(lock);
- map<hobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
pending_wbs.find(hoid);
if (wbiter == pending_wbs.end()) {
wbiter = pending_wbs.insert(
@@ -192,7 +192,7 @@ void WBThrottle::queue_wb(
void WBThrottle::clear()
{
Mutex::Locker l(lock);
- for (map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ for (map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.begin();
i != pending_wbs.end();
++i) {
@@ -208,12 +208,12 @@ void WBThrottle::clear()
cond.Signal();
}
-void WBThrottle::clear_object(const hobject_t &hoid)
+void WBThrottle::clear_object(const ghobject_t &hoid)
{
Mutex::Locker l(lock);
while (clearing == hoid)
cond.Wait(lock);
- map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.find(hoid);
if (i == pending_wbs.end())
return;
diff --git a/src/os/WBThrottle.h b/src/os/WBThrottle.h
index d480a6b751c..e418cf98d2a 100644
--- a/src/os/WBThrottle.h
+++ b/src/os/WBThrottle.h
@@ -44,7 +44,7 @@ enum {
* Tracks, throttles, and flushes outstanding IO
*/
class WBThrottle : Thread, public md_config_obs_t {
- hobject_t clearing;
+ ghobject_t clearing;
/* *_limits.first is the start_flusher limit and
* *_limits.second is the hard limit
@@ -89,36 +89,36 @@ class WBThrottle : Thread, public md_config_obs_t {
/**
* Flush objects in lru order
*/
- list<hobject_t> lru;
- map<hobject_t, list<hobject_t>::iterator> rev_lru;
- void remove_object(const hobject_t &hoid) {
+ list<ghobject_t> lru;
+ map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+ void remove_object(const ghobject_t &oid) {
assert(lock.is_locked());
- map<hobject_t, list<hobject_t>::iterator>::iterator iter =
- rev_lru.find(hoid);
+ map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
if (iter == rev_lru.end())
return;
lru.erase(iter->second);
rev_lru.erase(iter);
}
- hobject_t pop_object() {
+ ghobject_t pop_object() {
assert(!lru.empty());
- hobject_t hoid(lru.front());
+ ghobject_t oid(lru.front());
lru.pop_front();
- rev_lru.erase(hoid);
- return hoid;
+ rev_lru.erase(oid);
+ return oid;
}
- void insert_object(const hobject_t &hoid) {
- assert(rev_lru.find(hoid) == rev_lru.end());
- lru.push_back(hoid);
- rev_lru.insert(make_pair(hoid, --lru.end()));
+ void insert_object(const ghobject_t &oid) {
+ assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
}
- map<hobject_t, pair<PendingWB, FDRef> > pending_wbs;
+ map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
/// get next flush to perform
bool get_next_should_flush(
- boost::tuple<hobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
); ///< @return false if we are shutting down
public:
enum FS {
@@ -141,10 +141,10 @@ public:
set_from_conf();
}
- /// Queue wb on hoid, fd taking throttle (does not block)
+ /// Queue wb on oid, fd taking throttle (does not block)
void queue_wb(
- FDRef fd, ///< [in] FDRef to hoid
- const hobject_t &hoid, ///< [in] object
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
uint64_t offset, ///< [in] offset written
uint64_t len, ///< [in] length written
bool nocache ///< [in] try to clear out of cache after write
@@ -154,7 +154,7 @@ public:
void clear();
/// Clear object
- void clear_object(const hobject_t &hoid);
+ void clear_object(const ghobject_t &oid);
/// Block until there is throttle available
void throttle();
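The lru / rev_lru pair above is the standard list-plus-reverse-index structure: the list preserves writeback order while the map of list iterators lets clear_object() drop an arbitrary entry without scanning. A self-contained sketch of the same structure, keyed by string instead of ghobject_t:

~~~~~~~~~~~~~~~~{.cpp}
#include <cassert>
#include <list>
#include <map>
#include <string>
#include <utility>

struct LruIndex {
  std::list<std::string> lru;                      // oldest entry first
  std::map<std::string, std::list<std::string>::iterator> rev_lru;

  void insert_object(const std::string &oid) {
    assert(rev_lru.find(oid) == rev_lru.end());
    lru.push_back(oid);
    rev_lru.insert(std::make_pair(oid, --lru.end()));  // list iterators stay valid
  }
  void remove_object(const std::string &oid) {
    auto iter = rev_lru.find(oid);
    if (iter == rev_lru.end())
      return;
    lru.erase(iter->second);                       // O(1) unlink, no scan
    rev_lru.erase(iter);
  }
  std::string pop_object() {
    assert(!lru.empty());
    std::string oid(lru.front());
    lru.pop_front();
    rev_lru.erase(oid);
    return oid;
  }
};

int main() {
  LruIndex idx;
  idx.insert_object("a");
  idx.insert_object("b");
  idx.insert_object("c");
  idx.remove_object("b");              // cleared out of order
  assert(idx.pop_object() == "a");     // flush order preserved for the rest
  assert(idx.pop_object() == "c");
  return 0;
}
~~~~~~~~~~~~~~~~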
diff --git a/src/osd/ErasureCodeInterface.h b/src/osd/ErasureCodeInterface.h
index 5ce2842d562..656ee91987e 100644
--- a/src/osd/ErasureCodeInterface.h
+++ b/src/osd/ErasureCodeInterface.h
@@ -25,15 +25,15 @@
are systematic (i.e. the data is not mangled and can be
reconstructed by concatenating chunks ).
- All methods returns **0** on success and a negative value on
+ All methods return **0** on success and a negative value on
error. If the value returned on error is not explained in
**ErasureCodeInterface**, the sources or the documentation of the
- interface implementer must be read to figure out what it means. It
- is recommended that each error code matches an *errno* value that
- relates to the cause of the error.
+ interface implementer (i.e. the plugin ) must be read to figure
+ out what it means. It is recommended that each error code matches
+ an *errno* value that relates to the cause of the error.
Assuming the interface implementer provides three data chunks ( K
- = 3 ) and two coding chunks ( M = 2 ), a buffer can be encoded as
+ = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
follows:
~~~~~~~~~~~~~~~~{.c}
@@ -50,16 +50,20 @@
encoded[4] // coding chunk 1
~~~~~~~~~~~~~~~~
- If encoded[2] ( which contains **EF** ) is missing and accessing
- encoded[3] ( the first coding chunk ) is more expensive than
- accessing encoded[4] ( the second coding chunk ), the
- **minimum_to_decode_with_cost** method can be called as follows:
+ The **minimum_to_decode_with_cost** method can be used to minimize
+ the cost of fetching the chunks necessary to retrieve a given
+ content. For instance, if encoded[2] (contained **EF**) is missing
+ and accessing encoded[3] (the first coding chunk) is more
+ expensive than accessing encoded[4] (the second coding chunk),
+ **minimum_to_decode_with_cost** is expected to choose the second
+ coding chunk.
~~~~~~~~~~~~~~~~{.c}
set<int> want_to_read(2); // want the chunk containing "EF"
map<int,int> available(
0 => 1, // data chunk 0 : available and costs 1
1 => 1, // data chunk 1 : available and costs 1
+ // data chunk 2 : missing
3 => 9, // coding chunk 1 : available and costs 9
4 => 1, // coding chunk 2 : available and costs 1
);
@@ -67,14 +71,14 @@
minimum_to_decode_with_cost(want_to_read,
available,
&minimum);
- minimum == set<int>(0, 1, 4);
+ minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
~~~~~~~~~~~~~~~~
It sets **minimum** with three chunks to reconstruct the desired
data chunk and will pick the second coding chunk ( 4 ) because it
is less expensive ( 1 < 9 ) to retrieve than the first coding
chunk ( 3 ). The caller is responsible for retrieving the chunks
- and call **decode** to reconstruct the second data chunk content.
+ and calling **decode** to reconstruct the second data chunk.
~~~~~~~~~~~~~~~~{.c}
map<int,bufferlist> chunks;
@@ -85,6 +89,10 @@
decoded[2] == "EF"
~~~~~~~~~~~~~~~~
+ The semantics of the cost value are defined by the caller and must
+ be known to the implementer. For instance, it may be more
+ expensive to retrieve two chunks with cost 1 + 9 = 10 than two
+ chunks with cost 6 + 6 = 12.
*/
#include <map>
@@ -113,7 +121,7 @@ namespace ceph {
*
* @param [in] want_to_read chunk indexes to be decoded
* @param [in] available chunk indexes containing valid data
- * @param [out] minimum chunk indexes to retrieve for decode
+ * @param [out] minimum chunk indexes to retrieve
* @return **0** on success or a negative errno on error.
*/
virtual int minimum_to_decode(const set<int> &want_to_read,
@@ -124,8 +132,8 @@ namespace ceph {
* Compute the smallest subset of **available** chunks that needs
* to be retrieved in order to successfully decode
* **want_to_read** chunks. If there are more than one possible
- * subset, select the subset that contains the chunks with the
- * lowest cost.
+ * subset, select the subset that minimizes the overall retrieval
+ * cost.
*
* The **available** parameter maps chunk indexes to their
* retrieval cost. The higher the cost value, the more costly it
@@ -141,7 +149,7 @@ namespace ceph {
* @param [in] want_to_read chunk indexes to be decoded
* @param [in] available map chunk indexes containing valid data
* to their retrieval cost
- * @param [out] minimum chunk indexes to retrieve for decode
+ * @param [out] minimum chunk indexes to retrieve
* @return **0** on success or a negative errno on error.
*/
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
@@ -150,15 +158,31 @@ namespace ceph {
/**
* Encode the content of **in** and store the result in
- * **encoded**. The **encoded** map contains at least all
- * chunk indexes found in the **want_to_encode** set.
+ * **encoded**. All buffers pointed to by **encoded** have the
+ * same size. The **encoded** map contains at least all chunk
+ * indexes found in the **want_to_encode** set.
*
* The **encoded** map is expected to be a pointer to an empty
* map.
*
+ * Assuming the **in** parameter is **length** bytes long,
+ * the concatenation of the first **length** bytes of the
+ * **encoded** buffers is equal to the content of the **in**
+ * parameter.
+ *
* The **encoded** map may contain more chunks than required by
* **want_to_encode** and the caller is expected to permanently
- * store all of them, not just the chunks from **want_to_encode**.
+ * store all of them, not just the chunks listed in
+ * **want_to_encode**.
+ *
+ * The **encoded** map may contain pointers to data stored in
+ * the **in** parameter. If the caller modifies the content of
+ * **in** after calling the encode method, it may have a side
+ * effect on the content of **encoded**.
+ *
+ * The **encoded** map may contain pointers to buffers allocated
+ * by the encode method. They will be freed when **encoded** is
+ * freed. The allocation method is not specified.
*
* Returns 0 on success.
*
@@ -172,24 +196,30 @@ namespace ceph {
map<int, bufferlist> *encoded) = 0;
/**
- * Decode the **chunks** and store at least **want_to_read** chunks
- * in **decoded**.
+ * Decode the **chunks** and store at least **want_to_read**
+ * chunks in **decoded**.
+ *
+ * The **decoded** map must be a pointer to an empty map.
*
* There must be enough **chunks** ( as returned by
* **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
- * perform a successfull decoding of all chunks found in
+ * perform a successful decoding of all chunks listed in
* **want_to_read**.
*
- * The **decoded** map is expected to be a pointer to an empty
- * map.
+ * All buffers pointed to by **chunks** must have the same size.
+ *
+ * On success, the **decoded** map may contain more chunks than
+ * required by **want_to_read** and they can safely be used by the
+ * caller.
*
- * The **decoded** map may contain more chunks than required by
- * **want_to_read** and they can safely be used by the caller.
+ * If a chunk is listed in **want_to_read** and there is a
+ * corresponding **bufferlist** in **chunks**, it will be
+ * referenced in **decoded**. If not it will be reconstructed from
+ * the existing chunks.
*
- * If a chunk is listed in **want_to_read** and there is
- * corresponding **bufferlist** in **chunks**, it will be copied
- * verbatim into **decoded**. If not it will be reconstructed from
- * the existing chunks.
+ * Because **decoded** may contain pointers to data found in
+ * **chunks**, modifying the content of **chunks** after calling
+ * decode may have a side effect on the content of **decoded**.
*
* Returns 0 on success.
*
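A simplified reading of the minimum_to_decode_with_cost contract documented above, assuming any k chunks suffice to decode (as with the systematic codes described): take the wanted chunks when they are all present, otherwise the k cheapest available ones. This sketches the documented semantics, not any plugin's actual implementation:

~~~~~~~~~~~~~~~~{.cpp}
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

int minimum_to_decode_with_cost(int k,
                                const std::set<int> &want_to_read,
                                const std::map<int, int> &available, // index -> cost
                                std::set<int> *minimum) {
  bool have_all = true;
  for (int w : want_to_read)
    if (available.find(w) == available.end())
      have_all = false;
  if (have_all) {                        // nothing missing: read only what is wanted
    *minimum = want_to_read;
    return 0;
  }
  if ((int)available.size() < k)
    return -5;                           // stands in for -EIO: cannot decode
  std::vector<std::pair<int, int>> by_cost;  // (cost, chunk index)
  for (auto &a : available)
    by_cost.push_back(std::make_pair(a.second, a.first));
  std::sort(by_cost.begin(), by_cost.end());
  for (int i = 0; i < k; ++i)
    minimum->insert(by_cost[i].second);  // k cheapest chunks suffice
  return 0;
}

int main() {
  // the example from the comment block: chunk 2 missing, chunk 3 costs 9
  std::set<int> want = {2}, minimum;
  std::map<int, int> avail = {{0, 1}, {1, 1}, {3, 9}, {4, 1}};
  minimum_to_decode_with_cost(3, want, avail, &minimum);
  assert((minimum == std::set<int>{0, 1, 4}));  // skips the expensive chunk 3
  return 0;
}
~~~~~~~~~~~~~~~~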
diff --git a/src/osd/ErasureCodePlugin.cc b/src/osd/ErasureCodePlugin.cc
index 10b65b2604b..d8b9ae0fbbd 100644
--- a/src/osd/ErasureCodePlugin.cc
+++ b/src/osd/ErasureCodePlugin.cc
@@ -36,7 +36,8 @@ static ostream& _prefix(std::ostream* _dout)
ErasureCodePluginRegistry ErasureCodePluginRegistry::singleton;
ErasureCodePluginRegistry::ErasureCodePluginRegistry() :
- lock("ErasureCodePluginRegistry::lock")
+ lock("ErasureCodePluginRegistry::lock"),
+ loading(false)
{
}
@@ -76,7 +77,9 @@ int ErasureCodePluginRegistry::factory(const std::string &plugin_name,
int r = 0;
ErasureCodePlugin *plugin = get(plugin_name);
if (plugin == 0) {
+ loading = true;
r = load(plugin_name, parameters, &plugin);
+ loading = false;
if (r != 0)
return r;
}
diff --git a/src/osd/ErasureCodePlugin.h b/src/osd/ErasureCodePlugin.h
index f1c1ccb31b3..a2feb71695a 100644
--- a/src/osd/ErasureCodePlugin.h
+++ b/src/osd/ErasureCodePlugin.h
@@ -41,6 +41,7 @@ namespace ceph {
class ErasureCodePluginRegistry {
public:
Mutex lock;
+ bool loading;
std::map<std::string,ErasureCodePlugin*> plugins;
static ErasureCodePluginRegistry singleton;
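The new loading flag accompanies the existing lock so that code running while a plugin is being loaded (for example, initialization code in the dynamically loaded library) can detect the in-flight load instead of recursing into factory(). A minimal sketch of the pattern, with std::mutex standing in for Ceph's Mutex and the dlopen machinery stubbed out:

~~~~~~~~~~~~~~~~{.cpp}
#include <map>
#include <mutex>
#include <string>

struct Plugin { /* entry points would live here */ };

class PluginRegistry {
  std::mutex lock;
  bool loading = false;
  std::map<std::string, Plugin*> plugins;

  Plugin *load(const std::string &name) {
    // dlopen + symbol lookup in the real code; stubbed out here
    Plugin *p = new Plugin;
    plugins[name] = p;
    return p;
  }

public:
  Plugin *factory(const std::string &name) {
    std::lock_guard<std::mutex> l(lock);
    auto i = plugins.find(name);
    if (i != plugins.end())
      return i->second;                // already loaded: no load needed
    loading = true;                    // visible to code run during load()
    Plugin *p = load(name);
    loading = false;
    return p;
  }
};
~~~~~~~~~~~~~~~~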
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
index 25821274081..f2be1ed06e7 100644
--- a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
@@ -15,6 +15,7 @@
*/
#include <errno.h>
+#include <algorithm>
#include "common/debug.h"
#include "ErasureCodeJerasure.h"
extern "C" {
@@ -34,7 +35,8 @@ static ostream& _prefix(std::ostream* _dout)
return *_dout << "ErasureCodeJerasure: ";
}
-void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters)
+{
dout(10) << "technique=" << technique << dendl;
parse(parameters);
prepare();
@@ -42,19 +44,26 @@ void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters) {
int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
- set<int> *minimum) {
- if (available_chunks.size() < (unsigned)k)
- return -EIO;
- set<int>::iterator i;
- unsigned j;
- for (i = available_chunks.begin(), j = 0; j < (unsigned)k; i++, j++)
- minimum->insert(*i);
+ set<int> *minimum)
+{
+ if (includes(available_chunks.begin(), available_chunks.end(),
+ want_to_read.begin(), want_to_read.end())) {
+ *minimum = want_to_read;
+ } else {
+ if (available_chunks.size() < (unsigned)k)
+ return -EIO;
+ set<int>::iterator i;
+ unsigned j;
+ for (i = available_chunks.begin(), j = 0; j < (unsigned)k; i++, j++)
+ minimum->insert(*i);
+ }
return 0;
}
int ErasureCodeJerasure::minimum_to_decode_with_cost(const set<int> &want_to_read,
const map<int, int> &available,
- set<int> *minimum) {
+ set<int> *minimum)
+{
set <int> available_chunks;
for (map<int, int>::const_iterator i = available.begin();
i != available.end();
@@ -65,39 +74,38 @@ int ErasureCodeJerasure::minimum_to_decode_with_cost(const set<int> &want_to_rea
int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
const bufferlist &in,
- map<int, bufferlist> *encoded) {
- unsigned in_length = pad_in_length(in.length());
- dout(10) << "encode adjusted buffer length from " << in.length() << " to " << in_length << dendl;
- assert(in_length % k == 0);
- unsigned blocksize = in_length / k;
+ map<int, bufferlist> *encoded)
+{
+ unsigned alignment = get_alignment();
+ unsigned tail = in.length() % alignment;
+ unsigned padded_length = in.length() + ( tail ? ( alignment - tail ) : 0 );
+ dout(10) << "encode adjusted buffer length from " << in.length()
+ << " to " << padded_length << dendl;
+ assert(padded_length % k == 0);
+ unsigned blocksize = padded_length / k;
unsigned length = blocksize * ( k + m );
bufferlist out(in);
bufferptr pad(length - in.length());
- pad.zero(0, k);
+ pad.zero(0, padded_length - in.length());
out.push_back(pad);
- char *p = out.c_str();
- char *data[k];
- for (int i = 0; i < k; i++) {
- data[i] = p + i * blocksize;
+ char *chunks[k + m];
+ for (int i = 0; i < k + m; i++) {
+ bufferlist &chunk = (*encoded)[i];
+ chunk.substr_of(out, i * blocksize, blocksize);
+ chunks[i] = chunk.c_str();
}
- char *coding[m];
- for (int i = 0; i < m; i++) {
- coding[i] = p + ( k + i ) * blocksize;
- }
- jerasure_encode(data, coding, blocksize);
- const bufferptr ptr = out.buffers().front();
- for (set<int>::iterator j = want_to_encode.begin();
- j != want_to_encode.end();
- j++) {
- bufferptr chunk(ptr, (*j) * blocksize, blocksize);
- (*encoded)[*j].push_front(chunk);
+ jerasure_encode(&chunks[0], &chunks[k], blocksize);
+ for (int i = 0; i < k + m; i++) {
+ if (want_to_encode.count(i) == 0)
+ encoded->erase(i);
}
return 0;
}
int ErasureCodeJerasure::decode(const set<int> &want_to_read,
const map<int, bufferlist> &chunks,
- map<int, bufferlist> *decoded) {
+ map<int, bufferlist> *decoded)
+{
unsigned blocksize = (*chunks.begin()).second.length();
int erasures[k + m + 1];
int erasures_count = 0;
@@ -127,7 +135,8 @@ int ErasureCodeJerasure::decode(const set<int> &want_to_read,
int ErasureCodeJerasure::to_int(const std::string &name,
const map<std::string,std::string> &parameters,
- int default_value) {
+ int default_value)
+{
if (parameters.find(name) == parameters.end() ||
parameters.find(name)->second.size() == 0) {
dout(10) << name << " defaults to " << default_value << dendl;
@@ -147,7 +156,8 @@ int ErasureCodeJerasure::to_int(const std::string &name,
return r;
}
-bool ErasureCodeJerasure::is_prime(int value) {
+bool ErasureCodeJerasure::is_prime(int value)
+{
int prime55[] = {
2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,
73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,
@@ -166,34 +176,39 @@ bool ErasureCodeJerasure::is_prime(int value) {
//
void ErasureCodeJerasureReedSolomonVandermonde::jerasure_encode(char **data,
char **coding,
- int blocksize) {
+ int blocksize)
+{
jerasure_matrix_encode(k, m, w, matrix, data, coding, blocksize);
}
int ErasureCodeJerasureReedSolomonVandermonde::jerasure_decode(int *erasures,
char **data,
char **coding,
- int blocksize) {
- return jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data, coding, blocksize);
+ int blocksize)
+{
+ return jerasure_matrix_decode(k, m, w, matrix, 1,
+ erasures, data, coding, blocksize);
}
-unsigned ErasureCodeJerasureReedSolomonVandermonde::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment()
+{
+ return k*w*sizeof(int);
}
-void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = to_int("erasure-code-m", parameters, DEFAULT_M);
w = to_int("erasure-code-w", parameters, DEFAULT_W);
if (w != 8 && w != 16 && w != 32) {
- derr << "ReedSolomonVandermonde: w=" << w << " must be one of {8, 16, 32} : revert to 8 " << dendl;
+ derr << "ReedSolomonVandermonde: w=" << w
+ << " must be one of {8, 16, 32} : revert to 8 " << dendl;
w = 8;
}
}
-void ErasureCodeJerasureReedSolomonVandermonde::prepare() {
+void ErasureCodeJerasureReedSolomonVandermonde::prepare()
+{
matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
}
@@ -202,34 +217,38 @@ void ErasureCodeJerasureReedSolomonVandermonde::prepare() {
//
void ErasureCodeJerasureReedSolomonRAID6::jerasure_encode(char **data,
char **coding,
- int blocksize) {
+ int blocksize)
+{
reed_sol_r6_encode(k, w, data, coding, blocksize);
}
int ErasureCodeJerasureReedSolomonRAID6::jerasure_decode(int *erasures,
- char **data,
- char **coding,
- int blocksize) {
+ char **data,
+ char **coding,
+ int blocksize)
+{
return jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data, coding, blocksize);
}
-unsigned ErasureCodeJerasureReedSolomonRAID6::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment()
+{
+ return k*w*sizeof(int);
}
-void ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = 2;
w = to_int("erasure-code-w", parameters, DEFAULT_W);
if (w != 8 && w != 16 && w != 32) {
- derr << "ReedSolomonRAID6: w=" << w << " must be one of {8, 16, 32} : revert to 8 " << dendl;
+ derr << "ReedSolomonRAID6: w=" << w
+ << " must be one of {8, 16, 32} : revert to 8 " << dendl;
w = 8;
}
}
-void ErasureCodeJerasureReedSolomonRAID6::prepare() {
+void ErasureCodeJerasureReedSolomonRAID6::prepare()
+{
matrix = reed_sol_r6_coding_matrix(k, w);
}
@@ -237,32 +256,37 @@ void ErasureCodeJerasureReedSolomonRAID6::prepare() {
// ErasureCodeJerasureCauchy
//
void ErasureCodeJerasureCauchy::jerasure_encode(char **data,
- char **coding,
- int blocksize) {
- jerasure_schedule_encode(k, m, w, schedule, data, coding, blocksize, packetsize);
+ char **coding,
+ int blocksize)
+{
+ jerasure_schedule_encode(k, m, w, schedule,
+ data, coding, blocksize, packetsize);
}
int ErasureCodeJerasureCauchy::jerasure_decode(int *erasures,
- char **data,
- char **coding,
- int blocksize) {
- return jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures, data, coding, blocksize, packetsize, 1);
+ char **data,
+ char **coding,
+ int blocksize)
+{
+ return jerasure_schedule_decode_lazy(k, m, w, bitmatrix,
+ erasures, data, coding, blocksize, packetsize, 1);
}
-unsigned ErasureCodeJerasureCauchy::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*packetsize*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureCauchy::get_alignment()
+{
+ return k*w*packetsize*sizeof(int);
}
-void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = to_int("erasure-code-m", parameters, DEFAULT_M);
w = to_int("erasure-code-w", parameters, DEFAULT_W);
packetsize = to_int("erasure-code-packetsize", parameters, DEFAULT_PACKETSIZE);
}
-void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix) {
+void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix)
+{
bitmatrix = jerasure_matrix_to_bitmatrix(k, m, w, matrix);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
@@ -270,7 +294,8 @@ void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix) {
//
// ErasureCodeJerasureCauchyOrig
//
-void ErasureCodeJerasureCauchyOrig::prepare() {
+void ErasureCodeJerasureCauchyOrig::prepare()
+{
int *matrix = cauchy_original_coding_matrix(k, m, w);
prepare_schedule(matrix);
free(matrix);
@@ -279,7 +304,8 @@ void ErasureCodeJerasureCauchyOrig::prepare() {
//
// ErasureCodeJerasureCauchyGood
//
-void ErasureCodeJerasureCauchyGood::prepare() {
+void ErasureCodeJerasureCauchyGood::prepare()
+{
int *matrix = cauchy_good_general_coding_matrix(k, m, w);
prepare_schedule(matrix);
free(matrix);
@@ -288,7 +314,8 @@ void ErasureCodeJerasureCauchyGood::prepare() {
//
// ErasureCodeJerasureLiberation
//
-ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation() {
+ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation()
+{
if (bitmatrix)
free(bitmatrix);
if (schedule)
@@ -297,24 +324,28 @@ ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation() {
void ErasureCodeJerasureLiberation::jerasure_encode(char **data,
char **coding,
- int blocksize) {
- jerasure_schedule_encode(k, m, w, schedule, data, coding, blocksize, packetsize);
+ int blocksize)
+{
+ jerasure_schedule_encode(k, m, w, schedule, data,
+ coding, blocksize, packetsize);
}
int ErasureCodeJerasureLiberation::jerasure_decode(int *erasures,
char **data,
char **coding,
- int blocksize) {
- return jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures, data, coding, blocksize, packetsize, 1);
+ int blocksize)
+{
+ return jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures, data,
+ coding, blocksize, packetsize, 1);
}
-unsigned ErasureCodeJerasureLiberation::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*packetsize*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureLiberation::get_alignment()
+{
+ return k*w*packetsize*sizeof(int);
}
-void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = to_int("erasure-code-m", parameters, DEFAULT_M);
w = to_int("erasure-code-w", parameters, DEFAULT_W);
@@ -334,18 +365,21 @@ void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &pa
error = true;
}
if ((packetsize%(sizeof(int))) != 0) {
- derr << "packetsize=" << packetsize << " must be a multiple of sizeof(int) = " << sizeof(int) << dendl;
+ derr << "packetsize=" << packetsize
+ << " must be a multiple of sizeof(int) = " << sizeof(int) << dendl;
error = true;
}
if (error) {
- derr << "reverting to k=" << DEFAULT_K << ", w=" << DEFAULT_W << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
+ derr << "reverting to k=" << DEFAULT_K << ", w="
+ << DEFAULT_W << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
k = DEFAULT_K;
w = DEFAULT_W;
packetsize = DEFAULT_PACKETSIZE;
}
}
-void ErasureCodeJerasureLiberation::prepare() {
+void ErasureCodeJerasureLiberation::prepare()
+{
bitmatrix = liberation_coding_bitmatrix(k, w);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
@@ -353,7 +387,8 @@ void ErasureCodeJerasureLiberation::prepare() {
//
// ErasureCodeJerasureBlaumRoth
//
-void ErasureCodeJerasureBlaumRoth::prepare() {
+void ErasureCodeJerasureBlaumRoth::prepare()
+{
bitmatrix = blaum_roth_coding_bitmatrix(k, w);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
@@ -361,7 +396,8 @@ void ErasureCodeJerasureBlaumRoth::prepare() {
//
// ErasureCodeJerasureLiber8tion
//
-void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = DEFAULT_M;
w = DEFAULT_W;
@@ -377,13 +413,15 @@ void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &pa
error = true;
}
if (error) {
- derr << "reverting to k=" << DEFAULT_K << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
+ derr << "reverting to k=" << DEFAULT_K << ", packetsize="
+ << DEFAULT_PACKETSIZE << dendl;
k = DEFAULT_K;
packetsize = DEFAULT_PACKETSIZE;
}
}
-void ErasureCodeJerasureLiber8tion::prepare() {
+void ErasureCodeJerasureLiber8tion::prepare()
+{
bitmatrix = liber8tion_coding_bitmatrix(k);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
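The recurring change in this file replaces the pad_in_length() increment loops with get_alignment() plus a single round-up computed once in encode(). A sketch of that arithmetic, using the Vandermonde alignment k*w*sizeof(int) as an example:

~~~~~~~~~~~~~~~~{.cpp}
#include <cassert>
#include <cstddef>

unsigned round_up(unsigned length, unsigned alignment) {
  unsigned tail = length % alignment;
  return length + (tail ? (alignment - tail) : 0);  // one step, no loop
}

int main() {
  const int k = 3, w = 8;
  unsigned alignment = k * w * sizeof(int);  // 96 bytes for k=3, w=8
  assert(round_up(100, alignment) == 192);   // 100 padded up to 2 * 96
  assert(round_up(96, alignment) == 96);     // already aligned: unchanged
  return 0;
}
~~~~~~~~~~~~~~~~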
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
index 7728751c383..fc76ed7b1e2 100644
--- a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
@@ -56,7 +56,7 @@ public:
char **data,
char **coding,
int blocksize) = 0;
- virtual unsigned pad_in_length(unsigned in_length) = 0;
+ virtual unsigned get_alignment() = 0;
virtual void parse(const map<std::string,std::string> &parameters) = 0;
virtual void prepare() = 0;
static int to_int(const std::string &name,
@@ -88,7 +88,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
@@ -115,7 +115,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
@@ -149,7 +149,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
void prepare_schedule(int *matrix);
};
@@ -196,7 +196,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am
index d6d0e363dbb..cae02015fce 100644
--- a/src/osd/Makefile.am
+++ b/src/osd/Makefile.am
@@ -9,6 +9,7 @@ libosd_la_SOURCES = \
osd/PG.cc \
osd/PGLog.cc \
osd/ReplicatedPG.cc \
+ osd/ReplicatedBackend.cc \
osd/Ager.cc \
osd/OSD.cc \
osd/OSDCap.cc \
@@ -36,6 +37,8 @@ noinst_HEADERS += \
osd/PG.h \
osd/PGLog.h \
osd/ReplicatedPG.h \
+ osd/PGBackend.h \
+ osd/ReplicatedBackend.h \
osd/Watch.h \
osd/osd_types.h
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 1ba35ec2ef5..b2aa2ebbcd2 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -134,7 +134,9 @@ static ostream& _prefix(std::ostream* _dout, int whoami, OSDMapRef osdmap) {
<< " ";
}
-static CompatSet get_osd_compat_set() {
+//Initial features in new superblock.
+//Features here are also automatically upgraded
+CompatSet OSD::get_osd_initial_compat_set() {
CompatSet::FeatureSet ceph_osd_feature_compat;
CompatSet::FeatureSet ceph_osd_feature_ro_compat;
CompatSet::FeatureSet ceph_osd_feature_incompat;
@@ -152,6 +154,14 @@ static CompatSet get_osd_compat_set() {
ceph_osd_feature_incompat);
}
+// Features that this OSD supports are added here.
+CompatSet OSD::get_osd_compat_set() {
+ CompatSet compat = get_osd_initial_compat_set();
+  // Any features here can be set in code, but not in the initial superblock
+ compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
OSDService::OSDService(OSD *osd) :
osd(osd),
cct(osd->cct),
@@ -170,6 +180,7 @@ OSDService::OSDService(OSD *osd) :
scrub_wq(osd->scrub_wq),
scrub_finalize_wq(osd->scrub_finalize_wq),
rep_scrub_wq(osd->rep_scrub_wq),
+ push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
class_handler(osd->class_handler),
publish_lock("OSDService::publish_lock"),
pre_publish_lock("OSDService::pre_publish_lock"),
@@ -423,6 +434,7 @@ void OSDService::init()
objecter_timer.init();
objecter->set_client_incarnation(0);
objecter->init_locked();
+ objecter->unset_honor_cache_redirects();
}
watch_timer.init();
}
@@ -449,7 +461,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
{
coll_t tmp0("convertfs_temp");
coll_t tmp1("convertfs_temp1");
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
map<string, bufferptr> aset;
int r = store->collection_getattrs(cid, aset);
@@ -469,10 +481,10 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
store->apply_transaction(t);
}
- hobject_t next;
+ ghobject_t next;
while (!next.is_max()) {
objects.clear();
- hobject_t start = next;
+ ghobject_t start = next;
r = store->collection_list_partial(cid, start,
200, 300, 0,
&objects, &next);
@@ -480,7 +492,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
return r;
ObjectStore::Transaction t;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
t.collection_add(tmp0, cid, *i);
@@ -646,7 +658,7 @@ int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev,
sb.cluster_fsid = fsid;
sb.osd_fsid = store->get_fsid();
sb.whoami = whoami;
- sb.compat_features = get_osd_compat_set();
+ sb.compat_features = get_osd_initial_compat_set();
// benchmark?
if (cct->_conf->osd_auto_weight) {
@@ -701,7 +713,7 @@ int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev,
goto umount_store;
}
- ret = write_meta(dev, "ready", "ready\n", 6);
+ ret = safe_write_file(dev.c_str(), "ready", "ready\n", 6);
if (ret) {
derr << "OSD::mkfs: failed to write ready file: error " << ret << dendl;
goto umount_store;
@@ -757,103 +769,19 @@ int OSD::dump_journal(CephContext *cct, const std::string &dev, const std::strin
return err;
}
-int OSD::write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen)
-{
- int ret;
- char fn[PATH_MAX];
- char tmp[PATH_MAX];
- int fd;
-
- // does the file already have correct content?
- char oldval[80];
- ret = read_meta(base, file, oldval, sizeof(oldval));
- if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
- return 0; // yes.
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base.c_str(), file.c_str());
- fd = ::open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: error opening '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ret = safe_write(fd, val, vallen);
- if (ret) {
- derr << "write_meta: failed to write to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
- }
-
- ret = ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to fsync to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = ::rename(tmp, fn);
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to rename '" << tmp << "' to '" << fn << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- fd = ::open(base.c_str(), O_RDONLY);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: failed to open dir '" << base << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
-
- return 0;
-}
-
-int OSD::read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen)
-{
- char fn[PATH_MAX];
- int fd, len;
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- return -err;
- }
- len = safe_read(fd, val, vallen);
- if (len < 0) {
- TEMP_FAILURE_RETRY(::close(fd));
- return len;
- }
- // close sometimes returns errors, but only after write()
- TEMP_FAILURE_RETRY(::close(fd));
-
- val[len] = 0;
- return len;
-}
-
int OSD::write_meta(const std::string &base, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
char val[80];
snprintf(val, sizeof(val), "%s\n", CEPH_OSD_ONDISK_MAGIC);
- write_meta(base, "magic", val, strlen(val));
+ safe_write_file(base.c_str(), "magic", val, strlen(val));
snprintf(val, sizeof(val), "%d\n", whoami);
- write_meta(base, "whoami", val, strlen(val));
+ safe_write_file(base.c_str(), "whoami", val, strlen(val));
cluster_fsid.print(val);
strcat(val, "\n");
- write_meta(base, "ceph_fsid", val, strlen(val));
+ safe_write_file(base.c_str(), "ceph_fsid", val, strlen(val));
return 0;
}
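The deleted write_meta()/read_meta() pair is what this merge lifts into common/safe_io.c as safe_write_file()/safe_read_file() (see src/common/safe_io.c in the file list). The sequence worth preserving is the crash-safe one the old code implemented: write a temp file, fsync it, rename over the target, then fsync the directory. A trimmed sketch of that sequence, assuming POSIX semantics and eliding the partial-write retry loop the real safe_write() provides:

    #include <cerrno>
    #include <climits>
    #include <cstdio>
    #include <fcntl.h>
    #include <unistd.h>

    int atomic_write_sketch(const char *base, const char *file,
                            const char *val, size_t vallen)
    {
      char fn[PATH_MAX], tmp[PATH_MAX];
      snprintf(fn, sizeof(fn), "%s/%s", base, file);
      snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);

      int fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
      if (fd < 0)
        return -errno;
      if (write(fd, val, vallen) != (ssize_t)vallen ||  // write the payload
          fsync(fd) < 0) {                              // flush before rename,
        close(fd);                                      // or rename could
        unlink(tmp);                                    // publish a torn file
        return -EIO;
      }
      close(fd);

      if (rename(tmp, fn) < 0) {        // atomically replace old contents
        unlink(tmp);
        return -errno;
      }
      int dfd = open(base, O_RDONLY);   // persist the directory entry itself
      if (dfd >= 0) {
        fsync(dfd);
        close(dfd);
      }
      return 0;
    }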
@@ -863,24 +791,24 @@ int OSD::peek_meta(const std::string &dev, std::string& magic,
{
char val[80] = { 0 };
- if (read_meta(dev, "magic", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "magic", val, sizeof(val)) < 0)
return -errno;
int l = strlen(val);
if (l && val[l-1] == '\n')
val[l-1] = 0;
magic = val;
- if (read_meta(dev, "whoami", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "whoami", val, sizeof(val)) < 0)
return -errno;
whoami = atoi(val);
- if (read_meta(dev, "ceph_fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "ceph_fsid", val, sizeof(val)) < 0)
return -errno;
if (strlen(val) > 36)
val[36] = 0;
cluster_fsid.parse(val);
- if (read_meta(dev, "fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "fsid", val, sizeof(val)) < 0)
osd_fsid = uuid_d();
else {
if (strlen(val) > 36)
@@ -1143,6 +1071,7 @@ public:
int OSD::init()
{
+ CompatSet initial, diff;
Mutex::Locker lock(osd_lock);
if (is_stopping())
return 0;
@@ -1167,9 +1096,48 @@ int OSD::init()
r = read_superblock();
if (r < 0) {
derr << "OSD::init() : unable to read osd superblock" << dendl;
- store->umount();
- delete store;
- return -EINVAL;
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (osd_compat.compare(superblock.compat_features) < 0) {
+ derr << "The disk uses features unsupported by the executable." << dendl;
+ derr << " ondisk features " << superblock.compat_features << dendl;
+ derr << " daemon features " << osd_compat << dendl;
+
+ if (osd_compat.writeable(superblock.compat_features)) {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "it is still writeable, though. Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ else {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "Cannot write to disk! Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ assert_warn(whoami == superblock.whoami);
+ if (whoami != superblock.whoami) {
+    derr << "OSD::init: superblock says osd."
+	 << superblock.whoami << " but i am osd." << whoami << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ initial = get_osd_initial_compat_set();
+ diff = superblock.compat_features.unsupported(initial);
+ if (superblock.compat_features.merge(initial)) {
+ // We need to persist the new compat_set before we
+ // do anything else
+ dout(5) << "Upgrading superblock adding: " << diff << dendl;
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ r = store->apply_transaction(t);
+ if (r < 0)
+ goto out;
}
// make sure info object exists
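This hunk moves the compat check ahead of any writes and replaces the old blanket overwrite of compat_features (removed from a later hunk in this file) with a feature-by-feature merge against the initial set. The decision tree, condensed to the CompatSet calls as they appear above — write_superblock_now() is a stand-in for the write_superblock()/apply_transaction() pair:

    if (osd_compat.compare(superblock.compat_features) < 0)
      return -EOPNOTSUPP;   // disk carries features this daemon lacks
    if (superblock.compat_features.merge(get_osd_initial_compat_set()))
      write_superblock_now();  // stand-in: persist the upgrade before going on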
@@ -1179,7 +1147,7 @@ int OSD::init()
t.touch(coll_t::META_COLL, service.infos_oid);
r = store->apply_transaction(t);
if (r < 0)
- return r;
+ goto out;
}
// make sure snap mapper object exists
@@ -1189,19 +1157,7 @@ int OSD::init()
t.touch(coll_t::META_COLL, OSD::make_snapmapper_oid());
r = store->apply_transaction(t);
if (r < 0)
- return r;
- }
-
- if (osd_compat.compare(superblock.compat_features) != 0) {
- // We need to persist the new compat_set before we
- // do anything else
- dout(5) << "Upgrading superblock compat_set" << dendl;
- superblock.compat_features = osd_compat;
- ObjectStore::Transaction t;
- write_superblock(t);
- r = store->apply_transaction(t);
- if (r < 0)
- return r;
+ goto out;
}
class_handler = new ClassHandler(cct);
@@ -1217,7 +1173,8 @@ int OSD::init()
assert_warn(!osdmap);
if (osdmap) {
derr << "OSD::init: unable to read current osdmap" << dendl;
- return -EINVAL;
+ r = -EINVAL;
+ goto out;
}
osdmap = get_map(superblock.current_epoch);
check_osdmap_features();
@@ -1230,12 +1187,6 @@ int OSD::init()
load_pgs();
dout(2) << "superblock: i am osd." << superblock.whoami << dendl;
- assert_warn(whoami == superblock.whoami);
- if (whoami != superblock.whoami) {
- derr << "OSD::init: logic error: superblock says osd"
- << superblock.whoami << " but i am osd." << whoami << dendl;
- return -EINVAL;
- }
create_logger();
@@ -1252,7 +1203,7 @@ int OSD::init()
monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
r = monc->init();
if (r < 0)
- return r;
+ goto out;
// tell monc about log_client so it will know about mon session resets
monc->set_log_client(&clog);
@@ -1276,12 +1227,10 @@ int OSD::init()
r = monc->authenticate();
if (r < 0) {
- monc->shutdown();
- store->umount();
osd_lock.Lock(); // locker is going to unlock this on function exit
if (is_stopping())
- return 0;
- return r;
+ r = 0;
+ goto monout;
}
while (monc->wait_auth_rotating(30.0) < 0) {
@@ -1301,6 +1250,13 @@ int OSD::init()
start_boot();
return 0;
+monout:
+ monc->shutdown();
+
+out:
+ store->umount();
+ delete store;
+ return r;
}
void OSD::final_init()
@@ -1719,28 +1675,6 @@ int OSD::read_superblock()
::decode(superblock, p);
dout(10) << "read_superblock " << superblock << dendl;
- if (osd_compat.compare(superblock.compat_features) < 0) {
- derr << "The disk uses features unsupported by the executable." << dendl;
- derr << " ondisk features " << superblock.compat_features << dendl;
- derr << " daemon features " << osd_compat << dendl;
-
- if (osd_compat.writeable(superblock.compat_features)) {
- derr << "it is still writeable, though. Missing features:" << dendl;
- CompatSet diff = osd_compat.unsupported(superblock.compat_features);
- return -EOPNOTSUPP;
- }
- else {
- derr << "Cannot write to disk! Missing features:" << dendl;
- CompatSet diff = osd_compat.unsupported(superblock.compat_features);
- return -EOPNOTSUPP;
- }
- }
-
- if (whoami != superblock.whoami) {
- derr << "read_superblock superblock says osd." << superblock.whoami
- << ", but i (think i) am osd." << whoami << dendl;
- return -1;
- }
return 0;
}
@@ -1755,17 +1689,17 @@ void OSD::recursive_remove_collection(ObjectStore *store, coll_t tmp)
make_snapmapper_oid());
SnapMapper mapper(&driver, 0, 0, 0);
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
store->collection_list(tmp, objects);
// delete them.
ObjectStore::Transaction t;
unsigned removed = 0;
- for (vector<hobject_t>::iterator p = objects.begin();
+ for (vector<ghobject_t>::iterator p = objects.begin();
p != objects.end();
++p, removed++) {
OSDriver::OSTransaction _t(driver.get_transaction(&t));
- int r = mapper.remove_oid(*p, &_t);
+ int r = mapper.remove_oid(p->hobj, &_t);
if (r != 0 && r != -ENOENT)
assert(0);
t.collection_remove(tmp, *p);
@@ -3346,10 +3280,10 @@ bool remove_dir(
ObjectStore::Sequencer *osr,
coll_t coll, DeletingStateRef dstate)
{
- vector<hobject_t> olist;
+ vector<ghobject_t> olist;
int64_t num = 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
- hobject_t next;
+ ghobject_t next;
while (!next.is_max()) {
store->collection_list_partial(
coll,
@@ -3359,11 +3293,11 @@ bool remove_dir(
0,
&olist,
&next);
- for (vector<hobject_t>::iterator i = olist.begin();
+ for (vector<ghobject_t>::iterator i = olist.begin();
i != olist.end();
++i, ++num) {
OSDriver::OSTransaction _t(osdriver->get_transaction(t));
- int r = mapper->remove_oid(*i, &_t);
+ int r = mapper->remove_oid(i->hobj, &_t);
if (r != 0 && r != -ENOENT) {
assert(0);
}
@@ -3406,16 +3340,16 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item)
if (!item.second->start_clearing())
return;
- if (pg->have_temp_coll()) {
+ list<coll_t> colls_to_remove;
+ pg->get_colls(&colls_to_remove);
+ for (list<coll_t>::iterator i = colls_to_remove.begin();
+ i != colls_to_remove.end();
+ ++i) {
bool cont = remove_dir(
- pg->cct, store, &mapper, &driver, pg->osr.get(), pg->get_temp_coll(), item.second);
+ pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second);
if (!cont)
return;
}
- bool cont = remove_dir(
- pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second);
- if (!cont)
- return;
if (!item.second->start_deleting())
return;
@@ -3426,9 +3360,12 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item)
OSD::make_infos_oid(),
pg->log_oid,
t);
- if (pg->have_temp_coll())
- t->remove_collection(pg->get_temp_coll());
- t->remove_collection(coll);
+
+ for (list<coll_t>::iterator i = colls_to_remove.begin();
+ i != colls_to_remove.end();
+ ++i) {
+ t->remove_collection(*i);
+ }
// We need the sequencer to stick around until the op is complete
store->queue_transaction(
@@ -3970,6 +3907,10 @@ COMMAND("bench " \
"(default 1G size 4MB). Results in log.",
"osd", "rw", "cli,rest")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
+COMMAND("heap " \
+ "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
+ "show heap usage info (available only if compiled with tcmalloc)", \
+ "osd", "rw", "cli,rest")
COMMAND("debug_dump_missing " \
"name=filename,type=CephFilepath",
"dump missing objects to a named file", "osd", "r", "cli,rest")
@@ -5879,22 +5820,11 @@ void OSD::split_pgs(
dout(10) << "m_seed " << i->ps() << dendl;
dout(10) << "split_bits is " << split_bits << dendl;
- rctx->transaction->create_collection(
- coll_t(*i));
- rctx->transaction->split_collection(
- coll_t(parent->info.pgid),
+ parent->split_colls(
+ *i,
split_bits,
i->m_seed,
- coll_t(*i));
- if (parent->have_temp_coll()) {
- rctx->transaction->create_collection(
- coll_t::make_temp_coll(*i));
- rctx->transaction->split_collection(
- coll_t::make_temp_coll(parent->info.pgid),
- split_bits,
- i->m_seed,
- coll_t::make_temp_coll(*i));
- }
+ rctx->transaction);
parent->split_into(
*i,
child,
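After this change, split_pgs() no longer knows about temp collections at all; it hands the whole job to the parent PG, whose split_colls() (declared pure virtual in PG.h later in this diff) can in turn ask the backend to split any temp collections it owns. A hypothetical shape for the PG-side body, using member names that appear elsewhere in this commit — the real implementation lives in ReplicatedPG, outside this diff:

    void ReplicatedPG::split_colls(pg_t child, int split_bits, int seed,
                                   ObjectStore::Transaction *t)
    {
      coll_t target(child);
      t->create_collection(target);
      t->split_collection(coll_t(info.pgid), split_bits, seed, target);
      pgbackend->split_colls(child, split_bits, seed, t);  // temp collections
    }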
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index c2f45196870..9346cee6890 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -307,6 +307,7 @@ public:
ThreadPool::WorkQueue<PG> &scrub_wq;
ThreadPool::WorkQueue<PG> &scrub_finalize_wq;
ThreadPool::WorkQueue<MOSDRepScrub> &rep_scrub_wq;
+ GenContextWQ push_wq;
ClassHandler *&class_handler;
void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
@@ -635,6 +636,20 @@ public:
OSDService(OSD *osd);
~OSDService();
};
+
+struct C_OSD_SendMessageOnConn: public Context {
+ OSDService *osd;
+ Message *reply;
+ ConnectionRef conn;
+ C_OSD_SendMessageOnConn(
+ OSDService *osd,
+ Message *reply,
+ ConnectionRef conn) : osd(osd), reply(reply), conn(conn) {}
+ void finish(int) {
+ osd->send_message_osd_cluster(reply, conn.get());
+ }
+};
+
class OSD : public Dispatcher,
public md_config_obs_t {
/** OSD **/
@@ -731,6 +746,25 @@ public:
return oid;
}
static void recursive_remove_collection(ObjectStore *store, coll_t tmp);
+
+ /**
+ * get_osd_initial_compat_set()
+ *
+ * Get the initial feature set for this OSD. Features
+ * here are automatically upgraded.
+ *
+ * Return value: Initial osd CompatSet
+ */
+ static CompatSet get_osd_initial_compat_set();
+
+ /**
+ * get_osd_compat_set()
+ *
+ * Get all features supported by this OSD
+ *
+ * Return value: CompatSet of all supported features
+ */
+ static CompatSet get_osd_compat_set();
private:
@@ -1704,10 +1738,6 @@ protected:
}
private:
- static int write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen);
- static int read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen);
static int write_meta(const std::string &base,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
public:
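C_OSD_SendMessageOnConn is the small Context functor the new backend code uses to defer a reply until a transaction completes, so message ordering stays tied to commit order. Usage as it appears later in this commit:

    t->register_on_complete(
      new C_OSD_SendMessageOnConn(osd, reply, m->get_connection()));
    get_parent()->queue_transaction(t);  // reply is sent only after completion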
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 2c09ec62486..37661a01ea5 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1399,76 +1399,6 @@ void PG::queue_op(OpRequestRef op)
osd->op_wq.queue(make_pair(PGRef(this), op));
}
-void PG::do_request(
- OpRequestRef op,
- ThreadPool::TPHandle &handle)
-{
- // do any pending flush
- do_pending_flush();
-
- if (!op_has_sufficient_caps(op)) {
- osd->reply_op_error(op, -EPERM);
- return;
- }
- assert(!op_must_wait_for_map(get_osdmap(), op));
- if (can_discard_request(op)) {
- return;
- }
- if (!flushed) {
- dout(20) << " !flushed, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
-
- switch (op->get_req()->get_type()) {
- case CEPH_MSG_OSD_OP:
- if (is_replay() || !is_active()) {
- dout(20) << " replay, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
- do_op(op); // do it now
- break;
-
- case MSG_OSD_SUBOP:
- do_sub_op(op);
- break;
-
- case MSG_OSD_SUBOPREPLY:
- do_sub_op_reply(op);
- break;
-
- case MSG_OSD_PG_SCAN:
- do_scan(op, handle);
- break;
-
- case MSG_OSD_PG_BACKFILL:
- do_backfill(op);
- break;
-
- case MSG_OSD_PG_PUSH:
- if (!is_active()) {
- waiting_for_active.push_back(op);
- op->mark_delayed("waiting for active");
- return;
- }
- do_push(op);
- break;
-
- case MSG_OSD_PG_PULL:
- do_pull(op);
- break;
-
- case MSG_OSD_PG_PUSH_REPLY:
- do_push_reply(op);
- break;
-
- default:
- assert(0 == "bad message type in do_request");
- }
-}
-
-
void PG::replay_queued_ops()
{
assert(is_replay() && is_active());
@@ -2254,7 +2184,8 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid
snapid_t snap;
bool ok = coll.is_pg(pgid, snap);
assert(ok);
- store->collection_getattr(coll, "info", *bl);
+ int r = store->collection_getattr(coll, "info", *bl);
+ assert(r > 0);
bufferlist::iterator bp = bl->begin();
__u8 struct_v = 0;
::decode(struct_v, bp);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index cdbe827a4a9..74809eea268 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -870,8 +870,12 @@ public:
virtual void _scrub(ScrubMap &map) { }
virtual void _scrub_clear_state() { }
virtual void _scrub_finish() { }
- virtual coll_t get_temp_coll() = 0;
- virtual bool have_temp_coll() = 0;
+ virtual void get_colls(list<coll_t> *out) = 0;
+ virtual void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) = 0;
virtual bool _report_snap_collection_errors(
const hobject_t &hoid,
const map<string, bufferptr> &attrs,
@@ -1789,10 +1793,10 @@ public:
// abstract bits
- void do_request(
+ virtual void do_request(
OpRequestRef op,
ThreadPool::TPHandle &handle
- );
+ ) = 0;
virtual void do_op(OpRequestRef op) = 0;
virtual void do_sub_op(OpRequestRef op) = 0;
@@ -1802,9 +1806,6 @@ public:
ThreadPool::TPHandle &handle
) = 0;
virtual void do_backfill(OpRequestRef op) = 0;
- virtual void do_push(OpRequestRef op) = 0;
- virtual void do_pull(OpRequestRef op) = 0;
- virtual void do_push_reply(OpRequestRef op) = 0;
virtual void snap_trimmer() = 0;
virtual int do_command(cmdmap_t cmdmap, ostream& ss,
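Making do_request() pure virtual — and dropping do_push/do_pull/do_push_reply along with the temp-collection getters — shrinks PG to obligations any backend can honour. A self-contained sketch of the dispatch pattern this enables, with names invented for illustration; it is exactly the shape ReplicatedPG::do_request() takes later in this diff:

    struct Op { int type; };

    struct BasePG {
      virtual ~BasePG() {}
      virtual void do_request(const Op &op) = 0;  // was concrete, now pure
    };

    struct BackendPG : BasePG {
      bool backend_handles(const Op &op) { return op.type >= 100; }
      void do_request(const Op &op) {
        if (backend_handles(op))  // the backend gets first crack at messages
          return;
        // ... PG-level message handling ...
      }
    };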
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
new file mode 100644
index 00000000000..e3cc05bf345
--- /dev/null
+++ b/src/osd/PGBackend.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "include/Context.h"
+#include <string>
+
+ /**
+ * PGBackend
+ *
+ * PGBackend defines an interface for logic handling IO and
+ * replication on RADOS objects. The PGBackend implementation
+ * is responsible for:
+ *
+ * 1) Handling client operations
+ * 2) Handling object recovery
+ * 3) Handling object access
+ */
+ class PGBackend {
+ public:
+ /**
+ * Provides interfaces for PGBackend callbacks
+ *
+ * The intention is that the parent calls into the PGBackend
+ * implementation holding a lock and that the callbacks are
+ * called under the same locks.
+ */
+ class Listener {
+ public:
+ /// Recovery
+
+ virtual void on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t) = 0;
+ /**
+ * Called with the transaction recovering oid
+ */
+ virtual void on_local_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ ) = 0;
+
+ /**
+ * Called when transaction recovering oid is durable and
+ * applied on all replicas
+ */
+ virtual void on_global_recover(const hobject_t &oid) = 0;
+
+ /**
+ * Called when peer is recovered
+ */
+ virtual void on_peer_recover(
+ int peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat
+ ) = 0;
+
+ virtual void begin_peer_recover(
+ int peer,
+ const hobject_t oid) = 0;
+
+ virtual void failed_push(int from, const hobject_t &soid) = 0;
+
+
+ virtual void cancel_pull(const hobject_t &soid) = 0;
+
+ /**
+ * Bless a context
+ *
+ * Wraps a context in whatever outer layers the parent usually
+ * uses to call into the PGBackend
+ */
+ virtual Context *bless_context(Context *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual void send_message(int to_osd, Message *m) = 0;
+ virtual void queue_transaction(ObjectStore::Transaction *t) = 0;
+ virtual epoch_t get_epoch() = 0;
+ virtual const vector<int> &get_acting() = 0;
+ virtual std::string gen_dbg_prefix() const = 0;
+
+ virtual const map<hobject_t, set<int> > &get_missing_loc() = 0;
+ virtual const map<int, pg_missing_t> &get_peer_missing() = 0;
+ virtual const map<int, pg_info_t> &get_peer_info() = 0;
+ virtual const pg_missing_t &get_local_missing() = 0;
+ virtual const PGLog &get_log() = 0;
+ virtual bool pgb_is_primary() const = 0;
+ virtual OSDMapRef pgb_get_osdmap() const = 0;
+ virtual const pg_info_t &get_info() const = 0;
+
+ virtual ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ map<string, bufferptr> &attrs) = 0;
+
+ virtual ~Listener() {}
+ };
+ Listener *parent;
+ Listener *get_parent() const { return parent; }
+ PGBackend(Listener *l) : parent(l) {}
+ bool is_primary() const { return get_parent()->pgb_is_primary(); }
+ OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+ const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+ std::string gen_prefix() const {
+ return parent->gen_dbg_prefix();
+ }
+
+ /**
+ * RecoveryHandle
+ *
+ * We may want to recover multiple objects in the same set of
+ * messages. RecoveryHandle is an interface for the opaque
+ * object used by the implementation to store the details of
+ * the pending recovery operations.
+ */
+ struct RecoveryHandle {
+ virtual ~RecoveryHandle() {}
+ };
+
+ /// Get a fresh recovery operation
+ virtual RecoveryHandle *open_recovery_op() = 0;
+
+ /// run_recovery_op: finish the operation represented by h
+ virtual void run_recovery_op(
+ RecoveryHandle *h, ///< [in] op to finish
+ int priority ///< [in] msg priority
+ ) = 0;
+
+  /**
+   * recover_object
+   *
+   * Triggers a recovery operation on the specified hobject_t.
+   *
+   * On each replica (primary included), get_parent()->on_local_recover()
+   * must be called when the transaction finalizing the recovery is
+   * queued; get_parent()->on_global_recover() must be called once the
+   * object is durable on all replicas.
+   *
+   * obc may be NULL if the primary lacks the object.
+   *
+   * head may be NULL only if the head/snapdir is missing.
+   */
+ virtual void recover_object(
+ const hobject_t &hoid, ///< [in] object to recover
+ ObjectContextRef head, ///< [in] context of the head/snapdir object
+ ObjectContextRef obc, ///< [in] context of the object
+ RecoveryHandle *h ///< [in,out] handle to attach recovery op to
+ ) = 0;
+
+ /// gives PGBackend a crack at an incoming message
+ virtual bool handle_message(
+ OpRequestRef op ///< [in] message received
+ ) = 0; ///< @return true if the message was handled
+
+ virtual void check_recovery_sources(const OSDMapRef osdmap) = 0;
+
+ /**
+   * The implementation should clear itself; contexts blessed prior to
+   * on_change() won't be called after on_change() returns.
+ */
+ virtual void on_change(ObjectStore::Transaction *t) = 0;
+ virtual void clear_state() = 0;
+
+ virtual void on_flushed() = 0;
+
+
+ virtual void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void temp_colls(list<coll_t> *out) = 0;
+
+ virtual void dump_recovery_info(Formatter *f) const = 0;
+
+ virtual coll_t get_temp_coll(ObjectStore::Transaction *t) = 0;
+ virtual void add_temp_obj(const hobject_t &oid) = 0;
+ virtual void clear_temp_obj(const hobject_t &oid) = 0;
+
+ virtual ~PGBackend() {}
+ };
+
+#endif
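The net effect of the interface above: callers never touch pushes and pulls directly any more. They open a RecoveryHandle, queue objects onto it, and run it at a priority. Usage exactly as it appears in ReplicatedPG::wait_for_missing_object() later in this diff:

    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);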
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 486d64302b9..6e025f289bc 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -782,10 +782,6 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
log.tail = info.log_tail;
- // In case of sobject_t based encoding, may need to list objects in the store
- // to find hashes
- vector<hobject_t> ls;
-
if (ondisklog_head > 0) {
// read
bufferlist bl;
@@ -803,7 +799,6 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
assert(log.empty());
eversion_t last;
bool reorder = false;
- bool listed_collection = false;
while (!p.end()) {
uint64_t pos = ondisklog_tail + p.get_off();
@@ -846,29 +841,7 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
<< e.version << " after " << last << "\n";
}
- if (e.invalid_hash) {
- // We need to find the object in the store to get the hash
- if (!listed_collection) {
- store->collection_list(coll, ls);
- listed_collection = true;
- }
- bool found = false;
- for (vector<hobject_t>::iterator i = ls.begin();
- i != ls.end();
- ++i) {
- if (i->oid == e.soid.oid && i->snap == e.soid.snap) {
- e.soid = *i;
- found = true;
- break;
- }
- }
- if (!found) {
- // Didn't find the correct hash
- std::ostringstream oss;
- oss << "Could not find hash for hoid " << e.soid << std::endl;
- throw read_log_error(oss.str().c_str());
- }
- }
+ assert(!e.invalid_hash);
if (e.invalid_pool) {
e.soid.pool = info.pgid.pool();
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
new file mode 100644
index 00000000000..b39207e14f8
--- /dev/null
+++ b/src/osd/ReplicatedBackend.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "ReplicatedBackend.h"
+#include "messages/MOSDSubOp.h"
+#include "messages/MOSDSubOpReply.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPushReply.h"
+
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) {
+ return *_dout << pgb->get_parent()->gen_dbg_prefix();
+}
+
+ReplicatedBackend::ReplicatedBackend(
+ PGBackend::Listener *pg, coll_t coll, OSDService *osd) :
+ PGBackend(pg), temp_created(false),
+ temp_coll(coll_t::make_temp_coll(pg->get_info().pgid)),
+ coll(coll), osd(osd), cct(osd->cct) {}
+
+void ReplicatedBackend::run_recovery_op(
+ PGBackend::RecoveryHandle *_h,
+ int priority)
+{
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ send_pushes(priority, h->pushes);
+ send_pulls(priority, h->pulls);
+ delete h;
+}
+
+void ReplicatedBackend::recover_object(
+ const hobject_t &hoid,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *_h
+ )
+{
+ dout(10) << __func__ << ": " << hoid << dendl;
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ if (get_parent()->get_local_missing().is_missing(hoid)) {
+ assert(!obc);
+ // pull
+ prepare_pull(
+ hoid,
+ head,
+ h);
+ return;
+ } else {
+ assert(obc);
+ int started = start_pushes(
+ hoid,
+ obc,
+ h);
+ assert(started > 0);
+ }
+}
+
+void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap)
+{
+ for(map<int, set<hobject_t> >::iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ) {
+ if (osdmap->is_down(i->first)) {
+ dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
+ << ", osdmap has it marked down" << dendl;
+ for (set<hobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ assert(pulling.count(*j) == 1);
+ get_parent()->cancel_pull(*j);
+ pulling.erase(*j);
+ }
+ pull_from_peer.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+}
+
+bool ReplicatedBackend::handle_message(
+ OpRequestRef op
+ )
+{
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PUSH:
+ // TODOXXX: needs to be active possibly
+ do_push(op);
+ return true;
+
+ case MSG_OSD_PG_PULL:
+ do_pull(op);
+ return true;
+
+ case MSG_OSD_PG_PUSH_REPLY:
+ do_push_reply(op);
+ return true;
+
+ case MSG_OSD_SUBOP: {
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
+ if (m->ops.size() >= 1) {
+ OSDOp *first = &m->ops[0];
+ switch (first->op.op) {
+ case CEPH_OSD_OP_PULL:
+ sub_op_pull(op);
+ return true;
+ case CEPH_OSD_OP_PUSH:
+ // TODOXXX: needs to be active possibly
+ sub_op_push(op);
+ return true;
+ default:
+ break;
+ }
+ }
+ break;
+ }
+
+ case MSG_OSD_SUBOPREPLY: {
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req());
+ if (r->ops.size() >= 1) {
+ OSDOp &first = r->ops[0];
+ switch (first.op.op) {
+ case CEPH_OSD_OP_PUSH:
+ // continue peer recovery
+ sub_op_push_reply(op);
+ return true;
+ }
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ return false;
+}
+
+void ReplicatedBackend::clear_state()
+{
+ // clear pushing/pulling maps
+ pushing.clear();
+ pulling.clear();
+ pull_from_peer.clear();
+}
+
+void ReplicatedBackend::on_change(ObjectStore::Transaction *t)
+{
+ dout(10) << __func__ << dendl;
+ // clear temp
+ for (set<hobject_t>::iterator i = temp_contents.begin();
+ i != temp_contents.end();
+ ++i) {
+ dout(10) << __func__ << ": Removing oid "
+ << *i << " from the temp collection" << dendl;
+ t->remove(get_temp_coll(t), *i);
+ }
+ temp_contents.clear();
+ clear_state();
+}
+
+coll_t ReplicatedBackend::get_temp_coll(ObjectStore::Transaction *t)
+{
+ if (temp_created)
+ return temp_coll;
+ if (!osd->store->collection_exists(temp_coll))
+ t->create_collection(temp_coll);
+ temp_created = true;
+ return temp_coll;
+}
+
+void ReplicatedBackend::on_flushed()
+{
+ if (have_temp_coll() &&
+ !osd->store->collection_empty(get_temp_coll())) {
+ vector<hobject_t> objects;
+ osd->store->collection_list(get_temp_coll(), objects);
+ derr << __func__ << ": found objects in the temp collection: "
+ << objects << ", crashing now"
+ << dendl;
+ assert(0 == "found garbage in the temp collection");
+ }
+}
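check_recovery_sources() above relies on the classic associative-container idiom for erasing while iterating: the post-increment hands erase() the old iterator after `i` has already advanced, so the iterator never dangles. Isolated into a runnable sketch:

    #include <map>
    #include <set>

    void prune_down_peers(std::map<int, std::set<int> > &pull_from_peer,
                          int down_osd)
    {
      for (std::map<int, std::set<int> >::iterator i = pull_from_peer.begin();
           i != pull_from_peer.end(); ) {
        if (i->first == down_osd)
          pull_from_peer.erase(i++);  // i advances before the node is freed
        else
          ++i;
      }
    }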
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
new file mode 100644
index 00000000000..e34e55a618e
--- /dev/null
+++ b/src/osd/ReplicatedBackend.h
@@ -0,0 +1,309 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef REPBACKEND_H
+#define REPBACKEND_H
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "osd_types.h"
+
+struct C_ReplicatedBackend_OnPullComplete;
+class ReplicatedBackend : public PGBackend {
+ struct RPGHandle : public PGBackend::RecoveryHandle {
+ map<int, vector<PushOp> > pushes;
+ map<int, vector<PullOp> > pulls;
+ };
+ friend struct C_ReplicatedBackend_OnPullComplete;
+private:
+ bool temp_created;
+ const coll_t temp_coll;
+ coll_t get_temp_coll() const {
+ return temp_coll;
+ }
+ bool have_temp_coll() const { return temp_created; }
+
+ // Track contents of temp collection, clear on reset
+ set<hobject_t> temp_contents;
+public:
+ coll_t coll;
+ OSDService *osd;
+ CephContext *cct;
+
+ ReplicatedBackend(PGBackend::Listener *pg, coll_t coll, OSDService *osd);
+
+ /// @see PGBackend::open_recovery_op
+ RPGHandle *_open_recovery_op() {
+ return new RPGHandle();
+ }
+ PGBackend::RecoveryHandle *open_recovery_op() {
+ return _open_recovery_op();
+ }
+
+ /// @see PGBackend::run_recovery_op
+ void run_recovery_op(
+ PGBackend::RecoveryHandle *h,
+ int priority);
+
+ /// @see PGBackend::recover_object
+ void recover_object(
+ const hobject_t &hoid,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ );
+
+ void check_recovery_sources(const OSDMapRef osdmap);
+
+ /// @see PGBackend::handle_message
+ bool handle_message(
+ OpRequestRef op
+ );
+
+ void on_change(ObjectStore::Transaction *t);
+ void clear_state();
+ void on_flushed();
+
+ void temp_colls(list<coll_t> *out) {
+ if (temp_created)
+ out->push_back(temp_coll);
+ }
+ void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) {
+ coll_t target = coll_t::make_temp_coll(child);
+ if (!temp_created)
+ return;
+ t->create_collection(target);
+ t->split_collection(
+ temp_coll,
+ split_bits,
+ seed,
+ target);
+ }
+
+ virtual void dump_recovery_info(Formatter *f) const {
+ {
+ f->open_array_section("pull_from_peer");
+ for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ++i) {
+ f->open_object_section("pulling_from");
+ f->dump_int("pull_from", i->first);
+ {
+ f->open_array_section("pulls");
+ for (set<hobject_t>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("pull_info");
+ assert(pulling.count(*j));
+ pulling.find(*j)->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("pushing");
+ for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
+ pushing.begin();
+ i != pushing.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("pushing") << i->first;
+ {
+ f->open_array_section("pushing_to");
+ for (map<int, PushInfo>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("push_progress");
+ f->dump_stream("object_pushing") << j->first;
+ {
+ f->open_object_section("push_info");
+ j->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ }
+private:
+ // push
+ struct PushInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+
+ void dump(Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+ };
+ map<hobject_t, map<int, PushInfo> > pushing;
+
+ // pull
+ struct PullInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef head_ctx;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+
+ void dump(Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ coll_t get_temp_coll(ObjectStore::Transaction *t);
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+
+ map<hobject_t, PullInfo> pulling;
+
+  // Reverse mapping from osd peer to objects being pulled from that peer
+ map<int, set<hobject_t> > pull_from_peer;
+
+ void sub_op_push(OpRequestRef op);
+ void sub_op_push_reply(OpRequestRef op);
+ void sub_op_pull(OpRequestRef op);
+
+ void _do_push(OpRequestRef op);
+ void _do_pull_response(OpRequestRef op);
+ void do_push(OpRequestRef op) {
+ if (is_primary()) {
+ _do_pull_response(op);
+ } else {
+ _do_push(op);
+ }
+ }
+ void do_pull(OpRequestRef op);
+ void do_push_reply(OpRequestRef op);
+
+ bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
+ void handle_pull(int peer, PullOp &op, PushOp *reply);
+ bool handle_pull_response(
+ int from, PushOp &op, PullOp *response,
+ list<ObjectContextRef> *to_continue,
+ ObjectStore::Transaction *t);
+ void handle_push(int from, PushOp &op, PushReplyOp *response,
+ ObjectStore::Transaction *t);
+
+ static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ bufferlist data_received,
+ interval_set<uint64_t> *intervals_usable,
+ bufferlist *data_usable);
+ void _failed_push(int from, const hobject_t &soid);
+
+ void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
+ void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ int send_push_op_legacy(int priority, int peer,
+ PushOp &pop);
+ int send_pull_legacy(int priority, int peer,
+ const ObjectRecoveryInfo& recovery_info,
+ ObjectRecoveryProgress progress);
+ void send_pulls(
+ int priority,
+ map<int, vector<PullOp> > &pulls);
+
+ int build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat = 0);
+ void submit_push_data(ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ const interval_set<uint64_t> &intervals_included,
+ bufferlist data_included,
+ bufferlist omap_header,
+ map<string, bufferptr> &attrs,
+ map<string, bufferlist> &omap_entries,
+ ObjectStore::Transaction *t);
+ void submit_push_complete(ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+
+ void calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+ void prepare_pull(
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h);
+ int start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obj,
+ RPGHandle *h);
+ void prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, int peer,
+ PushOp *pop);
+ void prep_push(ObjectContextRef obc,
+ const hobject_t& oid, int dest,
+ PushOp *op);
+ void prep_push(ObjectContextRef obc,
+ const hobject_t& soid, int peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets,
+ PushOp *op);
+ void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+ ObjectRecoveryInfo recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc
+ );
+};
+
+#endif
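temp_contents exists so on_change() (in the .cc above) can garbage-collect anything a restarted peering interval left behind in the temp collection. A hedged sketch of the intended lifecycle — the collection_move() promotion step is an assumption about how recovered data reaches the real collection, not part of this diff:

    coll_t tcoll = get_temp_coll(t);       // lazily creates the collection
    add_temp_obj(oid);                     // tracked until recovery completes
    // ... write recovered data into (tcoll, oid) ...
    t->collection_move(coll, tcoll, oid);  // assumed promotion step
    clear_temp_obj(oid);                   // no longer eligible for cleanup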
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 401ad9014ff..1e2a863e389 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -60,8 +60,9 @@
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
-#define dout_prefix _prefix(_dout, this, osd->whoami, get_osdmap())
-static ostream& _prefix(std::ostream *_dout, PG *pg, int whoami, OSDMapRef osdmap) {
+#define dout_prefix _prefix(_dout, this)
+template <typename T>
+static ostream& _prefix(std::ostream *_dout, T *pg) {
return *_dout << pg->gen_prefix();
}
@@ -79,6 +80,159 @@ PGLSFilter::~PGLSFilter()
{
}
+static void log_subop_stats(
+ OSDService *osd,
+ OpRequestRef op, int tag_inb, int tag_lat)
+{
+ utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t latency = now;
+ latency -= op->get_req()->get_recv_stamp();
+
+ uint64_t inb = op->get_req()->get_data().length();
+
+ osd->logger->inc(l_osd_sop);
+
+ osd->logger->inc(l_osd_sop_inb, inb);
+ osd->logger->tinc(l_osd_sop_lat, latency);
+
+ if (tag_inb)
+ osd->logger->inc(tag_inb, inb);
+ osd->logger->tinc(tag_lat, latency);
+}
+
+// ======================
+// PGBackend::Listener
+
+
+void ReplicatedPG::on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t)
+{
+ pg_log.revise_have(oid, eversion_t());
+ remove_snap_mapped_object(*t, oid);
+ t->remove(coll, oid);
+}
+
+void ReplicatedPG::on_local_recover(
+ const hobject_t &hoid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &_recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ )
+{
+ ObjectRecoveryInfo recovery_info(_recovery_info);
+ if (recovery_info.soid.snap < CEPH_NOSNAP) {
+ assert(recovery_info.oi.snaps.size());
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ set<snapid_t> snaps(
+ recovery_info.oi.snaps.begin(),
+ recovery_info.oi.snaps.end());
+ snap_mapper.add_oid(
+ recovery_info.soid,
+ snaps,
+ &_t);
+ }
+
+ if (pg_log.get_missing().is_missing(recovery_info.soid) &&
+ pg_log.get_missing().missing.find(recovery_info.soid)->second.need > recovery_info.version) {
+ assert(is_primary());
+ const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
+ if (latest->op == pg_log_entry_t::LOST_REVERT &&
+ latest->reverting_to == recovery_info.version) {
+ dout(10) << " got old revert version " << recovery_info.version
+ << " for " << *latest << dendl;
+ recovery_info.version = latest->version;
+ // update the attr to the revert event version
+ recovery_info.oi.prior_version = recovery_info.oi.version;
+ recovery_info.oi.version = latest->version;
+ bufferlist bl;
+ ::encode(recovery_info.oi, bl);
+ t->setattr(coll, recovery_info.soid, OI_ATTR, bl);
+ }
+ }
+
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
+ recover_got(recovery_info.soid, recovery_info.version);
+
+ if (is_primary()) {
+ info.stats.stats.sum.add(stat_diff);
+
+ assert(obc);
+ obc->obs.exists = true;
+ obc->ondisk_write_lock();
+ obc->obs.oi = recovery_info.oi; // may have been updated above
+
+
+ t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+ t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
+
+ publish_stats_to_osd();
+ if (waiting_for_missing_object.count(hoid)) {
+ dout(20) << " kicking waiters on " << hoid << dendl;
+ requeue_ops(waiting_for_missing_object[hoid]);
+ waiting_for_missing_object.erase(hoid);
+ if (pg_log.get_missing().missing.size() == 0) {
+ requeue_ops(waiting_for_all_missing);
+ waiting_for_all_missing.clear();
+ }
+ }
+ } else {
+ t->register_on_applied(
+ new C_OSD_AppliedRecoveredObjectReplica(this));
+
+ }
+
+ t->register_on_commit(
+ new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap()->get_epoch(),
+ info.last_complete));
+
+ // update pg
+ dirty_info = true;
+ write_if_dirty(*t);
+
+}
+
+void ReplicatedPG::on_global_recover(
+ const hobject_t &soid)
+{
+ publish_stats_to_osd();
+ dout(10) << "pushed " << soid << " to all replicas" << dendl;
+ assert(recovering.count(soid));
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ if (waiting_for_degraded_object.count(soid)) {
+ requeue_ops(waiting_for_degraded_object[soid]);
+ waiting_for_degraded_object.erase(soid);
+ }
+ finish_degraded_object(soid);
+}
+
+void ReplicatedPG::on_peer_recover(
+ int peer,
+ const hobject_t &soid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat)
+{
+ info.stats.stats.sum.add(stat);
+ publish_stats_to_osd();
+ // done!
+ peer_missing[peer].got(soid, recovery_info.version);
+ if (peer == backfill_target && backfills_in_flight.count(soid))
+ backfills_in_flight.erase(soid);
+}
+
+void ReplicatedPG::begin_peer_recover(
+ int peer,
+ const hobject_t soid)
+{
+ peer_missing[peer].revise_have(soid, eversion_t());
+}
+
// =======================
// pg changes
@@ -117,18 +271,18 @@ void ReplicatedPG::wait_for_missing_object(const hobject_t& soid, OpRequestRef o
assert(g != missing.missing.end());
const eversion_t &v(g->second.need);
- map<hobject_t, PullInfo>::const_iterator p = pulling.find(soid);
- if (p != pulling.end()) {
- dout(7) << "missing " << soid << " v " << v << ", already pulling." << dendl;
+ set<hobject_t>::const_iterator p = recovering.find(soid);
+ if (p != recovering.end()) {
+ dout(7) << "missing " << soid << " v " << v << ", already recovering." << dendl;
}
else if (missing_loc.find(soid) == missing_loc.end()) {
dout(7) << "missing " << soid << " v " << v << ", is unfound." << dendl;
}
else {
- dout(7) << "missing " << soid << " v " << v << ", pulling." << dendl;
- map<int, vector<PullOp> > pulls;
- prepare_pull(soid, v, cct->_conf->osd_client_op_priority, &pulls);
- send_pulls(cct->_conf->osd_client_op_priority, pulls);
+ dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
waiting_for_missing_object[soid].push_back(op);
op->mark_delayed("waiting for missing object");
@@ -165,15 +319,15 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
assert(is_degraded_object(soid));
// we don't have it (yet).
- if (pushing.count(soid)) {
+ if (recovering.count(soid)) {
dout(7) << "degraded "
<< soid
- << ", already pushing"
+ << ", already recovering"
<< dendl;
} else {
dout(7) << "degraded "
<< soid
- << ", pushing"
+ << ", recovering"
<< dendl;
eversion_t v;
for (unsigned i = 1; i < acting.size(); i++) {
@@ -184,9 +338,9 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
break;
}
}
- map<int, vector<PushOp> > pushes;
- prep_object_replica_pushes(soid, v, cct->_conf->osd_client_op_priority, &pushes);
- send_pushes(cct->_conf->osd_client_op_priority, pushes);
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ prep_object_replica_pushes(soid, v, h);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
waiting_for_degraded_object[soid].push_back(op);
op->mark_delayed("waiting for degraded object");
@@ -628,9 +782,8 @@ ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool, pg_t p, const hobject_t& oid,
const hobject_t& ioid) :
PG(o, curmap, _pool, p, oid, ioid),
+ pgbackend(new ReplicatedBackend(this, coll_t(p), o)),
snapset_contexts_lock("ReplicatedPG::snapset_contexts"),
- temp_created(false),
- temp_coll(coll_t::make_temp_coll(p)),
temp_seq(0),
snap_trimmer_machine(this)
{
@@ -644,6 +797,62 @@ void ReplicatedPG::get_src_oloc(const object_t& oid, const object_locator_t& olo
src_oloc.key = oid.name;
}
+void ReplicatedPG::do_request(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle)
+{
+ // do any pending flush
+ do_pending_flush();
+
+ if (!op_has_sufficient_caps(op)) {
+ osd->reply_op_error(op, -EPERM);
+ return;
+ }
+ assert(!op_must_wait_for_map(get_osdmap(), op));
+ if (can_discard_request(op)) {
+ return;
+ }
+ if (!flushed) {
+ dout(20) << " !flushed, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
+
+ if (pgbackend->handle_message(op))
+ return;
+
+ switch (op->get_req()->get_type()) {
+ case CEPH_MSG_OSD_OP:
+ if (is_replay() || !is_active()) {
+ dout(20) << " replay, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
+ do_op(op); // do it now
+ break;
+
+ case MSG_OSD_SUBOP:
+ do_sub_op(op);
+ break;
+
+ case MSG_OSD_SUBOPREPLY:
+ do_sub_op_reply(op);
+ break;
+
+ case MSG_OSD_PG_SCAN:
+ do_scan(op, handle);
+ break;
+
+ case MSG_OSD_PG_BACKFILL:
+ do_backfill(op);
+ break;
+
+ default:
+ assert(0 == "bad message type in do_request");
+ }
+}
+
+
/** do_op - do an op
* pg lock will be held (if multithreaded)
* osd_lock NOT held.
@@ -660,13 +869,21 @@ void ReplicatedPG::do_op(OpRequestRef op)
return do_pg_op(op);
}
- dout(10) << "do_op " << *m << (op->may_write() ? " may_write" : "") << dendl;
+ // order this op as a write?
+ bool write_ordered = op->may_write() || (m->get_flags() & CEPH_OSD_FLAG_RWORDERED);
+
+ dout(10) << "do_op " << *m
+ << (op->may_write() ? " may_write" : "")
+ << (op->may_read() ? " may_read" : "")
+ << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
+ << dendl;
hobject_t head(m->get_oid(), m->get_object_locator().key,
CEPH_NOSNAP, m->get_pg().ps(),
info.pgid.pool(), m->get_object_locator().nspace);
- if (op->may_write() && scrubber.write_blocked_by_scrub(head)) {
+
+ if (write_ordered && scrubber.write_blocked_by_scrub(head)) {
dout(20) << __func__ << ": waiting for scrub" << dendl;
waiting_for_active.push_back(op);
op->mark_delayed("waiting for scrub");
@@ -680,7 +897,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// degraded object?
- if (op->may_write() && is_degraded_object(head)) {
+ if (write_ordered && is_degraded_object(head)) {
wait_for_degraded_object(head, op);
return;
}
@@ -700,7 +917,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// degraded object?
- if (op->may_write() && is_degraded_object(snapdir)) {
+ if (write_ordered && is_degraded_object(snapdir)) {
wait_for_degraded_object(snapdir, op);
return;
}
@@ -764,7 +981,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
return;
}
- if ((op->may_read()) && (obc->obs.oi.lost)) {
+ if ((op->may_read()) && (obc->obs.oi.is_lost())) {
// This object is lost. Reading from it returns an error.
dout(20) << __func__ << ": object " << obc->obs.oi.soid
<< " is lost" << dendl;
@@ -774,7 +991,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(25) << __func__ << ": object " << obc->obs.oi.soid
<< " has oi of " << obc->obs.oi << dendl;
- if (!op->may_write() && !obc->obs.exists) {
+ if (!op->may_write() && (!obc->obs.exists ||
+ obc->obs.oi.is_whiteout())) {
osd->reply_op_error(op, -ENOENT);
return;
}
@@ -831,6 +1049,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
if (sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.get_key() &&
sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.oid.name &&
@@ -885,6 +1105,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
dout(10) << " clone_oid " << clone_oid << " obc " << sobc << dendl;
src_obc[clone_oid] = sobc;
@@ -1229,26 +1451,6 @@ void ReplicatedPG::log_op_stats(OpContext *ctx)
<< " lat " << latency << dendl;
}
-void ReplicatedPG::log_subop_stats(OpRequestRef op, int tag_inb, int tag_lat)
-{
- utime_t now = ceph_clock_now(cct);
- utime_t latency = now;
- latency -= op->get_req()->get_recv_stamp();
-
- uint64_t inb = op->get_req()->get_data().length();
-
- osd->logger->inc(l_osd_sop);
-
- osd->logger->inc(l_osd_sop_inb, inb);
- osd->logger->tinc(l_osd_sop_lat, latency);
-
- if (tag_inb)
- osd->logger->inc(tag_inb, inb);
- osd->logger->tinc(tag_lat, latency);
-}
-
-
-
void ReplicatedPG::do_sub_op(OpRequestRef op)
{
MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
@@ -1259,11 +1461,6 @@ void ReplicatedPG::do_sub_op(OpRequestRef op)
OSDOp *first = NULL;
if (m->ops.size() >= 1) {
first = &m->ops[0];
- switch (first->op.op) {
- case CEPH_OSD_OP_PULL:
- sub_op_pull(op);
- return;
- }
}
if (!is_active()) {
@@ -1274,9 +1471,6 @@ void ReplicatedPG::do_sub_op(OpRequestRef op)
if (first) {
switch (first->op.op) {
- case CEPH_OSD_OP_PUSH:
- sub_op_push(op);
- return;
case CEPH_OSD_OP_DELETE:
sub_op_remove(op);
return;
@@ -1305,11 +1499,6 @@ void ReplicatedPG::do_sub_op_reply(OpRequestRef op)
if (r->ops.size() >= 1) {
OSDOp& first = r->ops[0];
switch (first.op.op) {
- case CEPH_OSD_OP_PUSH:
- // continue peer recovery
- sub_op_push_reply(op);
- return;
-
case CEPH_OSD_OP_SCRUB_RESERVE:
sub_op_scrub_reserve_reply(op);
return;
@@ -1395,7 +1584,7 @@ void ReplicatedPG::do_scan(
}
}
-void ReplicatedPG::_do_push(OpRequestRef op)
+void ReplicatedBackend::_do_push(OpRequestRef op)
{
MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
@@ -1412,18 +1601,42 @@ void ReplicatedPG::_do_push(OpRequestRef op)
MOSDPGPushReply *reply = new MOSDPGPushReply;
reply->set_priority(m->get_priority());
- reply->pgid = info.pgid;
+ reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
reply->replies.swap(replies);
reply->compute_cost(cct);
- t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ t->register_on_complete(
+ new C_OSD_SendMessageOnConn(
+ osd, reply, m->get_connection()));
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
}
-void ReplicatedPG::_do_pull_response(OpRequestRef op)
+struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
+ ReplicatedBackend *bc;
+ list<ObjectContextRef> to_continue;
+ int priority;
+ C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
+ : bc(bc), priority(priority) {}
+
+ void finish(ThreadPool::TPHandle &handle) {
+ ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
+ for (list<ObjectContextRef>::iterator i =
+ to_continue.begin();
+ i != to_continue.end();
+ ++i) {
+ if (!bc->start_pushes((*i)->obs.oi.soid, *i, h)) {
+ bc->get_parent()->on_global_recover(
+ (*i)->obs.oi.soid);
+ }
+ handle.reset_tp_timeout();
+ }
+ bc->run_recovery_op(h, priority);
+ }
+};
+
+void ReplicatedBackend::_do_pull_response(OpRequestRef op)
{
MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
@@ -1431,31 +1644,44 @@ void ReplicatedPG::_do_pull_response(OpRequestRef op)
vector<PullOp> replies(1);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ list<ObjectContextRef> to_continue;
for (vector<PushOp>::iterator i = m->pushes.begin();
i != m->pushes.end();
++i) {
- bool more = handle_pull_response(from, *i, &(replies.back()), t);
+ bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, t);
if (more)
replies.push_back(PullOp());
}
+ if (!to_continue.empty()) {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ m->get_priority());
+ c->to_continue.swap(to_continue);
+ t->register_on_complete(
+ new C_QueueInWQ(
+ &osd->push_wq,
+ get_parent()->bless_gencontext(c)));
+ }
replies.erase(replies.end() - 1);
if (replies.size()) {
MOSDPGPull *reply = new MOSDPGPull;
reply->set_priority(m->get_priority());
- reply->pgid = info.pgid;
+ reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
reply->pulls.swap(replies);
reply->compute_cost(cct);
- t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ t->register_on_complete(
+ new C_OSD_SendMessageOnConn(
+ osd, reply, m->get_connection()));
}
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
}
-void ReplicatedPG::do_pull(OpRequestRef op)
+void ReplicatedBackend::do_pull(OpRequestRef op)
{
MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PULL);
@@ -1471,7 +1697,7 @@ void ReplicatedPG::do_pull(OpRequestRef op)
send_pushes(m->get_priority(), replies);
}
-void ReplicatedPG::do_push_reply(OpRequestRef op)
+void ReplicatedBackend::do_push_reply(OpRequestRef op)
{
MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY);
@@ -2408,6 +2634,25 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
break;
+ case CEPH_OSD_OP_ISDIRTY:
+ ++ctx->num_read;
+ {
+ bool is_dirty = obs.oi.is_dirty();
+ ::encode(is_dirty, osd_op.outdata);
+ ctx->delta_stats.num_rd++;
+ result = 0;
+ }
+ break;
+
+ case CEPH_OSD_OP_UNDIRTY:
+ ++ctx->num_write;
+ {
+ ctx->undirty = true; // see make_writeable()
+ ctx->modify = true;
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
case CEPH_OSD_OP_GETXATTR:
++ctx->num_read;
{
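The two new ops are deliberately small: ISDIRTY encodes the object's dirty flag into the op's output buffer, and UNDIRTY records a request for make_writeable() to clear the flag rather than set it. Roughly, with simplified stand-ins for object_info_t and the encoder:

#include <cstdint>
#include <iostream>
#include <vector>

enum : uint64_t { FLAG_DIRTY = 1 << 0 };

struct ObjectInfo {
    uint64_t flags = 0;
    bool is_dirty() const { return flags & FLAG_DIRTY; }
};

// Stand-in for ::encode(bool, bufferlist): append one byte.
void encode_bool(bool v, std::vector<uint8_t>& out) { out.push_back(v ? 1 : 0); }

int main() {
    ObjectInfo oi;
    oi.flags |= FLAG_DIRTY;               // a write marked the object dirty

    std::vector<uint8_t> outdata;
    encode_bool(oi.is_dirty(), outdata);  // ISDIRTY: read side encodes the flag
    std::cout << "isdirty byte: " << int(outdata[0]) << "\n";

    oi.flags &= ~FLAG_DIRTY;              // UNDIRTY: write side clears it
    std::cout << "after undirty: " << oi.is_dirty() << "\n";
}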
@@ -2523,8 +2768,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ERANGE;
else if (ver > oi.user_version)
result = -EOVERFLOW;
- break;
}
+ break;
case CEPH_OSD_OP_LIST_WATCHERS:
++ctx->num_read;
@@ -2707,6 +2952,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
<< ", adjusting write length to " << op.extent.length << dendl;
+ bufferlist t;
+ t.substr_of(osd_op.indata, 0, op.extent.length);
+ osd_op.indata.swap(t);
}
if (op.extent.truncate_seq > seq) {
// write arrives before trimtrunc
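The added substr_of/swap is a correctness fix: once the write length is clamped by the truncate_seq check, the payload itself must be trimmed to match, or the transaction would still write the untrimmed bytes. The clamp-and-trim in isolation, using std::string in place of bufferlist:

#include <cstdint>
#include <iostream>
#include <string>

int main() {
    uint64_t object_size = 10, write_offset = 4;
    std::string indata = "0123456789ABCDEF";   // 16-byte payload

    // Clamp the write so it cannot extend past the already-truncated object.
    uint64_t length = write_offset > object_size ? 0 : object_size - write_offset;

    // Trim the payload to match: the analogue of bufferlist::substr_of + swap.
    indata = indata.substr(0, length);

    std::cout << "write " << length << " bytes: " << indata << "\n";  // 6 bytes
}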
@@ -2829,7 +3077,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
break;
-
+
case CEPH_OSD_OP_TRIMTRUNC:
op.extent.offset = op.extent.truncate_size;
    // fall through
@@ -2960,7 +3208,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_SETXATTR:
++ctx->num_write;
{
- if (op.xattr.value_len > cct->_conf->osd_max_attr_size) {
+ if (cct->_conf->osd_max_attr_size > 0 &&
+ op.xattr.value_len > cct->_conf->osd_max_attr_size) {
result = -EFBIG;
break;
}
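SETXATTR's size check now treats osd_max_attr_size = 0 as "unlimited" instead of rejecting every attribute. Reduced to a hypothetical helper:

#include <cerrno>
#include <cstdint>
#include <iostream>

// Returns 0 if the attribute fits; limit == 0 disables the check entirely.
int check_attr_size(uint64_t value_len, uint64_t limit) {
    if (limit > 0 && value_len > limit)
        return -EFBIG;
    return 0;
}

int main() {
    std::cout << check_attr_size(4096, 0) << "\n";     // 0: unlimited
    std::cout << check_attr_size(4096, 1024) << "\n";  // -EFBIG
}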
@@ -3058,11 +3307,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
- if (cct->_conf->osd_tmapput_sets_uses_tmap) {
- assert(cct->_conf->osd_auto_upgrade_tmap);
- oi.uses_tmap = true;
- }
-
// write it
vector<OSDOp> nops(1);
OSDOp& newop = nops[0];
@@ -3108,29 +3352,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
set<string> out_set;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETKEYS: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- map<string, bufferlist>::iterator iter =
- vals.upper_bound(start_after);
- for (uint64_t i = 0;
- i < max_return && iter != vals.end();
- ++i, iter++) {
- out_set.insert(iter->first);
- }
- ::encode(out_set, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- dout(10) << "failed, reading from omap" << dendl;
- // No valid tmap, use omap
- }
-
{
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, soid
@@ -3166,30 +3387,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
map<string, bufferlist> out_set;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETVALS: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- map<string, bufferlist>::iterator iter = vals.upper_bound(start_after);
- if (filter_prefix > start_after) iter = vals.lower_bound(filter_prefix);
- for (uint64_t i = 0;
- i < max_return && iter != vals.end() &&
- iter->first.substr(0, filter_prefix.size()) == filter_prefix;
- ++i, iter++) {
- out_set.insert(*iter);
- }
- ::encode(out_set, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- // No valid tmap, use omap
- dout(10) << "failed, reading from omap" << dendl;
- }
-
{
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, soid
@@ -3217,19 +3414,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPGETHEADER:
++ctx->num_read;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETHEADER: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- osd_op.outdata.claim(header);
- break;
- }
- // No valid tmap, fall through to omap
- dout(10) << "failed, reading from omap" << dendl;
- }
osd->store->omap_get_header(coll, soid, &osd_op.outdata);
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
ctx->delta_stats.num_rd++;
@@ -3248,28 +3432,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
goto fail;
}
map<string, bufferlist> out;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGET: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- for (set<string>::iterator iter = keys_to_get.begin();
- iter != keys_to_get.end();
- ++iter) {
- if (vals.count(*iter)) {
- out.insert(*(vals.find(*iter)));
- }
- }
- ::encode(out, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- // No valid tmap, use omap
- dout(10) << "failed, reading from omap" << dendl;
- }
osd->store->omap_get_values(coll, soid, keys_to_get, &out);
::encode(out, osd_op.outdata);
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
@@ -3347,9 +3509,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPSETVALS:
++ctx->num_write;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
@@ -3377,9 +3536,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPSETHEADER:
++ctx->num_write;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
@@ -3397,9 +3553,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
t.touch(coll, soid);
t.omap_clear(coll, soid);
ctx->delta_stats.num_wr++;
@@ -3413,9 +3566,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
t.touch(coll, soid);
set<string> to_rm;
try {
@@ -3532,7 +3682,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
hobject_t src(src_name, src_oloc.key, src_snapid,
raw_pg.ps(), raw_pg.pool(),
src_oloc.nspace);
- result = start_copy(ctx, src, src_oloc, src_version, &ctx->copy_op);
+ if (src == soid) {
+ dout(20) << " copy from self is invalid" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ result = start_copy(ctx, src, src_oloc, src_version);
if (result < 0)
goto fail;
result = -EINPROGRESS;
@@ -3584,22 +3739,6 @@ int ReplicatedPG::_get_tmap(OpContext *ctx,
return 0;
}
-int ReplicatedPG::_copy_up_tmap(OpContext *ctx)
-{
- dout(20) << "copying up tmap for " << ctx->new_obs.oi.soid << dendl;
- ctx->new_obs.oi.uses_tmap = false;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r < 0)
- return 0;
- ctx->op_t.omap_setkeys(coll, ctx->new_obs.oi.soid,
- vals);
- ctx->op_t.omap_setheader(coll, ctx->new_obs.oi.soid,
- header);
- return 0;
-}
-
inline int ReplicatedPG::_delete_head(OpContext *ctx)
{
SnapSet& snapset = ctx->new_snapset;
@@ -3647,11 +3786,11 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
if (ret) {
- if (-ENOENT == ret) {
+ if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) {
// there's no snapshot here, or there's no object.
// if there's no snapshot, we delete the object; otherwise, do nothing.
dout(20) << "_rollback_to deleting head on " << soid.oid
- << " because got ENOENT on find_object_context" << dendl;
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
if (ctx->obc->obs.oi.watchers.size()) {
// Cannot delete an object with watchers
ret = -EBUSY;
@@ -3752,6 +3891,15 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
dout(20) << "make_writeable " << soid << " snapset=" << ctx->snapset
<< " snapc=" << snapc << dendl;;
+ // we will mark the object dirty
+ if (ctx->undirty) {
+ dout(20) << " clearing DIRTY flag" << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ } else {
+ dout(20) << " setting DIRTY flag" << dendl;
+ ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
+ }
+
// use newer snapc?
if (ctx->new_snapset.seq > snapc.seq) {
snapc.seq = ctx->new_snapset.seq;
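With this change every mutation funnels through make_writeable(), which is where the DIRTY bit converges: an ordinary write sets it, a write carrying UNDIRTY clears it. Sketched in isolation (the flag name mirrors the patch; the context struct is a stand-in):

#include <iostream>

enum : unsigned { FLAG_DIRTY = 1, FLAG_LOST = 2 };

struct ObjectInfo {
    unsigned flags = 0;
    void set_flag(unsigned f)   { flags |= f; }
    void clear_flag(unsigned f) { flags &= ~f; }
};

struct OpContext {
    bool undirty = false;   // set by CEPH_OSD_OP_UNDIRTY
    ObjectInfo new_oi;
};

void make_writeable(OpContext& ctx) {
    if (ctx.undirty)
        ctx.new_oi.clear_flag(FLAG_DIRTY);   // explicit "mark clean" request
    else
        ctx.new_oi.set_flag(FLAG_DIRTY);     // any other mutation dirties
}

int main() {
    OpContext ctx;
    make_writeable(ctx);
    std::cout << "dirty: " << bool(ctx.new_oi.flags & FLAG_DIRTY) << "\n";  // 1
    ctx.undirty = true;
    make_writeable(ctx);
    std::cout << "dirty: " << bool(ctx.new_oi.flags & FLAG_DIRTY) << "\n";  // 0
}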
@@ -3962,19 +4110,9 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
}
}
-bool ReplicatedPG::have_temp_coll()
-{
- return temp_created || osd->store->collection_exists(temp_coll);
-}
-
coll_t ReplicatedPG::get_temp_coll(ObjectStore::Transaction *t)
{
- if (temp_created)
- return temp_coll;
- if (!osd->store->collection_exists(temp_coll))
- t->create_collection(temp_coll);
- temp_created = true;
- return temp_coll;
+ return pgbackend->get_temp_coll(t);
}
hobject_t ReplicatedPG::generate_temp_object()
@@ -3982,6 +4120,7 @@ hobject_t ReplicatedPG::generate_temp_object()
ostringstream ss;
ss << "temp_" << info.pgid << "_" << get_role() << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
hobject_t hoid(object_t(ss.str()), "", CEPH_NOSNAP, 0, -1, "");
+ pgbackend->add_temp_obj(hoid);
dout(20) << __func__ << " " << hoid << dendl;
return hoid;
}
@@ -4154,8 +4293,7 @@ struct C_Copyfrom : public Context {
};
int ReplicatedPG::start_copy(OpContext *ctx,
- hobject_t src, object_locator_t oloc, version_t version,
- CopyOpRef *pcop)
+ hobject_t src, object_locator_t oloc, version_t version)
{
const hobject_t& dest = ctx->obs->oi.soid;
dout(10) << __func__ << " " << dest << " ctx " << ctx
@@ -4247,7 +4385,6 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
if (cop->temp_cursor.is_initial()) {
cop->temp_coll = get_temp_coll(&tctx->local_t);
cop->temp_oid = generate_temp_object();
- temp_contents.insert(cop->temp_oid);
repop->ctx->new_temp_oid = cop->temp_oid;
}
@@ -4255,6 +4392,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
issue_repop(repop, repop->ctx->mtime);
eval_repop(repop);
+ repop->put();
dout(10) << __func__ << " fetching more" << dendl;
_copy_some(ctx, cop);
@@ -4316,7 +4454,7 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
// finish writing to temp object, then move into place
_write_copy_chunk(cop, &t);
t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, obs.oi.soid);
- temp_contents.erase(cop->temp_oid);
+ pgbackend->clear_temp_obj(cop->temp_oid);
ctx->discard_temp_oid = cop->temp_oid;
}
@@ -4358,19 +4496,12 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop)
delete ctx;
}
-void ReplicatedPG::requeue_cancel_copy_ops(bool requeue)
+void ReplicatedPG::cancel_copy_ops()
{
dout(10) << __func__ << dendl;
- for (map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
- p != copy_ops.end();
- copy_ops.erase(p++)) {
- // requeue initiating copy *and* any subsequent waiters
- CopyOpRef cop = p->second;
- if (requeue) {
- cop->waiting.push_front(cop->ctx->op);
- requeue_ops(cop->waiting);
- }
- cancel_copy(cop);
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
+ while (p != copy_ops.end()) {
+ cancel_copy((p++)->second);
}
}
@@ -4881,7 +5012,8 @@ void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
void ReplicatedPG::populate_obc_watchers(ObjectContextRef obc)
{
assert(is_active());
- assert(!is_missing_object(obc->obs.oi.soid) ||
+ assert((recovering.count(obc->obs.oi.soid) ||
+ !is_missing_object(obc->obs.oi.soid)) ||
(pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
pg_log_entry_t::LOST_REVERT &&
@@ -4974,6 +5106,7 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
// obc ref swallowed by repop!
issue_repop(repop, repop->ctx->mtime);
eval_repop(repop);
+ repop->put();
}
ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
@@ -4993,23 +5126,37 @@ ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
}
ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
- bool can_create)
-{
+ bool can_create,
+ map<string, bufferptr> *attrs)
+{
+ assert(
+ attrs || !pg_log.get_missing().is_missing(soid) ||
+ // or this is a revert... see recover_primary()
+ (pg_log.get_log().objects.count(soid) &&
+ pg_log.get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT));
ObjectContextRef obc = object_contexts.lookup(soid);
if (obc) {
dout(10) << "get_object_context " << obc << " " << soid << dendl;
} else {
// check disk
bufferlist bv;
- int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
- if (r < 0) {
- if (!can_create)
- return ObjectContextRef(); // -ENOENT!
-
- // new object.
- object_info_t oi(soid);
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- return create_object_context(oi, ssc);
+ if (attrs) {
+ assert(attrs->count(OI_ATTR));
+ bv.push_back(attrs->find(OI_ATTR)->second);
+ } else {
+ int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
+ if (r < 0) {
+ if (!can_create)
+ return ObjectContextRef(); // -ENOENT!
+
+ // new object.
+ object_info_t oi(soid);
+ SnapSetContext *ssc = get_snapset_context(
+ soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace(),
+ soid.has_snapset() ? attrs : 0);
+ return create_object_context(oi, ssc);
+ }
}
object_info_t oi(bv);
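get_object_context() can now be handed the object's xattrs directly, as they arrive in a push, instead of reading them with getattr(); that is what lets recovery build an ObjectContext for an object that is still missing locally. The fallback logic, reduced to a sketch over a toy store:

#include <iostream>
#include <map>
#include <optional>
#include <string>

using AttrMap = std::map<std::string, std::string>;

std::map<std::string, AttrMap> store;  // toy object store: soid -> attrs

// Prefer caller-supplied attrs (recovery path); otherwise hit the store.
std::optional<std::string> get_object_info(const std::string& soid,
                                           const AttrMap* attrs) {
    if (attrs) {
        auto it = attrs->find("_");   // OI_ATTR analogue
        return it == attrs->end() ? std::nullopt
                                  : std::optional<std::string>(it->second);
    }
    auto obj = store.find(soid);
    if (obj == store.end()) return std::nullopt;   // -ENOENT
    auto it = obj->second.find("_");
    return it == obj->second.end() ? std::nullopt
                                   : std::optional<std::string>(it->second);
}

int main() {
    AttrMap pushed = {{"_", "oi-from-push"}};
    std::cout << *get_object_info("missing_obj", &pushed) << "\n"; // no store read
    store["local_obj"]["_"] = "oi-from-disk";
    std::cout << *get_object_info("local_obj", nullptr) << "\n";
}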
@@ -5021,10 +5168,11 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
obc->obs.oi = oi;
obc->obs.exists = true;
- if (can_create) {
- obc->ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- register_snapset_context(obc->ssc);
- }
+ obc->ssc = get_snapset_context(
+ soid.oid, soid.get_key(), soid.hash,
+ true, soid.get_namespace(),
+ soid.has_snapset() ? attrs : 0);
+ register_snapset_context(obc->ssc);
populate_obc_watchers(obc);
dout(10) << "get_object_context " << obc << " " << soid << " 0 -> 1 read " << obc->obs.oi << dendl;
@@ -5228,10 +5376,10 @@ void ReplicatedPG::kick_object_context_blocked(ObjectContextRef obc)
return;
}
- list<OpRequestRef>& ls = waiting_for_blocked_object[soid];
+ list<OpRequestRef>& ls = p->second;
dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
requeue_ops(ls);
- waiting_for_blocked_object.erase(soid);
+ waiting_for_blocked_object.erase(p);
}
SnapSetContext *ReplicatedPG::create_snapset_context(const object_t& oid)
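kick_object_context_blocked() already holds an iterator from its find(), so the rewrite above reuses it for both the list reference and the erase rather than looking the key up twice more. The pattern in general form:

#include <iostream>
#include <list>
#include <map>
#include <string>

int main() {
    std::map<std::string, std::list<int>> waiting = {{"obj", {1, 2, 3}}};

    auto p = waiting.find("obj");
    if (p == waiting.end()) return 0;

    std::list<int>& ls = p->second;   // reuse the iterator: no second find()
    std::cout << "requeue " << ls.size() << " ops\n";
    waiting.erase(p);                 // erase by iterator: no third lookup
}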
@@ -5243,11 +5391,13 @@ SnapSetContext *ReplicatedPG::create_snapset_context(const object_t& oid)
return ssc;
}
-SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
- const string& key,
- ps_t seed,
- bool can_create,
- const string& nspace)
+SnapSetContext *ReplicatedPG::get_snapset_context(
+ const object_t& oid,
+ const string& key,
+ ps_t seed,
+ bool can_create,
+ const string& nspace,
+ map<string, bufferptr> *attrs)
{
Mutex::Locker l(snapset_contexts_lock);
SnapSetContext *ssc;
@@ -5256,20 +5406,25 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
ssc = p->second;
} else {
bufferlist bv;
- hobject_t head(oid, key, CEPH_NOSNAP, seed,
- info.pgid.pool(), nspace);
- int r = osd->store->getattr(coll, head, SS_ATTR, bv);
- if (r < 0) {
- // try _snapset
- hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
- info.pgid.pool(), nspace);
- r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
- if (r < 0 && !can_create)
- return NULL;
+ if (!attrs) {
+ hobject_t head(oid, key, CEPH_NOSNAP, seed,
+ info.pgid.pool(), nspace);
+ int r = osd->store->getattr(coll, head, SS_ATTR, bv);
+ if (r < 0) {
+ // try _snapset
+ hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
+ info.pgid.pool(), nspace);
+ r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
+ if (r < 0 && !can_create)
+ return NULL;
+ }
+ } else {
+ assert(attrs->count(SS_ATTR));
+ bv.push_back(attrs->find(SS_ATTR)->second);
}
ssc = new SnapSetContext(oid);
_register_snapset_context(ssc);
- if (r >= 0) {
+ if (bv.length()) {
bufferlist::iterator bvp = bv.begin();
ssc->snapset.decode(bvp);
}
@@ -5345,12 +5500,12 @@ void ReplicatedPG::sub_op_modify(OpRequestRef op)
if (m->new_temp_oid != hobject_t()) {
dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl;
- temp_contents.insert(m->new_temp_oid);
+ pgbackend->add_temp_obj(m->new_temp_oid);
get_temp_coll(&rm->localt);
}
if (m->discard_temp_oid != hobject_t()) {
dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl;
- temp_contents.erase(m->discard_temp_oid);
+ pgbackend->clear_temp_obj(m->discard_temp_oid);
}
::decode(rm->opt, p);
@@ -5475,7 +5630,7 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
<< last_peering_reset << dendl;
}
- log_subop_stats(rm->op, l_osd_sop_w_inb, l_osd_sop_w_lat);
+ log_subop_stats(osd, rm->op, l_osd_sop_w_inb, l_osd_sop_w_lat);
bool done = rm->applied && rm->committed;
unlock();
if (done) {
@@ -5516,11 +5671,12 @@ void ReplicatedPG::sub_op_modify_reply(OpRequestRef op)
// ===========================================================
-void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
- pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+void ReplicatedBackend::calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets)
{
dout(10) << "calc_head_subsets " << head
<< " clone_overlap " << snapset.clone_overlap << dendl;
@@ -5570,11 +5726,12 @@ void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, con
<< " clone_subsets " << clone_subsets << dendl;
}
-void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
- const pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+void ReplicatedBackend::calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& soid,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets)
{
dout(10) << "calc_clone_subsets " << soid
<< " clone_overlap " << snapset.clone_overlap << dendl;
@@ -5659,95 +5816,69 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
*/
enum { PULL_NONE, PULL_OTHER, PULL_YES };
-int ReplicatedPG::prepare_pull(
- const hobject_t& soid, eversion_t v,
- int priority,
- map<int, vector<PullOp> > *pulls)
-{
+void ReplicatedBackend::prepare_pull(
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h)
+{
+ assert(get_parent()->get_local_missing().missing.count(soid));
+ eversion_t v = get_parent()->get_local_missing().missing.find(
+ soid)->second.need;
+ const map<hobject_t, set<int> > &missing_loc(
+ get_parent()->get_missing_loc());
+ const map<int, pg_missing_t > &peer_missing(
+ get_parent()->get_peer_missing());
int fromosd = -1;
- map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
- if (q != missing_loc.end()) {
- // randomize the list of possible sources
- // should we take weights into account?
- vector<int> shuffle(q->second.begin(), q->second.end());
- random_shuffle(shuffle.begin(), shuffle.end());
- for (vector<int>::iterator p = shuffle.begin();
- p != shuffle.end();
- ++p) {
- if (get_osdmap()->is_up(*p)) {
- fromosd = *p;
- break;
- }
- }
- }
- if (fromosd < 0) {
- dout(7) << "pull " << soid
- << " v " << v
- << " but it is unfound" << dendl;
- return PULL_NONE;
- }
+ map<hobject_t,set<int> >::const_iterator q = missing_loc.find(soid);
+ assert(q != missing_loc.end());
+ assert(!q->second.empty());
+
+ // pick a pullee
+ vector<int> shuffle(q->second.begin(), q->second.end());
+ random_shuffle(shuffle.begin(), shuffle.end());
+ vector<int>::iterator p = shuffle.begin();
+ assert(get_osdmap()->is_up(*p));
+ fromosd = *p;
+ assert(fromosd >= 0);
+
+ dout(7) << "pull " << soid
+	  << " v " << v
+ << " on osds " << *p
+ << " from osd." << fromosd
+ << dendl;
assert(peer_missing.count(fromosd));
- if (peer_missing[fromosd].is_missing(soid, v)) {
- assert(peer_missing[fromosd].missing[soid].have != v);
+ const pg_missing_t &pmissing = peer_missing.find(fromosd)->second;
+ if (pmissing.is_missing(soid, v)) {
+ assert(pmissing.missing.find(soid)->second.have != v);
dout(10) << "pulling soid " << soid << " from osd " << fromosd
- << " at version " << peer_missing[fromosd].missing[soid].have
+ << " at version " << pmissing.missing.find(soid)->second.have
<< " rather than at version " << v << dendl;
- v = peer_missing[fromosd].missing[soid].have;
- assert(pg_log.get_log().objects.count(soid) &&
- pg_log.get_log().objects.find(soid)->second->op == pg_log_entry_t::LOST_REVERT &&
- pg_log.get_log().objects.find(soid)->second->reverting_to == v);
+ v = pmissing.missing.find(soid)->second.have;
+ assert(get_parent()->get_log().get_log().objects.count(soid) &&
+ (get_parent()->get_log().get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT) &&
+ (get_parent()->get_log().get_log().objects.find(
+ soid)->second->reverting_to ==
+ v));
}
- dout(7) << "pull " << soid
- << " v " << v
- << " on osds " << missing_loc[soid]
- << " from osd." << fromosd
- << dendl;
-
ObjectRecoveryInfo recovery_info;
- // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
- if (soid.snap && soid.snap < CEPH_NOSNAP) {
- // do we have the head and/or snapdir?
- hobject_t head = soid;
- head.snap = CEPH_NOSNAP;
- if (pg_log.get_missing().is_missing(head)) {
- if (pulling.count(head)) {
- dout(10) << " missing but already pulling head " << head << dendl;
- return PULL_NONE;
- } else {
- int r = prepare_pull(
- head, pg_log.get_missing().missing.find(head)->second.need, priority,
- pulls);
- if (r != PULL_NONE)
- return PULL_OTHER;
- return PULL_NONE;
- }
- }
- head.snap = CEPH_SNAPDIR;
- if (pg_log.get_missing().is_missing(head)) {
- if (pulling.count(head)) {
- dout(10) << " missing but already pulling snapdir " << head << dendl;
- return PULL_NONE;
- } else {
- int r = prepare_pull(
- head, pg_log.get_missing().missing.find(head)->second.need, priority,
- pulls);
- if (r != PULL_NONE)
- return PULL_OTHER;
- return PULL_NONE;
- }
- }
-
+ if (soid.is_snap()) {
+ assert(!get_parent()->get_local_missing().is_missing(
+ soid.get_head()) ||
+ !get_parent()->get_local_missing().is_missing(
+ soid.get_snapdir()));
+ assert(headctx);
// check snapset
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = headctx->ssc;
assert(ssc);
dout(10) << " snapset " << ssc->snapset << dendl;
- calc_clone_subsets(ssc->snapset, soid, pg_log.get_missing(), info.last_backfill,
+ calc_clone_subsets(ssc->snapset, soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
recovery_info.copy_subset,
recovery_info.clone_subset);
- put_snapset_context(ssc);
// FIXME: this may overestimate if we are pulling multiple clones in parallel...
dout(10) << " pulling " << recovery_info << dendl;
} else {
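Source selection keeps its old spirit but moves into the backend: shuffle the OSDs known to hold the object and take the first, asserting it is up since recover_missing() has already handled the unfound case. A standalone rendering (the patch uses random_shuffle; std::shuffle is its modern equivalent):

#include <algorithm>
#include <iostream>
#include <random>
#include <set>
#include <vector>

int main() {
    std::set<int> missing_loc = {3, 7, 12};        // OSDs that have the object

    std::vector<int> shuffle(missing_loc.begin(), missing_loc.end());
    std::mt19937 rng(std::random_device{}());
    std::shuffle(shuffle.begin(), shuffle.end(), rng);

    int fromosd = shuffle.front();                 // pick one source at random
    std::cout << "pulling from osd." << fromosd << "\n";
}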
@@ -5757,8 +5888,8 @@ int ReplicatedPG::prepare_pull(
recovery_info.size = ((uint64_t)-1);
}
- (*pulls)[fromosd].push_back(PullOp());
- PullOp &op = (*pulls)[fromosd].back();
+ h->pulls[fromosd].push_back(PullOp());
+ PullOp &op = h->pulls[fromosd].back();
op.soid = soid;
op.recovery_info = recovery_info;
@@ -5772,11 +5903,78 @@ int ReplicatedPG::prepare_pull(
assert(!pulling.count(soid));
pull_from_peer[fromosd].insert(soid);
PullInfo &pi = pulling[soid];
+ pi.head_ctx = headctx;
pi.recovery_info = op.recovery_info;
pi.recovery_progress = op.recovery_progress;
- pi.priority = priority;
+}
+int ReplicatedPG::recover_missing(
+ const hobject_t &soid, eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h)
+{
+ map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
+ if (q == missing_loc.end()) {
+ dout(7) << "pull " << soid
+ << " v " << v
+ << " but it is unfound" << dendl;
+ return PULL_NONE;
+ }
+
+ // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
+ ObjectContextRef obc;
+ ObjectContextRef head_obc;
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head and/or snapdir?
+ hobject_t head = soid.get_head();
+ if (pg_log.get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, pg_log.get_missing().missing.find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_OTHER;
+ return PULL_NONE;
+ }
+ }
+ head = soid.get_snapdir();
+ if (pg_log.get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering snapdir " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, pg_log.get_missing().missing.find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_OTHER;
+ return PULL_NONE;
+ }
+ }
+
+ // we must have one or the other
+ head_obc = get_object_context(
+ soid.get_head(),
+ false,
+ 0);
+ if (!head_obc)
+ head_obc = get_object_context(
+ soid.get_snapdir(),
+ false,
+ 0);
+ assert(head_obc);
+ }
start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(soid);
+ pgbackend->recover_object(
+ soid,
+ head_obc,
+ obc,
+ h);
return PULL_YES;
}
@@ -5800,15 +5998,14 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
* intelligently push an object to a replica. make use of existing
* clones/heads and dup data ranges where possible.
*/
-void ReplicatedPG::prep_push_to_replica(
+void ReplicatedBackend::prep_push_to_replica(
ObjectContextRef obc, const hobject_t& soid, int peer,
- int prio,
PushOp *pop)
{
const object_info_t& oi = obc->obs.oi;
uint64_t size = obc->obs.oi.size;
- dout(10) << __func__ << soid << " v" << oi.version
+ dout(10) << __func__ << ": " << soid << " v" << oi.version
<< " size " << size << " to osd." << peer << dendl;
map<hobject_t, interval_set<uint64_t> > clone_subsets;
@@ -5821,41 +6018,48 @@ void ReplicatedPG::prep_push_to_replica(
    // try to base push off of clones that succeed/precede poid
// we need the head (and current SnapSet) locally to do that.
- if (pg_log.get_missing().is_missing(head)) {
+ if (get_parent()->get_local_missing().is_missing(head)) {
dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl;
- return prep_push(prio, obc, soid, peer, pop);
+ return prep_push(obc, soid, peer, pop);
}
hobject_t snapdir = head;
snapdir.snap = CEPH_SNAPDIR;
- if (pg_log.get_missing().is_missing(snapdir)) {
- dout(15) << "push_to_replica missing snapdir " << snapdir << ", pushing raw clone" << dendl;
- return prep_push(prio, obc, soid, peer, pop);
+ if (get_parent()->get_local_missing().is_missing(snapdir)) {
+ dout(15) << "push_to_replica missing snapdir " << snapdir
+ << ", pushing raw clone" << dendl;
+ return prep_push(obc, soid, peer, pop);
}
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- calc_clone_subsets(ssc->snapset, soid, peer_missing[peer],
- peer_info[peer].last_backfill,
+ map<int, pg_missing_t>::const_iterator pm =
+ get_parent()->get_peer_missing().find(peer);
+ assert(pm != get_parent()->get_peer_missing().end());
+ map<int, pg_info_t>::const_iterator pi =
+ get_parent()->get_peer_info().find(peer);
+ assert(pi != get_parent()->get_peer_info().end());
+ calc_clone_subsets(ssc->snapset, soid,
+ pm->second,
+ pi->second.last_backfill,
data_subset, clone_subsets);
- put_snapset_context(ssc);
} else if (soid.snap == CEPH_NOSNAP) {
// pushing head or unversioned object.
    // base this partially on replica's clones?
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer],
- peer_info[peer].last_backfill,
- data_subset, clone_subsets);
- put_snapset_context(ssc);
+ calc_head_subsets(
+ obc,
+ ssc->snapset, soid, get_parent()->get_peer_missing().find(peer)->second,
+ get_parent()->get_peer_info().find(peer)->second.last_backfill,
+ data_subset, clone_subsets);
}
- prep_push(prio, obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
+ prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
}
-void ReplicatedPG::prep_push(int prio,
- ObjectContextRef obc,
+void ReplicatedBackend::prep_push(ObjectContextRef obc,
const hobject_t& soid, int peer,
PushOp *pop)
{
@@ -5864,13 +6068,12 @@ void ReplicatedPG::prep_push(int prio,
data_subset.insert(0, obc->obs.oi.size);
map<hobject_t, interval_set<uint64_t> > clone_subsets;
- prep_push(prio, obc, soid, peer,
+ prep_push(obc, soid, peer,
obc->obs.oi.version, data_subset, clone_subsets,
pop);
}
-void ReplicatedPG::prep_push(
- int prio,
+void ReplicatedBackend::prep_push(
ObjectContextRef obc,
const hobject_t& soid, int peer,
eversion_t version,
@@ -5878,9 +6081,10 @@ void ReplicatedPG::prep_push(
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
PushOp *pop)
{
- peer_missing[peer].revise_have(soid, eversion_t());
+ get_parent()->begin_peer_recover(peer, soid);
// take note.
PushInfo &pi = pushing[soid][peer];
+ pi.obc = obc;
pi.recovery_info.size = obc->obs.oi.size;
pi.recovery_info.copy_subset = data_subset;
pi.recovery_info.clone_subset = clone_subsets;
@@ -5891,19 +6095,20 @@ void ReplicatedPG::prep_push(
pi.recovery_progress.data_recovered_to = 0;
pi.recovery_progress.data_complete = 0;
pi.recovery_progress.omap_complete = 0;
- pi.priority = prio;
ObjectRecoveryProgress new_progress;
- build_push_op(pi.recovery_info,
- pi.recovery_progress,
- &new_progress,
- pop);
+ int r = build_push_op(pi.recovery_info,
+ pi.recovery_progress,
+ &new_progress,
+ pop,
+ &(pi.stat));
+ assert(r == 0);
pi.recovery_progress = new_progress;
}
-int ReplicatedPG::send_pull_legacy(int prio, int peer,
- const ObjectRecoveryInfo &recovery_info,
- ObjectRecoveryProgress progress)
+int ReplicatedBackend::send_pull_legacy(int prio, int peer,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectRecoveryProgress progress)
{
// send op
tid_t tid = osd->get_tid();
@@ -5916,7 +6121,7 @@ int ReplicatedPG::send_pull_legacy(int prio, int peer,
<< " from osd." << peer
<< " tid " << tid << dendl;
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, recovery_info.soid,
+ MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, recovery_info.soid,
false, CEPH_OSD_FLAG_ACK,
get_osdmap()->get_epoch(), tid,
recovery_info.version);
@@ -5933,7 +6138,7 @@ int ReplicatedPG::send_pull_legacy(int prio, int peer,
return 0;
}
-void ReplicatedPG::submit_push_data(
+void ReplicatedBackend::submit_push_data(
ObjectRecoveryInfo &recovery_info,
bool first,
bool complete,
@@ -5955,9 +6160,7 @@ void ReplicatedPG::submit_push_data(
}
if (first) {
- pg_log.revise_have(recovery_info.soid, eversion_t());
- remove_snap_mapped_object(*t, recovery_info.soid);
- t->remove(coll, recovery_info.soid);
+ get_parent()->on_local_recover_start(recovery_info.soid, t);
t->remove(get_temp_coll(t), recovery_info.soid);
t->touch(target_coll, recovery_info.soid);
t->omap_setheader(target_coll, recovery_info.soid, omap_header);
@@ -5991,8 +6194,8 @@ void ReplicatedPG::submit_push_data(
}
}
-void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info,
- ObjectStore::Transaction *t)
+void ReplicatedBackend::submit_push_complete(ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
{
for (map<hobject_t, interval_set<uint64_t> >::const_iterator p =
recovery_info.clone_subset.begin();
@@ -6007,67 +6210,29 @@ void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info,
q.get_start(), q.get_len(), q.get_start());
}
}
-
- if (recovery_info.soid.snap < CEPH_NOSNAP) {
- assert(recovery_info.oi.snaps.size());
- OSDriver::OSTransaction _t(osdriver.get_transaction(t));
- set<snapid_t> snaps(
- recovery_info.oi.snaps.begin(),
- recovery_info.oi.snaps.end());
- snap_mapper.add_oid(
- recovery_info.soid,
- snaps,
- &_t);
- }
-
- if (pg_log.get_missing().is_missing(recovery_info.soid) &&
- pg_log.get_missing().missing.find(recovery_info.soid)->second.need > recovery_info.version) {
- assert(is_primary());
- const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
- if (latest->op == pg_log_entry_t::LOST_REVERT &&
- latest->reverting_to == recovery_info.version) {
- dout(10) << " got old revert version " << recovery_info.version
- << " for " << *latest << dendl;
- recovery_info.version = latest->version;
- // update the attr to the revert event version
- recovery_info.oi.prior_version = recovery_info.oi.version;
- recovery_info.oi.version = latest->version;
- bufferlist bl;
- ::encode(recovery_info.oi, bl);
- t->setattr(coll, recovery_info.soid, OI_ATTR, bl);
- }
- }
- recover_got(recovery_info.soid, recovery_info.version);
-
- // update pg
- dirty_info = true;
- write_if_dirty(*t);
}
-ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recovery_info)
+ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc)
{
if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP)
return recovery_info;
-
- SnapSetContext *ssc = get_snapset_context(recovery_info.soid.oid,
- recovery_info.soid.get_key(),
- recovery_info.soid.hash,
- false,
- recovery_info.soid.get_namespace());
- assert(ssc);
ObjectRecoveryInfo new_info = recovery_info;
new_info.copy_subset.clear();
new_info.clone_subset.clear();
assert(ssc);
- calc_clone_subsets(ssc->snapset, new_info.soid, pg_log.get_missing(), info.last_backfill,
+ calc_clone_subsets(ssc->snapset, new_info.soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
new_info.copy_subset, new_info.clone_subset);
- put_snapset_context(ssc);
return new_info;
}
-bool ReplicatedPG::handle_pull_response(
+bool ReplicatedBackend::handle_pull_response(
int from, PushOp &pop, PullOp *response,
- ObjectStore::Transaction *t)
+ list<ObjectContextRef> *to_continue,
+ ObjectStore::Transaction *t
+ )
{
interval_set<uint64_t> data_included = pop.data_included;
bufferlist data;
@@ -6099,7 +6264,13 @@ bool ReplicatedPG::handle_pull_response(
pop.recovery_info.copy_subset);
}
- pi.recovery_info = recalc_subsets(pi.recovery_info);
+ bool first = pi.recovery_progress.first;
+ if (first) {
+ pi.obc = get_parent()->get_obc(pi.recovery_info.soid, pop.attrset);
+ pi.recovery_info.oi = pi.obc->obs.oi;
+ pi.recovery_info = recalc_subsets(pi.recovery_info, pi.obc->ssc);
+ }
+
interval_set<uint64_t> usable_intervals;
bufferlist usable_data;
@@ -6111,33 +6282,15 @@ bool ReplicatedPG::handle_pull_response(
data_included = usable_intervals;
data.claim(usable_data);
- info.stats.stats.sum.num_bytes_recovered += data.length();
- bool first = pi.recovery_progress.first;
pi.recovery_progress = pop.after_progress;
+ pi.stat.num_bytes_recovered += data.length();
+
dout(10) << "new recovery_info " << pi.recovery_info
<< ", new progress " << pi.recovery_progress
<< dendl;
- if (first) {
- bufferlist oibl;
- if (pop.attrset.count(OI_ATTR)) {
- oibl.push_back(pop.attrset[OI_ATTR]);
- ::decode(pi.recovery_info.oi, oibl);
- } else {
- assert(0);
- }
- bufferlist ssbl;
- if (pop.attrset.count(SS_ATTR)) {
- ssbl.push_back(pop.attrset[SS_ATTR]);
- ::decode(pi.recovery_info.ss, ssbl);
- } else {
- assert(pi.recovery_info.soid.snap != CEPH_NOSNAP &&
- pi.recovery_info.soid.snap != CEPH_SNAPDIR);
- }
- }
-
bool complete = pi.is_complete();
submit_push_data(pi.recovery_info, first,
@@ -6148,53 +6301,17 @@ bool ReplicatedPG::handle_pull_response(
pop.omap_entries,
t);
- info.stats.stats.sum.num_keys_recovered += pop.omap_entries.size();
-
- if (complete) {
- info.stats.stats.sum.num_objects_recovered++;
-
- SnapSetContext *ssc;
- if (hoid.snap == CEPH_NOSNAP || hoid.snap == CEPH_SNAPDIR) {
- ssc = create_snapset_context(hoid.oid);
- ssc->snapset = pi.recovery_info.ss;
- } else {
- ssc = get_snapset_context(hoid.oid, hoid.get_key(), hoid.hash, false,
- hoid.get_namespace());
- assert(ssc);
- }
- ObjectContextRef obc = create_object_context(pi.recovery_info.oi, ssc);
- obc->obs.exists = true;
-
- obc->ondisk_write_lock();
-
- // keep track of active pushes for scrub
- ++active_pushes;
-
- t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
- t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
- t->register_on_complete(
- new C_OSD_CompletedPull(this, hoid, get_osdmap()->get_epoch()));
- }
-
- t->register_on_commit(
- new C_OSD_CommittedPushedObject(
- this,
- get_osdmap()->get_epoch(),
- info.last_complete));
+ pi.stat.num_keys_recovered += pop.omap_entries.size();
if (complete) {
+ to_continue->push_back(pi.obc);
+ pi.stat.num_objects_recovered++;
+ get_parent()->on_local_recover(
+ hoid, pi.stat, pi.recovery_info, pi.obc, t);
pulling.erase(hoid);
pull_from_peer[from].erase(hoid);
- publish_stats_to_osd();
- if (waiting_for_missing_object.count(hoid)) {
- dout(20) << " kicking waiters on " << hoid << dendl;
- requeue_ops(waiting_for_missing_object[hoid]);
- waiting_for_missing_object.erase(hoid);
- if (pg_log.get_missing().missing.size() == 0) {
- requeue_ops(waiting_for_all_missing);
- waiting_for_all_missing.clear();
- }
- }
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
return false;
} else {
response->soid = pop.soid;
@@ -6210,11 +6327,11 @@ struct C_OnPushCommit : public Context {
C_OnPushCommit(ReplicatedPG *pg, OpRequestRef op) : pg(pg), op(op) {}
void finish(int) {
op->mark_event("committed");
- pg->log_subop_stats(op, l_osd_push_inb, l_osd_sop_push_lat);
+ log_subop_stats(pg->osd, op, l_osd_push_inb, l_osd_sop_push_lat);
}
};
-void ReplicatedPG::handle_push(
+void ReplicatedBackend::handle_push(
int from, PushOp &pop, PushReplyOp *response,
ObjectStore::Transaction *t)
{
@@ -6228,12 +6345,7 @@ void ReplicatedPG::handle_push(
bool complete = pop.after_progress.data_complete &&
pop.after_progress.omap_complete;
- // keep track of active pushes for scrub
- ++active_pushes;
-
response->soid = pop.recovery_info.soid;
- t->register_on_applied(
- new C_OSD_AppliedRecoveredObjectReplica(this));
submit_push_data(pop.recovery_info,
first,
complete,
@@ -6244,14 +6356,16 @@ void ReplicatedPG::handle_push(
pop.omap_entries,
t);
- t->register_on_commit(
- new C_OSD_CommittedPushedObject(
- this,
- get_osdmap()->get_epoch(),
- info.last_complete));
+ if (complete)
+ get_parent()->on_local_recover(
+ pop.recovery_info.soid,
+ object_stat_sum_t(),
+ pop.recovery_info,
+ ObjectContextRef(), // ok, is replica
+ t);
}
-void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
+void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
{
for (map<int, vector<PushOp> >::iterator i = pushes.begin();
i != pushes.end();
@@ -6275,7 +6389,7 @@ void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
uint64_t cost = 0;
uint64_t pushes = 0;
MOSDPGPush *msg = new MOSDPGPush();
- msg->pgid = info.pgid;
+ msg->pgid = get_info().pgid;
msg->map_epoch = get_osdmap()->get_epoch();
msg->set_priority(prio);
for (;
@@ -6296,7 +6410,7 @@ void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
}
}
-void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
+void ReplicatedBackend::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
{
for (map<int, vector<PullOp> >::iterator i = pulls.begin();
i != pulls.end();
@@ -6323,7 +6437,7 @@ void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
<< " to osd." << i->first << dendl;
MOSDPGPull *msg = new MOSDPGPull();
msg->set_priority(prio);
- msg->pgid = info.pgid;
+ msg->pgid = get_info().pgid;
msg->map_epoch = get_osdmap()->get_epoch();
msg->pulls.swap(i->second);
msg->compute_cost(cct);
@@ -6332,22 +6446,11 @@ void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
}
}
-int ReplicatedPG::send_push(int prio, int peer,
- const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress)
-{
- PushOp op;
- int r = build_push_op(recovery_info, progress, out_progress, &op);
- if (r < 0)
- return r;
- return send_push_op_legacy(prio, peer, op);
-}
-
-int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress,
- PushOp *out_op)
+int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat)
{
ObjectRecoveryProgress _new_progress;
if (!out_progress)
@@ -6371,7 +6474,7 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
object_info_t oi(bv);
if (oi.version != recovery_info.version) {
- osd->clog.error() << info.pgid << " push "
+ osd->clog.error() << get_info().pgid << " push "
<< recovery_info.soid << " v "
<< " failed because local copy is "
<< oi.version << "\n";
@@ -6434,11 +6537,14 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
if (new_progress.is_complete(recovery_info)) {
new_progress.data_complete = true;
- info.stats.stats.sum.num_objects_recovered++;
+ if (stat)
+ stat->num_objects_recovered++;
}
- info.stats.stats.sum.num_keys_recovered += out_op->omap_entries.size();
- info.stats.stats.sum.num_bytes_recovered += out_op->data.length();
+ if (stat) {
+ stat->num_keys_recovered += out_op->omap_entries.size();
+ stat->num_bytes_recovered += out_op->data.length();
+ }
osd->logger->inc(l_osd_push);
osd->logger->inc(l_osd_push_outb, out_op->data.length());
@@ -6452,11 +6558,11 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
return 0;
}
-int ReplicatedPG::send_push_op_legacy(int prio, int peer, PushOp &pop)
+int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop)
{
tid_t tid = osd->get_tid();
osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, pop.soid,
+ MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, pop.soid,
false, 0, get_osdmap()->get_epoch(),
tid, pop.recovery_info.version);
subop->ops = vector<OSDOp>(1);
@@ -6477,14 +6583,14 @@ int ReplicatedPG::send_push_op_legacy(int prio, int peer, PushOp &pop)
return 0;
}
-void ReplicatedPG::prep_push_op_blank(const hobject_t& soid, PushOp *op)
+void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op)
{
op->recovery_info.version = eversion_t();
op->version = eversion_t();
op->soid = soid;
}
-void ReplicatedPG::sub_op_push_reply(OpRequestRef op)
+void ReplicatedBackend::sub_op_push_reply(OpRequestRef op)
{
MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req());
const hobject_t& soid = reply->get_poid();
@@ -6499,10 +6605,10 @@ void ReplicatedPG::sub_op_push_reply(OpRequestRef op)
PushOp pop;
bool more = handle_push_reply(peer, rop, &pop);
if (more)
- send_push_op_legacy(pushing[soid][peer].priority, peer, pop);
+ send_push_op_legacy(op->get_req()->get_priority(), peer, pop);
}
-bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
+bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
if (pushing.count(soid) == 0) {
@@ -6522,32 +6628,25 @@ bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
<< pi->recovery_progress.data_recovered_to
<< " of " << pi->recovery_info.copy_subset << dendl;
ObjectRecoveryProgress new_progress;
- build_push_op(
+ int r = build_push_op(
pi->recovery_info,
- pi->recovery_progress, &new_progress, reply);
+ pi->recovery_progress, &new_progress, reply,
+ &(pi->stat));
+ assert(r == 0);
pi->recovery_progress = new_progress;
return true;
} else {
// done!
- if (peer == backfill_target && backfills_in_flight.count(soid))
- backfills_in_flight.erase(soid);
- else
- peer_missing[peer].got(soid, pi->recovery_info.version);
+ get_parent()->on_peer_recover(
+ peer, soid, pi->recovery_info,
+ pi->stat);
pushing[soid].erase(peer);
pi = NULL;
- publish_stats_to_osd();
if (pushing[soid].empty()) {
- pushing.erase(soid);
- dout(10) << "pushed " << soid << " to all replicas" << dendl;
- finish_recovery_op(soid);
- if (waiting_for_degraded_object.count(soid)) {
- requeue_ops(waiting_for_degraded_object[soid]);
- waiting_for_degraded_object.erase(soid);
- }
- finish_degraded_object(soid);
+ get_parent()->on_global_recover(soid);
} else {
dout(10) << "pushed " << soid << ", still waiting for push ack from "
<< pushing[soid].size() << " others" << dendl;
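handle_push_reply() is now pure bookkeeping: a final ack erases that peer, notifies the parent per peer via on_peer_recover(), and only an empty per-object peer set triggers on_global_recover(). The accounting on its own, with toy types:

#include <iostream>
#include <map>
#include <set>
#include <string>

int main() {
    // object -> peers we are still waiting on
    std::map<std::string, std::set<int>> pushing = {{"obj", {4, 9}}};

    auto ack = [&](const std::string& soid, int peer) {
        pushing[soid].erase(peer);                 // on_peer_recover(peer, ...)
        if (pushing[soid].empty()) {
            pushing.erase(soid);
            std::cout << soid << ": all replicas pushed\n";  // on_global_recover
        } else {
            std::cout << soid << ": waiting for " << pushing[soid].size()
                      << " more\n";
        }
    };

    ack("obj", 4);
    ack("obj", 9);
}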
@@ -6585,7 +6684,7 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
* process request to pull an entire object.
* NOTE: called from opqueue.
*/
-void ReplicatedPG::sub_op_pull(OpRequestRef op)
+void ReplicatedBackend::sub_op_pull(OpRequestRef op)
{
MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
@@ -6612,16 +6711,17 @@ void ReplicatedPG::sub_op_pull(OpRequestRef op)
m->get_source().num(),
reply);
- log_subop_stats(op, 0, l_osd_sop_pull_lat);
+ log_subop_stats(osd, op, 0, l_osd_sop_pull_lat);
}
-void ReplicatedPG::handle_pull(int peer, PullOp &op, PushOp *reply)
+void ReplicatedBackend::handle_pull(int peer, PullOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
struct stat st;
int r = osd->store->stat(coll, soid, &st);
if (r != 0) {
- osd->clog.error() << info.pgid << " " << peer << " tried to pull " << soid
+ osd->clog.error() << get_info().pgid << " "
+ << peer << " tried to pull " << soid
<< " but got " << cpp_strerror(-r) << "\n";
prep_push_op_blank(soid, reply);
} else {
@@ -6738,7 +6838,7 @@ void ReplicatedPG::recover_got(hobject_t oid, eversion_t v)
* @param intervals_usable intervals we want to keep
* @param data_usable matching data we want to keep
*/
-void ReplicatedPG::trim_pushed_data(
+void ReplicatedBackend::trim_pushed_data(
const interval_set<uint64_t> &copy_subset,
const interval_set<uint64_t> &intervals_received,
bufferlist data_received,
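trim_pushed_data(), relocated to the backend here, keeps only the byte ranges of a push that fall inside copy_subset, so data made stale by a racing local write is dropped. The interval intersection it performs, sketched with a minimal offset-to-length map standing in for interval_set:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

using Intervals = std::map<uint64_t, uint64_t>;  // offset -> length

// Keep only the parts of 'received' that lie inside 'wanted'.
Intervals intersect(const Intervals& wanted, const Intervals& received) {
    Intervals usable;
    for (auto [roff, rlen] : received) {
        for (auto [woff, wlen] : wanted) {
            uint64_t lo = std::max(roff, woff);
            uint64_t hi = std::min(roff + rlen, woff + wlen);
            if (lo < hi)
                usable[lo] = hi - lo;
        }
    }
    return usable;
}

int main() {
    Intervals wanted   = {{0, 8}};            // copy_subset
    Intervals received = {{4, 8}};            // push covered [4,12)
    for (auto [off, len] : intersect(wanted, received))
        std::cout << "usable [" << off << "," << off + len << ")\n"; // [4,8)
}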
@@ -6776,7 +6876,7 @@ void ReplicatedPG::trim_pushed_data(
/** op_push
* NOTE: called from opqueue.
*/
-void ReplicatedPG::sub_op_push(OpRequestRef op)
+void ReplicatedBackend::sub_op_push(OpRequestRef op)
{
op->mark_started();
MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req());
@@ -6796,14 +6896,29 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
if (is_primary()) {
PullOp resp;
- bool more = handle_pull_response(m->get_source().num(), pop, &resp, t);
+ RPGHandle *h = _open_recovery_op();
+ list<ObjectContextRef> to_continue;
+ bool more = handle_pull_response(
+ m->get_source().num(), pop, &resp,
+ &to_continue, t);
if (more) {
send_pull_legacy(
m->get_priority(),
m->get_source().num(),
resp.recovery_info,
resp.recovery_progress);
- }
+ } else {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ op->get_req()->get_priority());
+ c->to_continue.swap(to_continue);
+ t->register_on_complete(
+ new C_QueueInWQ(
+ &osd->push_wq,
+ get_parent()->bless_gencontext(c)));
+ }
+ run_recovery_op(h, op->get_req()->get_priority());
} else {
PushReplyOp resp;
MOSDSubOpReply *reply = new MOSDSubOpReply(
@@ -6812,15 +6927,16 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
assert(entity_name_t::TYPE_OSD == m->get_connection()->peer_type);
handle_push(m->get_source().num(), pop, &resp, t);
t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ osd, reply, m->get_connection()));
}
- t->register_on_commit(new C_OnPushCommit(this, op));
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
return;
}
-void ReplicatedPG::_failed_push(int from, const hobject_t &soid)
+void ReplicatedPG::failed_push(int from, const hobject_t &soid)
{
+ assert(recovering.count(soid));
+ recovering.erase(soid);
map<hobject_t,set<int> >::iterator p = missing_loc.find(soid);
if (p != missing_loc.end()) {
dout(0) << "_failed_push " << soid << " from osd." << from
@@ -6833,9 +6949,15 @@ void ReplicatedPG::_failed_push(int from, const hobject_t &soid)
dout(0) << "_failed_push " << soid << " from osd." << from
<< " but not in missing_loc ???" << dendl;
}
-
finish_recovery_op(soid); // close out this attempt,
+}
+
+void ReplicatedBackend::_failed_push(int from, const hobject_t &soid)
+{
+ get_parent()->failed_push(from, soid);
pull_from_peer[from].erase(soid);
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
pulling.erase(soid);
}
@@ -6902,7 +7024,7 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
obc->ondisk_write_lock();
- obc->obs.oi.lost = true;
+ obc->obs.oi.set_flag(object_info_t::FLAG_LOST);
obc->obs.oi.version = info.last_update;
obc->obs.oi.prior_version = version;
@@ -7119,7 +7241,7 @@ void ReplicatedPG::on_shutdown()
deleting = true;
unreg_next_scrub();
- requeue_cancel_copy_ops(false);
+ cancel_copy_ops();
apply_and_flush_repops(false);
context_registry_on_change();
@@ -7131,20 +7253,6 @@ void ReplicatedPG::on_shutdown()
cancel_recovery();
}
-void ReplicatedPG::on_flushed()
-{
- assert(object_contexts.empty());
- if (have_temp_coll() &&
- !osd->store->collection_empty(get_temp_coll())) {
- vector<hobject_t> objects;
- osd->store->collection_list(get_temp_coll(), objects);
- derr << __func__ << ": found objects in the temp collection: "
- << objects << ", crashing now"
- << dendl;
- assert(0 == "found garbage in the temp collection");
- }
-}
-
void ReplicatedPG::on_activate()
{
for (unsigned i = 1; i<acting.size(); i++) {
@@ -7170,7 +7278,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
context_registry_on_change();
- requeue_cancel_copy_ops(is_primary());
+ cancel_copy_ops();
// requeue object waiters
if (is_primary()) {
@@ -7207,20 +7315,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
// any dups
apply_and_flush_repops(is_primary());
- // clear pushing/pulling maps
- pushing.clear();
- pulling.clear();
- pull_from_peer.clear();
-
- // clear temp
- for (set<hobject_t>::iterator i = temp_contents.begin();
- i != temp_contents.end();
- ++i) {
- dout(10) << __func__ << ": Removing oid "
- << *i << " from the temp collection" << dendl;
- t->remove(get_temp_coll(t), *i);
- }
- temp_contents.clear();
+ pgbackend->on_change(t);
// clear snap_trimmer state
snap_trimmer_machine.process_event(Reset());
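Temp-object cleanup moves behind the backend interface: the PG no longer owns temp_contents, it just calls pgbackend->on_change(t), and the backend removes whatever temp objects were registered through add_temp_obj() and never cleared. The lifecycle in miniature, with hypothetical names:

#include <iostream>
#include <set>
#include <string>

struct Backend {
    std::set<std::string> temp_contents;

    void add_temp_obj(const std::string& o)   { temp_contents.insert(o); }
    void clear_temp_obj(const std::string& o) { temp_contents.erase(o); }

    // on_change: the interval ended, so queue removal of every tracked temp object.
    void on_change() {
        for (const auto& o : temp_contents)
            std::cout << "t->remove(temp_coll, " << o << ")\n";
        temp_contents.clear();
    }
};

int main() {
    Backend b;
    b.add_temp_obj("temp_1.0_copy");   // e.g. a copy-from staging object
    b.clear_temp_obj("temp_1.0_copy"); // copy finished: moved into place
    b.add_temp_obj("temp_1.0_push");   // a push still in flight at map change
    b.on_change();                     // leftover temp object gets removed
}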
@@ -7246,9 +7341,16 @@ void ReplicatedPG::_clear_recovery_state()
backfill_pos = hobject_t();
backfills_in_flight.clear();
pending_backfill_updates.clear();
- pulling.clear();
- pushing.clear();
- pull_from_peer.clear();
+ recovering.clear();
+ pgbackend->clear_state();
+}
+
+void ReplicatedPG::cancel_pull(const hobject_t &soid)
+{
+ assert(recovering.count(soid));
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ pg_log.set_last_requested(0); // get recover_primary to start over
}
void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
@@ -7267,26 +7369,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
}
dout(10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
now_down.insert(*p);
-
- // reset pulls?
- map<int, set<hobject_t> >::iterator j = pull_from_peer.find(*p);
- if (j != pull_from_peer.end()) {
- dout(10) << "check_recovery_sources resetting pulls from osd." << *p
- << ", osdmap has it marked down" << dendl;
- for (set<hobject_t>::iterator i = j->second.begin();
- i != j->second.end();
- ++i) {
- assert(pulling.count(*i) == 1);
- pulling.erase(*i);
- finish_recovery_op(*i);
- }
- pg_log.set_last_requested(0);
- pull_from_peer.erase(j++);
- }
-
- // remove from missing_loc_sources
missing_loc_sources.erase(p++);
}
+ pgbackend->check_recovery_sources(osdmap);
+
if (now_down.empty()) {
dout(10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
} else {
@@ -7372,7 +7458,8 @@ int ReplicatedPG::start_recovery_ops(
}
bool deferred_backfill = false;
- if (state_test(PG_STATE_BACKFILL) &&
+ if (recovering.empty() &&
+ state_test(PG_STATE_BACKFILL) &&
backfill_target >= 0 && started < max &&
missing.num_missing() == 0 &&
!waiting_on_backfill) {
@@ -7400,9 +7487,11 @@ int ReplicatedPG::start_recovery_ops(
dout(10) << " started " << started << dendl;
osd->logger->inc(l_osd_rop, started);
- if (started || recovery_ops_active > 0 || deferred_backfill)
+ if (!recovering.empty() ||
+ started || recovery_ops_active > 0 || deferred_backfill)
return started;
+ assert(recovering.empty());
assert(recovery_ops_active == 0);
int unfound = get_num_unfound();
@@ -7468,7 +7557,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
const pg_missing_t &missing = pg_log.get_missing();
- dout(10) << "recover_primary pulling " << pulling.size() << " in pg" << dendl;
+ dout(10) << "recover_primary recovering " << recovering.size()
+ << " in pg" << dendl;
dout(10) << "recover_primary " << missing << dendl;
dout(25) << "recover_primary " << missing.missing << dendl;
@@ -7477,7 +7567,7 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
int started = 0;
int skipped = 0;
- map<int, vector<PullOp> > pulls;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
map<version_t, hobject_t>::const_iterator p =
missing.rmissing.lower_bound(pg_log.get_log().last_requested);
while (p != missing.rmissing.end()) {
@@ -7508,8 +7598,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
<< (unfound ? " (unfound)":"")
<< (missing.is_missing(soid) ? " (missing)":"")
<< (missing.is_missing(head) ? " (missing head)":"")
- << (pulling.count(soid) ? " (pulling)":"")
- << (pulling.count(head) ? " (pulling head)":"")
+ << (recovering.count(soid) ? " (recovering)":"")
+ << (recovering.count(head) ? " (recovering head)":"")
<< dendl;
if (latest) {
@@ -7584,14 +7674,14 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
}
}
- if (!pulling.count(soid)) {
- if (pulling.count(head)) {
+ if (!recovering.count(soid)) {
+ if (recovering.count(head)) {
++skipped;
} else if (unfound) {
++skipped;
} else {
- int r = prepare_pull(
- soid, need, cct->_conf->osd_recovery_op_priority, &pulls);
+ int r = recover_missing(
+ soid, need, cct->_conf->osd_recovery_op_priority, h);
switch (r) {
case PULL_YES:
++started;
@@ -7613,14 +7703,14 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
if (!skipped)
pg_log.set_last_requested(v);
}
-
- send_pulls(cct->_conf->osd_recovery_op_priority, pulls);
+
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
return started;
}
int ReplicatedPG::prep_object_replica_pushes(
- const hobject_t& soid, eversion_t v, int prio,
- map<int, vector<PushOp> > *pushes)
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h)
{
dout(10) << __func__ << ": on " << soid << dendl;
@@ -7647,30 +7737,46 @@ int ReplicatedPG::prep_object_replica_pushes(
return 0;
}
- dout(10) << " ondisk_read_lock for " << soid << dendl;
+ start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(soid);
+
+ /* We need this in case there is an in progress write on the object. In fact,
+ * the only possible write is an update to the xattr due to a lost_revert --
+ * a client write would be blocked since the object is degraded.
+ * In almost all cases, therefore, this lock should be uncontended.
+ */
obc->ondisk_read_lock();
-
+ pgbackend->recover_object(
+ soid,
+ ObjectContextRef(),
+ obc, // has snapset context
+ h);
+ obc->ondisk_read_unlock();
+ return 1;
+}
+
+int ReplicatedBackend::start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obc,
+ RPGHandle *h)
+{
+ int pushes = 0;
// who needs it?
- bool started = false;
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
- if (peer_missing.count(peer) &&
- peer_missing[peer].is_missing(soid)) {
- if (!started) {
- start_recovery_op(soid);
- started = true;
- }
- (*pushes)[peer].push_back(PushOp());
- prep_push_to_replica(obc, soid, peer, prio,
- &((*pushes)[peer].back())
+ for (unsigned i=1; i<get_parent()->get_acting().size(); i++) {
+ int peer = get_parent()->get_acting()[i];
+ map<int, pg_missing_t>::const_iterator j =
+ get_parent()->get_peer_missing().find(peer);
+ assert(j != get_parent()->get_peer_missing().end());
+ if (j->second.is_missing(soid)) {
+ ++pushes;
+ h->pushes[peer].push_back(PushOp());
+ prep_push_to_replica(obc, soid, peer,
+ &(h->pushes[peer].back())
);
}
}
-
- dout(10) << " ondisk_read_unlock on " << soid << dendl;
- obc->ondisk_read_unlock();
-
- return 1;
+ return pushes;
}
int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
@@ -7678,7 +7784,7 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
dout(10) << __func__ << "(" << max << ")" << dendl;
int started = 0;
- map<int, vector<PushOp> > pushes;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
// this is FAR from an optimal recovery order. pretty lame, really.
for (unsigned i=1; i<acting.size(); i++) {
@@ -7698,8 +7804,8 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
handle.reset_tp_timeout();
const hobject_t soid(p->second);
- if (pushing.count(soid)) {
- dout(10) << __func__ << ": already pushing " << soid << dendl;
+ if (recovering.count(soid)) {
+ dout(10) << __func__ << ": already recovering" << soid << dendl;
continue;
}
@@ -7714,13 +7820,11 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
map<hobject_t,pg_missing_t::item>::const_iterator r = m.missing.find(soid);
started += prep_object_replica_pushes(soid, r->second.need,
- cct->_conf->osd_recovery_op_priority,
- &pushes);
+ h);
}
}
- send_pushes(cct->_conf->osd_recovery_op_priority, pushes);
-
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
return started;
}
@@ -7883,15 +7987,16 @@ int ReplicatedPG::recover_backfill(
send_remove_op(i->first, i->second, backfill_target);
}
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
map<int, vector<PushOp> > pushes;
for (map<hobject_t, pair<eversion_t, eversion_t> >::iterator i = to_push.begin();
i != to_push.end();
++i) {
handle.reset_tp_timeout();
prep_backfill_object_push(
- i->first, i->second.first, i->second.second, backfill_target, &pushes);
+ i->first, i->second.first, i->second.second, backfill_target, h);
}
- send_pushes(cct->_conf->osd_recovery_op_priority, pushes);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
release_waiting_for_backfill_pos();
dout(5) << "backfill_pos is " << backfill_pos << " and pinfo.last_backfill is "
@@ -7937,20 +8042,25 @@ int ReplicatedPG::recover_backfill(
void ReplicatedPG::prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
- map<int, vector<PushOp> > *pushes)
+ PGBackend::RecoveryHandle *h)
{
dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
backfills_in_flight.insert(oid);
+ map<int, pg_missing_t>::iterator bpm = peer_missing.find(backfill_target);
+ assert(bpm != peer_missing.end());
+ bpm->second.add(oid, eversion_t(), eversion_t());
- if (!pushing.count(oid))
- start_recovery_op(oid);
+ assert(!recovering.count(oid));
+
+ start_recovery_op(oid);
+ recovering.insert(oid);
ObjectContextRef obc = get_object_context(oid, false);
- obc->ondisk_read_lock();
- (*pushes)[peer].push_back(PushOp());
- prep_push_to_replica(obc, oid, peer, cct->_conf->osd_recovery_op_priority,
- &((*pushes)[peer].back()));
- obc->ondisk_read_unlock();
+ pgbackend->recover_object(
+ oid,
+ ObjectContextRef(),
+ obc,
+ h);
}
void ReplicatedPG::scan_range(
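The hunks above replace the per-peer map<int, vector<PullOp/PushOp>> plumbing with an opaque PGBackend::RecoveryHandle: callers open a handle, queue per-object recovery into it, and submit the whole batch with a single run_recovery_op() call, while one `recovering` set replaces the separate pulling/pushing maps. A condensed sketch of the new calling pattern, using only calls that appear in the hunks (the `to_recover` work list is hypothetical):

// Sketch of the batching pattern used by recover_primary()/recover_replicas().
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
for (set<hobject_t>::iterator i = to_recover.begin();
     i != to_recover.end();
     ++i) {
  start_recovery_op(*i);           // PG-level op accounting
  assert(!recovering.count(*i));   // at most one in-flight recovery per object
  recovering.insert(*i);
  pgbackend->recover_object(*i, ObjectContextRef(),
                            get_object_context(*i, false), h);
}
// Nothing hits the wire until the whole batch is submitted:
pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);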
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 5b36c28a51b..a3d42e87600 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -33,6 +33,9 @@
#include "common/sharedptr_registry.hpp"
+#include "PGBackend.h"
+#include "ReplicatedBackend.h"
+
class MOSDSubOpReply;
class ReplicatedPG;
@@ -80,7 +83,7 @@ public:
virtual bool filter(bufferlist& xattr_data, bufferlist& outdata);
};
-class ReplicatedPG : public PG {
+class ReplicatedPG : public PG, public PGBackend::Listener {
friend class OSD;
friend class Watch;
@@ -99,8 +102,6 @@ public:
tid_t objecter_tid;
- list<OpRequestRef> waiting;
-
object_copy_cursor_t cursor;
uint64_t size;
utime_t mtime;
@@ -122,6 +123,119 @@ public:
};
typedef boost::shared_ptr<CopyOp> CopyOpRef;
+ boost::scoped_ptr<PGBackend> pgbackend;
+
+ /// Listener methods
+ void on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t);
+ void on_local_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ );
+ void on_peer_recover(
+ int peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat
+ );
+ void begin_peer_recover(
+ int peer,
+ const hobject_t oid);
+ void on_global_recover(
+ const hobject_t &oid);
+ void failed_push(int from, const hobject_t &soid);
+ void cancel_pull(const hobject_t &soid);
+
+ template <typename T>
+ class BlessedGenContext : public GenContext<T> {
+ ReplicatedPG *pg;
+ GenContext<T> *c;
+ epoch_t e;
+ public:
+ BlessedGenContext(ReplicatedPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) {
+ pg->lock();
+ if (pg->pg_has_reset_since(e))
+ delete c;
+ else
+ c->complete(t);
+ pg->unlock();
+ }
+ };
+ class BlessedContext : public Context {
+ ReplicatedPG *pg;
+ Context *c;
+ epoch_t e;
+ public:
+ BlessedContext(ReplicatedPG *pg, Context *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(int r) {
+ pg->lock();
+ if (pg->pg_has_reset_since(e))
+ delete c;
+ else
+ c->complete(r);
+ pg->unlock();
+ }
+ };
+ Context *bless_context(Context *c) {
+ return new BlessedContext(this, c, get_osdmap()->get_epoch());
+ }
+ GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new BlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap()->get_epoch());
+ }
+
+ void send_message(int to_osd, Message *m) {
+ osd->send_message_osd_cluster(to_osd, m, get_osdmap()->get_epoch());
+ }
+ void queue_transaction(ObjectStore::Transaction *t) {
+ osd->store->queue_transaction(osr.get(), t);
+ }
+ epoch_t get_epoch() {
+ return get_osdmap()->get_epoch();
+ }
+ const vector<int> &get_acting() {
+ return acting;
+ }
+ std::string gen_dbg_prefix() const { return gen_prefix(); }
+
+ const map<hobject_t, set<int> > &get_missing_loc() {
+ return missing_loc;
+ }
+ const map<int, pg_missing_t> &get_peer_missing() {
+ return peer_missing;
+ }
+ const map<int, pg_info_t> &get_peer_info() {
+ return peer_info;
+ }
+ const pg_missing_t &get_local_missing() {
+ return pg_log.get_missing();
+ }
+ const PGLog &get_log() {
+ return pg_log;
+ }
+ bool pgb_is_primary() const {
+ return is_primary();
+ }
+ OSDMapRef pgb_get_osdmap() const {
+ return get_osdmap();
+ }
+ const pg_info_t &get_info() const {
+ return info;
+ }
+ ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ map<string, bufferptr> &attrs) {
+ return get_object_context(hoid, true, &attrs);
+ }
+
/*
* Capture all object state associated with an in-progress read or write.
*/
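BlessedContext and BlessedGenContext exist so that completions handed to the backend cannot fire against a PG that has been reset since they were created: finish() retakes the PG lock and, if pg_has_reset_since(e) reports an interval change, deletes the wrapped callback instead of running it. A minimal sketch of the intended use (C_DoSomething is a hypothetical completion):

// Wrap a raw Context before handing it to code that may complete it
// across an interval change.
Context *raw = new C_DoSomething(this);   // hypothetical
Context *safe = bless_context(raw);       // records the current map epoch
// ... later, possibly after the PG has been reset ...
safe->complete(0);  // runs 'raw' only if the epoch is still current;
                    // otherwise the wrapped context is deleted unexecuted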
@@ -140,6 +254,7 @@ public:
bool modify; // (force) modification (even if op_t is empty)
bool user_modify; // user-visible modification
+ bool undirty; // user explicitly un-dirtying this object
// side effects
list<watch_info_t> watch_connects;
@@ -194,7 +309,7 @@ public:
ReplicatedPG *_pg) :
op(_op), reqid(_reqid), ops(_ops), obs(_obs), snapset(0),
new_obs(_obs->oi, _obs->exists),
- modify(false), user_modify(false),
+ modify(false), user_modify(false), undirty(false),
bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
data_off(0), reply(NULL), pg(_pg),
@@ -339,7 +454,11 @@ public:
protected:
ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
- ObjectContextRef get_object_context(const hobject_t& soid, bool can_create);
+ ObjectContextRef get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ map<string, bufferptr> *attrs = 0
+ );
void context_registry_on_change();
void object_context_destructor_callback(ObjectContext *obc);
@@ -362,8 +481,11 @@ protected:
void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
SnapSetContext *create_snapset_context(const object_t& oid);
- SnapSetContext *get_snapset_context(const object_t& oid, const string &key,
- ps_t seed, bool can_create, const string &nspace);
+ SnapSetContext *get_snapset_context(
+ const object_t& oid, const string &key,
+ ps_t seed, bool can_create, const string &nspace,
+ map<string, bufferptr> *attrs = 0
+ );
void register_snapset_context(SnapSetContext *ssc) {
Mutex::Locker l(snapset_contexts_lock);
_register_snapset_context(ssc);
@@ -378,90 +500,7 @@ protected:
}
void put_snapset_context(SnapSetContext *ssc);
- // push
- struct PushInfo {
- ObjectRecoveryProgress recovery_progress;
- ObjectRecoveryInfo recovery_info;
- int priority;
-
- void dump(Formatter *f) const {
- {
- f->open_object_section("recovery_progress");
- recovery_progress.dump(f);
- f->close_section();
- }
- {
- f->open_object_section("recovery_info");
- recovery_info.dump(f);
- f->close_section();
- }
- }
- };
- map<hobject_t, map<int, PushInfo> > pushing;
-
- // pull
- struct PullInfo {
- ObjectRecoveryProgress recovery_progress;
- ObjectRecoveryInfo recovery_info;
- int priority;
-
- void dump(Formatter *f) const {
- {
- f->open_object_section("recovery_progress");
- recovery_progress.dump(f);
- f->close_section();
- }
- {
- f->open_object_section("recovery_info");
- recovery_info.dump(f);
- f->close_section();
- }
- }
-
- bool is_complete() const {
- return recovery_progress.is_complete(recovery_info);
- }
- };
- map<hobject_t, PullInfo> pulling;
-
- ObjectRecoveryInfo recalc_subsets(const ObjectRecoveryInfo& recovery_info);
- static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
- const interval_set<uint64_t> &intervals_received,
- bufferlist data_received,
- interval_set<uint64_t> *intervals_usable,
- bufferlist *data_usable);
- bool handle_pull_response(
- int from, PushOp &op, PullOp *response,
- ObjectStore::Transaction *t);
- void handle_push(
- int from, PushOp &op, PushReplyOp *response,
- ObjectStore::Transaction *t);
- void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
- int send_push(int priority, int peer,
- const ObjectRecoveryInfo& recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress = 0);
- int build_push_op(const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress,
- PushOp *out_op);
- int send_push_op_legacy(int priority, int peer,
- PushOp &pop);
-
- int send_pull_legacy(int priority, int peer,
- const ObjectRecoveryInfo& recovery_info,
- ObjectRecoveryProgress progress);
- void submit_push_data(ObjectRecoveryInfo &recovery_info,
- bool first,
- bool complete,
- const interval_set<uint64_t> &intervals_included,
- bufferlist data_included,
- bufferlist omap_header,
- map<string, bufferptr> &attrs,
- map<string, bufferlist> &omap_entries,
- ObjectStore::Transaction *t);
- void submit_push_complete(ObjectRecoveryInfo &recovery_info,
- ObjectStore::Transaction *t);
+ set<hobject_t> recovering;
/*
* Backfill
@@ -504,54 +543,17 @@ protected:
f->close_section();
}
{
- f->open_array_section("pull_from_peer");
- for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
- i != pull_from_peer.end();
+ f->open_array_section("recovering");
+ for (set<hobject_t>::const_iterator i = recovering.begin();
+ i != recovering.end();
++i) {
- f->open_object_section("pulling_from");
- f->dump_int("pull_from", i->first);
- {
- f->open_array_section("pulls");
- for (set<hobject_t>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- f->open_object_section("pull_info");
- assert(pulling.count(*j));
- pulling.find(*j)->second.dump(f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
+ f->dump_stream("object") << *i;
}
f->close_section();
}
{
- f->open_array_section("pushing");
- for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
- pushing.begin();
- i != pushing.end();
- ++i) {
- f->open_object_section("object");
- f->dump_stream("pushing") << i->first;
- {
- f->open_array_section("pushing_to");
- for (map<int, PushInfo>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- f->open_object_section("push_progress");
- f->dump_stream("object_pushing") << j->first;
- {
- f->open_object_section("push_info");
- j->second.dump(f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
- }
+ f->open_object_section("pg_backend");
+ pgbackend->dump_recovery_info(f);
f->close_section();
}
}
@@ -559,53 +561,19 @@ protected:
/// leading edge of backfill
hobject_t backfill_pos;
- // Reverse mapping from osd peer to objects beging pulled from that peer
- map<int, set<hobject_t> > pull_from_peer;
-
int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
- int priority,
- map<int, vector<PushOp> > *pushes);
- void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
- pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets);
- void calc_clone_subsets(SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets);
- void prep_push_to_replica(
- ObjectContextRef obc,
- const hobject_t& oid,
- int dest,
- int priority,
- PushOp *push_op);
- void prep_push(int priority,
- ObjectContextRef obc,
- const hobject_t& oid, int dest,
- PushOp *op);
- void prep_push(int priority,
- ObjectContextRef obc,
- const hobject_t& soid, int peer,
- eversion_t version,
- interval_set<uint64_t> &data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets,
- PushOp *op);
- void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ PGBackend::RecoveryHandle *h);
void finish_degraded_object(const hobject_t& oid);
// Cancels/resets pulls from peer
void check_recovery_sources(const OSDMapRef map);
- void send_pulls(
- int priority,
- map<int, vector<PullOp> > &pulls);
- int prepare_pull(
- const hobject_t& oid, eversion_t v,
+ int recover_missing(
+ const hobject_t& oid,
+ eversion_t v,
int priority,
- map<int, vector<PullOp> > *pulls
- );
+ PGBackend::RecoveryHandle *h);
// low level ops
@@ -657,7 +625,7 @@ protected:
void prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
- map<int, vector<PushOp> > *pushes);
+ PGBackend::RecoveryHandle *h);
void send_remove_op(const hobject_t& oid, eversion_t v, int peer);
@@ -731,35 +699,6 @@ protected:
pg->_committed_pushed_object(epoch, last_complete);
}
};
- struct C_OSD_SendMessageOnConn: public Context {
- OSDService *osd;
- Message *reply;
- ConnectionRef conn;
- C_OSD_SendMessageOnConn(
- OSDService *osd,
- Message *reply,
- ConnectionRef conn) : osd(osd), reply(reply), conn(conn) {}
- void finish(int) {
- osd->send_message_osd_cluster(reply, conn.get());
- }
- };
- struct C_OSD_CompletedPull : public Context {
- ReplicatedPGRef pg;
- hobject_t hoid;
- epoch_t epoch;
- C_OSD_CompletedPull(
- ReplicatedPG *pg,
- const hobject_t &hoid,
- epoch_t epoch) : pg(pg), hoid(hoid), epoch(epoch) {}
- void finish(int) {
- pg->lock();
- if (!pg->pg_has_reset_since(epoch)) {
- pg->finish_recovery_op(hoid);
- }
- pg->unlock();
- }
- };
- friend struct C_OSD_CompletedPull;
struct C_OSD_AppliedRecoveredObjectReplica : public Context {
ReplicatedPGRef pg;
C_OSD_AppliedRecoveredObjectReplica(ReplicatedPG *p) :
@@ -780,26 +719,17 @@ protected:
void _applied_recovered_object_replica();
void _committed_pushed_object(epoch_t epoch, eversion_t lc);
void recover_got(hobject_t oid, eversion_t v);
- void sub_op_push(OpRequestRef op);
- void _failed_push(int from, const hobject_t &soid);
- void sub_op_push_reply(OpRequestRef op);
- bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
- void sub_op_pull(OpRequestRef op);
- void handle_pull(int peer, PullOp &op, PushOp *reply);
-
- void log_subop_stats(OpRequestRef op, int tag_inb, int tag_lat);
// -- copyfrom --
map<hobject_t, CopyOpRef> copy_ops;
- int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version,
- CopyOpRef *pcop);
+ int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version);
void process_copy_chunk(hobject_t oid, tid_t tid, int r);
void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t);
void _copy_some(OpContext *ctx, CopyOpRef cop);
int finish_copy(OpContext *ctx);
void cancel_copy(CopyOpRef cop);
- void requeue_cancel_copy_ops(bool requeue=true);
+ void cancel_copy_ops();
friend class C_Copyfrom;
@@ -828,6 +758,9 @@ public:
int do_command(cmdmap_t cmdmap, ostream& ss, bufferlist& idata,
bufferlist& odata);
+ void do_request(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle);
void do_op(OpRequestRef op);
bool pg_op_must_wait(MOSDOp *op);
void do_pg_op(OpRequestRef op);
@@ -837,17 +770,7 @@ public:
OpRequestRef op,
ThreadPool::TPHandle &handle);
void do_backfill(OpRequestRef op);
- void _do_push(OpRequestRef op);
- void _do_pull_response(OpRequestRef op);
- void do_push(OpRequestRef op) {
- if (is_primary()) {
- _do_pull_response(op);
- } else {
- _do_push(op);
- }
- }
- void do_pull(OpRequestRef op);
- void do_push_reply(OpRequestRef op);
+
RepGather *trim_object(const hobject_t &coid);
void snap_trimmer();
int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops);
@@ -857,16 +780,27 @@ public:
void do_osd_op_effects(OpContext *ctx);
private:
- bool temp_created;
- coll_t temp_coll;
- set<hobject_t> temp_contents; ///< contents of temp collection, clear on reset
uint64_t temp_seq; ///< last id for naming temp objects
coll_t get_temp_coll(ObjectStore::Transaction *t);
hobject_t generate_temp_object(); ///< generate a new temp object name
public:
- bool have_temp_coll();
- coll_t get_temp_coll() {
- return temp_coll;
+ void get_colls(list<coll_t> *out) {
+ out->push_back(coll);
+ return pgbackend->temp_colls(out);
+ }
+ void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) {
+ coll_t target = coll_t(child);
+ t->create_collection(target);
+ t->split_collection(
+ coll,
+ split_bits,
+ seed,
+ target);
+ pgbackend->split_colls(child, split_bits, seed, t);
}
private:
struct NotTrimming;
@@ -922,7 +856,6 @@ private:
int _get_tmap(OpContext *ctx, map<string, bufferlist> *out,
bufferlist *header);
- int _copy_up_tmap(OpContext *ctx);
int _delete_head(OpContext *ctx);
int _rollback_to(OpContext *ctx, ceph_osd_op& op);
public:
@@ -952,7 +885,10 @@ public:
void on_role_change();
void on_change(ObjectStore::Transaction *t);
void on_activate();
- void on_flushed();
+ void on_flushed() {
+ assert(object_contexts.empty());
+ pgbackend->on_flushed();
+ }
void on_removal(ObjectStore::Transaction *t);
void on_shutdown();
};
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index aa20dc592fa..27f7b171677 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2796,9 +2796,8 @@ void object_info_t::copy_user_bits(const object_info_t& other)
last_reqid = other.last_reqid;
truncate_seq = other.truncate_seq;
truncate_size = other.truncate_size;
- lost = other.lost;
+ flags = other.flags;
category = other.category;
- uses_tmap = other.uses_tmap;
}
ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
@@ -2824,7 +2823,7 @@ void object_info_t::encode(bufferlist& bl) const
++i) {
old_watchers.insert(make_pair(i->first.second, i->second));
}
- ENCODE_START(11, 8, bl);
+ ENCODE_START(12, 8, bl);
::encode(soid, bl);
::encode(myoloc, bl); //Retained for compatibility
::encode(category, bl);
@@ -2839,13 +2838,15 @@ void object_info_t::encode(bufferlist& bl) const
::encode(snaps, bl);
::encode(truncate_seq, bl);
::encode(truncate_size, bl);
- ::encode(lost, bl);
+ __u8 flags_lo = flags & 0xff;
+ __u8 flags_hi = (flags & 0xff00) >> 8;
+ ::encode(flags_lo, bl);
::encode(old_watchers, bl);
/* shenanigans to avoid breaking backwards compatibility in the disk format.
* When we can, switch this out for simply putting the version_t on disk. */
eversion_t user_eversion(0, user_version);
::encode(user_eversion, bl);
- ::encode(uses_tmap, bl);
+ ::encode(flags_hi, bl);
::encode(watchers, bl);
ENCODE_FINISH(bl);
}
@@ -2853,7 +2854,7 @@ void object_info_t::encode(bufferlist& bl) const
void object_info_t::decode(bufferlist::iterator& bl)
{
object_locator_t myoloc;
- DECODE_START_LEGACY_COMPAT_LEN(11, 8, 8, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(12, 8, 8, bl);
map<entity_name_t, watch_info_t> old_watchers;
if (struct_v >= 2 && struct_v <= 5) {
sobject_t obj;
@@ -2883,20 +2884,26 @@ void object_info_t::decode(bufferlist::iterator& bl)
::decode(snaps, bl);
::decode(truncate_seq, bl);
::decode(truncate_size, bl);
- if (struct_v >= 3)
- ::decode(lost, bl);
- else
- lost = false;
+ if (struct_v >= 3) {
+ __u8 lo;
+ ::decode(lo, bl);
+ flags = (flag_t)lo;
+ } else {
+ flags = (flag_t)0;
+ }
if (struct_v >= 4) {
::decode(old_watchers, bl);
eversion_t user_eversion;
::decode(user_eversion, bl);
user_version = user_eversion.version;
}
- if (struct_v >= 9)
- ::decode(uses_tmap, bl);
- else
- uses_tmap = true;
+ if (struct_v >= 9) {
+ __u8 hi;
+ ::decode(hi, bl);
+ flags = (flag_t)(flags | ((unsigned)hi << 8));
+ } else {
+ set_flag(FLAG_USES_TMAP);
+ }
if (struct_v < 10)
soid.pool = myoloc.pool;
if (struct_v >= 11) {
@@ -2924,7 +2931,8 @@ void object_info_t::dump(Formatter *f) const
f->dump_stream("last_reqid") << last_reqid;
f->dump_unsigned("size", size);
f->dump_stream("mtime") << mtime;
- f->dump_unsigned("lost", lost);
+ f->dump_unsigned("lost", (int)is_lost());
+ f->dump_unsigned("flags", (int)flags);
f->dump_stream("wrlock_by") << wrlock_by;
f->open_array_section("snaps");
for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p)
@@ -2960,8 +2968,8 @@ ostream& operator<<(ostream& out, const object_info_t& oi)
out << " wrlock_by=" << oi.wrlock_by;
else
out << " " << oi.snaps;
- if (oi.lost)
- out << " LOST";
+ if (oi.flags)
+ out << " " << oi.get_flag_string();
out << ")";
return out;
}
@@ -3515,6 +3523,8 @@ ostream& operator<<(ostream& out, const OSDOp& op)
case CEPH_OSD_OP_DELETE:
case CEPH_OSD_OP_LIST_WATCHERS:
case CEPH_OSD_OP_LIST_SNAPS:
+ case CEPH_OSD_OP_UNDIRTY:
+ case CEPH_OSD_OP_ISDIRTY:
break;
case CEPH_OSD_OP_ASSERT_VER:
out << " v" << op.op.assert_ver.ver;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index c2876fdc1cd..a54fc65f375 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -42,10 +42,12 @@
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
+#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
typedef hobject_t collection_list_handle_t;
+typedef uint8_t shard_id_t;
/**
* osd request identifier
@@ -2031,22 +2033,68 @@ struct object_info_t {
uint64_t size;
utime_t mtime;
- bool lost;
+
+ // note: these are currently encoded into a total of 16 bits; see
+ // encode()/decode() for the weirdness.
+ typedef enum {
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+ // ...
+ FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
+ } flag_t;
+
+ flag_t flags;
+
+ static string get_flag_string(flag_t flags) {
+ string s;
+ if (flags & FLAG_LOST)
+ s += "|lost";
+ if (flags & FLAG_WHITEOUT)
+ s += "|whiteout";
+ if (flags & FLAG_DIRTY)
+ s += "|dirty";
+ if (flags & FLAG_USES_TMAP)
+ s += "|uses_tmap";
+ if (s.length())
+ return s.substr(1);
+ return s;
+ }
+ string get_flag_string() const {
+ return get_flag_string(flags);
+ }
osd_reqid_t wrlock_by; // [head]
vector<snapid_t> snaps; // [clone]
uint64_t truncate_seq, truncate_size;
-
map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
- bool uses_tmap;
void copy_user_bits(const object_info_t& other);
static ps_t legacy_object_locator_to_ps(const object_t &oid,
const object_locator_t &loc);
+ bool test_flag(flag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(flag_t f) {
+ flags = (flag_t)(flags | f);
+ }
+ void clear_flag(flag_t f) {
+ flags = (flag_t)(flags & ~f);
+ }
+ bool is_lost() const {
+ return test_flag(FLAG_LOST);
+ }
+ bool is_whiteout() const {
+ return test_flag(FLAG_WHITEOUT);
+ }
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
void decode(bufferlist& bl) {
@@ -2057,13 +2105,14 @@ struct object_info_t {
static void generate_test_instances(list<object_info_t*>& o);
explicit object_info_t()
- : user_version(0), size(0), lost(false),
- truncate_seq(0), truncate_size(0), uses_tmap(false)
+ : user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0)
{}
object_info_t(const hobject_t& s)
- : soid(s), user_version(0), size(0),
- lost(false), truncate_seq(0), truncate_size(0), uses_tmap(false) {}
+ : soid(s),
+ user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0) {}
object_info_t(bufferlist& bl) {
decode(bl);
@@ -2073,7 +2122,7 @@ WRITE_CLASS_ENCODER(object_info_t)
struct ObjectState {
object_info_t oi;
- bool exists;
+ bool exists; ///< the stored object exists (i.e., we will remember the object_info_t)
ObjectState() : exists(false) {}
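Because C++ enums do not define bitwise operators, set_flag() and clear_flag() cast through unsigned and back to flag_t. A short sketch of the accessor pattern the new field supports (the asserts are illustrative):

object_info_t oi;                      // flags start at 0
oi.set_flag(object_info_t::FLAG_LOST);
oi.set_flag(object_info_t::FLAG_DIRTY);
assert(oi.is_lost() && oi.is_dirty() && !oi.is_whiteout());
assert(oi.get_flag_string() == "lost|dirty");  // '|'-joined, leading '|' stripped
oi.clear_flag(object_info_t::FLAG_DIRTY);
assert(!oi.is_dirty());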
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 01eeccc03be..81335b7957f 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -11,6 +11,8 @@
#include "include/assert.h"
+#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on while holding the lock
+
/*** ObjectCacher::BufferHead ***/
@@ -899,11 +901,10 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
ob->last_commit_tid = tid;
// waiters?
+ list<Context*> ls;
if (ob->waitfor_commit.count(tid)) {
- list<Context*> ls;
ls.splice(ls.begin(), ob->waitfor_commit[tid]);
ob->waitfor_commit.erase(tid);
- finish_contexts(cct, ls, r);
}
// is the entire object set now clean and fully committed?
@@ -915,6 +916,9 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
oset->dirty_or_tx == 0) { // nothing dirty/tx
flush_set_callback(flush_set_callback_arg, oset);
}
+
+ if (!ls.empty())
+ finish_contexts(cct, ls, r);
}
}
@@ -1446,8 +1450,10 @@ void ObjectCacher::flusher_entry()
utime_t cutoff = ceph_clock_now(cct);
cutoff -= max_dirty_age;
BufferHead *bh = 0;
+ int max = MAX_FLUSH_UNDER_LOCK;
while ((bh = static_cast<BufferHead*>(bh_lru_dirty.lru_get_next_expire())) != 0 &&
- bh->last_write < cutoff) {
+ bh->last_write < cutoff &&
+ --max > 0) {
ldout(cct, 10) << "flusher flushing aged dirty bh " << *bh << dendl;
bh_write(bh);
}
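Two independent fixes above: bh_write_commit() now splices the per-tid waiters into a local list and runs them only after the flush_set_callback bookkeeping has completed, and the flusher caps how many aged buffer heads it writes back per pass while holding the lock. The deferred-callback shape, condensed from the hunk:

// Detach waiters first, finish mutating shared state, then run the
// callbacks last so they observe a consistent object set.
list<Context*> ls;
if (ob->waitfor_commit.count(tid)) {
  ls.splice(ls.begin(), ob->waitfor_commit[tid]);
  ob->waitfor_commit.erase(tid);
}
// ... update per-object and per-set state; maybe fire flush_set_callback ...
if (!ls.empty())
  finish_contexts(cct, ls, r);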
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 6c0486ce801..d2c574d982e 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1338,7 +1338,7 @@ int Objecter::recalc_op_target(Op *op)
need_check_tiering = true;
}
- if (need_check_tiering) {
+ if (honor_cache_redirects && need_check_tiering) {
const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool);
if (pi) {
if (is_read && pi->has_read_tier())
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 880023ab37b..1196633276d 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -617,9 +617,10 @@ struct ObjectOperation {
}
::decode(*cursor, p);
} catch (buffer::error& e) {
- if (prval)
- *prval = -EIO;
+ r = -EIO;
}
+ if (prval)
+ *prval = r;
}
};
@@ -643,6 +644,43 @@ struct ObjectOperation {
out_handler[p] = h;
}
+ void undirty() {
+ add_op(CEPH_OSD_OP_UNDIRTY);
+ }
+
+ struct C_ObjectOperation_isdirty : public Context {
+ bufferlist bl;
+ bool *pisdirty;
+ int *prval;
+ C_ObjectOperation_isdirty(bool *p, int *r)
+ : pisdirty(p), prval(r) {}
+ void finish(int r) {
+ if (r < 0)
+ return;
+ try {
+ bufferlist::iterator p = bl.begin();
+ bool isdirty;
+ ::decode(isdirty, p);
+ if (pisdirty)
+ *pisdirty = isdirty;
+ } catch (buffer::error& e) {
+ r = -EIO;
+ }
+ if (prval)
+ *prval = r;
+ }
+ };
+
+ void is_dirty(bool *pisdirty, int *prval) {
+ add_op(CEPH_OSD_OP_ISDIRTY);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ C_ObjectOperation_isdirty *h =
+ new C_ObjectOperation_isdirty(pisdirty, prval);
+ out_bl[p] = &h->bl;
+ out_handler[p] = h;
+ }
+
void omap_get_header(bufferlist *bl, int *prval) {
add_op(CEPH_OSD_OP_OMAPGETHEADER);
unsigned p = ops.size() - 1;
@@ -785,6 +823,7 @@ class Objecter {
int global_op_flags; // flags which are applied to each IO op
bool keep_balanced_budget;
bool honor_osdmap_full;
+ bool honor_cache_redirects;
void maybe_request_map();
@@ -1260,6 +1299,7 @@ public:
num_unacked(0), num_uncommitted(0),
global_op_flags(0),
keep_balanced_budget(false), honor_osdmap_full(true),
+ honor_cache_redirects(true),
last_seen_osdmap_version(0),
last_seen_pgmap_version(0),
client_lock(l), timer(t),
@@ -1293,6 +1333,9 @@ public:
void set_honor_osdmap_full() { honor_osdmap_full = true; }
void unset_honor_osdmap_full() { honor_osdmap_full = false; }
+ void set_honor_cache_redirects() { honor_cache_redirects = true; }
+ void unset_honor_cache_redirects() { honor_cache_redirects = false; }
+
void scan_requests(bool skipped_map,
map<tid_t, Op*>& need_resend,
list<LingerOp*>& need_resend_linger,
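is_dirty() follows the existing out-param pattern in ObjectOperation: out_rval[p] lets the Objecter write the op's return code directly, while out_bl/out_handler route the reply buffer into a per-op Context that decodes it into caller-owned storage. A usage sketch (submission to the Objecter is elided, since the entry point depends on the caller):

ObjectOperation op;
bool dirty = false;
int rval = 0;
op.undirty();                // CEPH_OSD_OP_UNDIRTY: clear the dirty flag
op.is_dirty(&dirty, &rval);  // CEPH_OSD_OP_ISDIRTY: decode reply into 'dirty'
// ... hand 'op' to the Objecter; when the reply arrives,
// C_ObjectOperation_isdirty::finish() fills in 'dirty' and 'rval'.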
diff --git a/src/perfglue/heap_profiler.cc b/src/perfglue/heap_profiler.cc
index 550f7f924c6..6b079b865fa 100644
--- a/src/perfglue/heap_profiler.cc
+++ b/src/perfglue/heap_profiler.cc
@@ -88,7 +88,7 @@ void ceph_heap_profiler_dump(const char *reason)
void ceph_heap_profiler_handle_command(const std::vector<std::string>& cmd,
ostream& out)
{
- if (cmd.size() == 2 && cmd[1] == "dump") {
+ if (cmd.size() == 1 && cmd[0] == "dump") {
if (!ceph_heap_profiler_running()) {
out << "heap profiler not running; can't dump";
return;
@@ -98,16 +98,16 @@ void ceph_heap_profiler_handle_command(const std::vector<std::string>& cmd,
out << g_conf->name << "dumping heap profile now.\n"
<< heap_stats;
ceph_heap_profiler_dump("admin request");
- } else if (cmd.size() == 2 && cmd[1] == "start_profiler") {
+ } else if (cmd.size() == 1 && cmd[0] == "start_profiler") {
ceph_heap_profiler_start();
out << g_conf->name << " started profiler";
- } else if (cmd.size() == 2 && cmd[1] == "stop_profiler") {
+ } else if (cmd.size() == 1 && cmd[0] == "stop_profiler") {
ceph_heap_profiler_stop();
out << g_conf->name << " stopped profiler";
- } else if (cmd.size() == 2 && cmd[1] == "release") {
+ } else if (cmd.size() == 1 && cmd[0] == "release") {
ceph_heap_release_free_memory();
out << g_conf->name << " releasing free RAM back to system.";
- } else if (cmd.size() == 2 && cmd[1] == "stats") {
+ } else if (cmd.size() == 1 && cmd[0] == "stats") {
char *heap_stats = new char[1024];
ceph_heap_profiler_stats(heap_stats, 1024);
out << g_conf->name << "tcmalloc heap stats:"
diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py
index 427a4621216..1f6e90b6c1d 100644
--- a/src/pybind/ceph_argparse.py
+++ b/src/pybind/ceph_argparse.py
@@ -275,12 +275,26 @@ class CephIPAddr(CephArgtype):
class CephEntityAddr(CephIPAddr):
"""
- EntityAddress, that is, IP address/nonce
+ EntityAddress, that is, IP address[/nonce]
"""
def valid(self, s, partial=False):
- ip, nonce = s.split('/')
+ nonce = None
+ if '/' in s:
+ ip, nonce = s.split('/')
+ else:
+ ip = s
super(self.__class__, self).valid(ip)
- self.nonce = nonce
+ if nonce:
+ nonce_long = None
+ try:
+ nonce_long = long(nonce)
+ except ValueError:
+ pass
+ if nonce_long is None or nonce_long < 0:
+ raise ArgumentValid(
+ '{0}: invalid entity, nonce {1} not integer > 0'.\
+ format(s, nonce)
+ )
self.val = s
def __str__(self):
@@ -829,6 +843,11 @@ def validate(args, signature, partial=False):
# wanted n, got too few
if partial:
return d
+ # special-case the "0 expected 1" case
+ if desc.numseen == 0 and desc.n == 1:
+ raise ArgumentNumber(
+ 'missing required parameter {0}'.format(desc)
+ )
raise ArgumentNumber(
'saw {0} of {1}, expected {2}'.\
format(desc.numseen, desc, desc.n)
@@ -937,6 +956,7 @@ def validate_command(sigdict, args, verbose=False):
# Stop now, because we have the right command but
# some other input is invalid
print >> sys.stderr, "Invalid command: ", str(e)
+ print >> sys.stderr, concise_sig(sig), ': ', cmd['help']
return {}
if found:
break
diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py
index c53c3d77737..75e61060544 100755
--- a/src/pybind/ceph_rest_api.py
+++ b/src/pybind/ceph_rest_api.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
# vim: ts=4 sw=4 smarttab expandtab
import errno
diff --git a/src/rbd.cc b/src/rbd.cc
index eea9733c4b9..147eb2c5138 100644
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -68,6 +68,7 @@ static string dir_info_oid = RBD_INFO;
bool udevadm_settle = true;
bool progress = true;
bool resize_allow_shrink = false;
+bool read_only = false;
#define dout_subsys ceph_subsys_rbd
@@ -151,6 +152,7 @@ void usage()
" --pretty-format make json or xml output more readable\n"
" --no-settle do not wait for udevadm to settle on map/unmap\n"
" --no-progress do not show progress for long-running commands\n"
+" --read-only set device readonly when mapping image\n"
" --allow-shrink allow shrinking of an image when resizing\n";
}
@@ -1640,8 +1642,13 @@ static int do_kernel_add(const char *poolname, const char *imgname,
oss << ",";
}
+ if (read_only)
+ oss << " ro";
+ else
+ oss << " rw";
+
const char *user = g_conf->name.get_id().c_str();
- oss << " name=" << user;
+ oss << ",name=" << user;
char key_name[strlen(user) + strlen("client.") + 1];
snprintf(key_name, sizeof(key_name), "client.%s", user);
@@ -2200,6 +2207,8 @@ int main(int argc, const char **argv)
lock_tag = strdup(val.c_str());
} else if (ceph_argparse_flag(args, i, "--no-settle", (char *)NULL)) {
udevadm_settle = false;
+ } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+ read_only = true;
} else if (ceph_argparse_flag(args, i, "--no-progress", (char *)NULL)) {
progress = false;
} else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
@@ -2247,7 +2256,7 @@ int main(int argc, const char **argv)
opt_cmd = get_cmd(*i, false, false);
}
if (opt_cmd == OPT_NO_CMD) {
- cerr << "rbd: error parsing command '" << *i << "'" << std::endl;
+ cerr << "rbd: error parsing command '" << *i << "'; -h or --help for usage" << std::endl;
return EXIT_FAILURE;
}
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
index b812d908569..24060b52e25 100644
--- a/src/rgw/Makefile.am
+++ b/src/rgw/Makefile.am
@@ -93,11 +93,6 @@ DENCODER_SOURCES += \
rgw/rgw_common.cc \
rgw/rgw_env.cc \
rgw/rgw_json_enc.cc
-DENCODER_DEPS += \
- libcls_lock_client.la \
- libcls_rgw_client.la \
- libcls_replica_log_client.a \
- libcls_refcount_client.la
endif # WITH_RADOSGW
diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc
index 6da1ff5ab24..ca5ad3f2e7a 100644
--- a/src/rgw/rgw_metadata.cc
+++ b/src/rgw/rgw_metadata.cc
@@ -388,6 +388,8 @@ int RGWMetadataManager::remove(string& metadata_key)
objv_tracker.read_version = obj->get_version();
+ delete obj;
+
return handler->remove(store, entry, objv_tracker);
}
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index bada7d22d1b..8b4d18f4e68 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -897,7 +897,9 @@ int RGWRados::init_complete()
ret = region_map.read(cct, this);
if (ret < 0) {
- ldout(cct, 0) << "WARNING: cannot read region map" << dendl;
+ if (ret != -ENOENT) {
+ ldout(cct, 0) << "WARNING: cannot read region map" << dendl;
+ }
ret = region_map.update(region);
if (ret < 0) {
ldout(cct, 0) << "ERROR: failed to update regionmap with local region info" << dendl;
@@ -2182,8 +2184,8 @@ int RGWRados::create_pools(vector<string>& names, vector<int>& retcodes)
if (r < 0) {
ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
}
- c->release();
}
+ c->release();
retcodes.push_back(r);
}
return 0;
@@ -2493,6 +2495,22 @@ static void set_copy_attrs(map<string, bufferlist>& src_attrs, map<string, buffe
}
}
+class GetObjHandleDestructor {
+ RGWRados *store;
+ void **handle;
+
+public:
+ GetObjHandleDestructor(RGWRados *_store) : store(_store), handle(NULL) {}
+ ~GetObjHandleDestructor() {
+ if (handle) {
+ store->finish_get_obj(handle);
+ }
+ }
+ void set_handle(void **_h) {
+ handle = _h;
+ }
+};
+
/**
* Copy an object.
* dest_obj: the object to copy into
@@ -2547,6 +2565,7 @@ int RGWRados::copy_obj(void *ctx,
ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.object << " => " << dest_obj.bucket << ":" << dest_obj.object << dendl;
void *handle = NULL;
+ GetObjHandleDestructor handle_destructor(this);
map<string, bufferlist> src_attrs;
off_t ofs = 0;
@@ -2556,6 +2575,8 @@ int RGWRados::copy_obj(void *ctx,
mod_ptr, unmod_ptr, &lastmod, if_match, if_nomatch, &total_len, &obj_size, NULL, &handle, err);
if (ret < 0)
return ret;
+
+ handle_destructor.set_handle(&handle);
} else {
/* source is in a different region, copy it there */
@@ -2616,7 +2637,6 @@ int RGWRados::copy_obj(void *ctx,
{ /* opening scope so that we can do goto, sorry */
bufferlist& extra_data_bl = processor.get_extra_data();
if (extra_data_bl.length()) {
- extra_data_bl.push_back((char)0);
JSONParser jp;
if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
@@ -2699,7 +2719,7 @@ set_err_state:
return 0;
} else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
- return copy_obj_data(ctx, handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
+ return copy_obj_data(ctx, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
}
map<uint64_t, RGWObjManifestPart>::iterator miter = astate->manifest.objs.begin();
@@ -2804,7 +2824,7 @@ done_ret:
int RGWRados::copy_obj_data(void *ctx,
- void *handle, off_t end,
+ void **handle, off_t end,
rgw_obj& dest_obj,
rgw_obj& src_obj,
time_t *mtime,
@@ -2830,7 +2850,7 @@ int RGWRados::copy_obj_data(void *ctx,
do {
bufferlist bl;
- ret = get_obj(ctx, NULL, &handle, src_obj, bl, ofs, end);
+ ret = get_obj(ctx, NULL, handle, src_obj, bl, ofs, end);
if (ret < 0)
return ret;
@@ -2877,12 +2897,9 @@ int RGWRados::copy_obj_data(void *ctx,
if (mtime)
obj_stat(ctx, dest_obj, NULL, mtime, NULL, NULL, NULL, NULL);
- finish_get_obj(&handle);
-
return ret;
done_err:
delete_obj(ctx, shadow_obj);
- finish_get_obj(&handle);
return r;
}
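GetObjHandleDestructor turns the finish_get_obj() cleanup in copy_obj()/copy_obj_data() into scope-based release: the guard starts disarmed, is armed via set_handle() only once there is something to release, and from then on covers every return path, which is why the explicit finish_get_obj() calls on the exit paths are deleted. A self-contained stand-in for the pattern (release_handle() is hypothetical):

#include <cstdio>

static void release_handle(void **h) { std::printf("released %p\n", *h); }

struct HandleGuard {
  void **handle;
  HandleGuard() : handle(0) {}
  ~HandleGuard() { if (handle) release_handle(handle); }  // runs on every exit
  void set_handle(void **h) { handle = h; }
};

int main() {
  void *handle = 0;
  HandleGuard guard;          // disarmed: early returns release nothing
  handle = &handle;           // pretend the prepare step succeeded
  guard.set_handle(&handle);  // armed: all later paths release the handle
  return 0;
}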
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index a55f1c1f94c..65765c414aa 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -1130,7 +1130,7 @@ public:
void *progress_data);
int copy_obj_data(void *ctx,
- void *handle, off_t end,
+ void **handle, off_t end,
rgw_obj& dest_obj,
rgw_obj& src_obj,
time_t *mtime,
diff --git a/src/script/perf-watch.py b/src/script/perf-watch.py
index 8c18c3ec766..826d4a499d7 100755
--- a/src/script/perf-watch.py
+++ b/src/script/perf-watch.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import json
import argparse
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index f4f4defd6f7..59b4d89e930 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -65,6 +65,10 @@ endif
bin_PROGRAMS += ceph-dencoder
+get_command_descriptions_SOURCES = test/common/get_command_descriptions.cc
+get_command_descriptions_LDADD = $(LIBMON) $(LIBCOMMON) $(CEPH_GLOBAL)
+noinst_PROGRAMS += get_command_descriptions
+
## Build tests
# These should all use explicit _CXXFLAGS so avoid basename conflicts
@@ -228,6 +232,10 @@ bin_DEBUGPROGRAMS += ceph_bench_log
## Unit tests
+check_SCRIPTS += \
+ unittest_bufferlist.sh \
+ test/encoding/check-generated.sh
+
# target to build but not run the unit tests
unittests:: $(check_PROGRAMS)
@@ -250,11 +258,21 @@ unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
check_PROGRAMS += unittest_addrs
+unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc
+unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_bloom_filter
+
unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
check_PROGRAMS += unittest_sharedptr_registry
+unittest_sloppy_crc_map_SOURCES = test/common/test_sloppy_crc_map.cc
+unittest_sloppy_crc_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_sloppy_crc_map_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_sloppy_crc_map
+
unittest_util_SOURCES = test/common/test_util.cc
unittest_util_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_util_LDADD = $(LIBCOMMON) -lm $(UNITTEST_LDADD) $(CRYPTO_LIBS) $(EXTRALIBS)
@@ -300,6 +318,11 @@ unittest_ceph_argparse_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
unittest_ceph_argparse_CXXFLAGS = $(UNITTEST_CXXFLAGS)
check_PROGRAMS += unittest_ceph_argparse
+unittest_ceph_compatset_SOURCES = test/ceph_compatset.cc
+unittest_ceph_compatset_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_ceph_compatset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_ceph_compatset
+
libec_example_la_SOURCES = test/osd/ErasureCodePluginExample.cc
libec_example_la_CFLAGS = ${AM_CFLAGS}
libec_example_la_CXXFLAGS= ${AM_CXXFLAGS}
@@ -307,7 +330,35 @@ libec_example_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
libec_example_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
erasure_codelib_LTLIBRARIES += libec_example.la
-unittest_erasure_code_plugin_SOURCES = test/osd/TestErasureCodePluginExample.cc
+libec_missing_entry_point_la_SOURCES = test/osd/ErasureCodePluginMissingEntryPoint.cc
+libec_missing_entry_point_la_CFLAGS = ${AM_CFLAGS}
+libec_missing_entry_point_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_missing_entry_point_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_missing_entry_point_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_missing_entry_point.la
+
+libec_hangs_la_SOURCES = test/osd/ErasureCodePluginHangs.cc
+libec_hangs_la_CFLAGS = ${AM_CFLAGS}
+libec_hangs_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_hangs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_hangs_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_hangs.la
+
+libec_fail_to_initialize_la_SOURCES = test/osd/ErasureCodePluginFailToInitialize.cc
+libec_fail_to_initialize_la_CFLAGS = ${AM_CFLAGS}
+libec_fail_to_initialize_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_fail_to_initialize_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_fail_to_initialize_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_fail_to_initialize.la
+
+libec_fail_to_register_la_SOURCES = test/osd/ErasureCodePluginFailToRegister.cc
+libec_fail_to_register_la_CFLAGS = ${AM_CFLAGS}
+libec_fail_to_register_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_fail_to_register_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_fail_to_register_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_fail_to_register.la
+
+unittest_erasure_code_plugin_SOURCES = test/osd/TestErasureCodePlugin.cc
unittest_erasure_code_plugin_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_erasure_code_plugin_LDADD = $(LIBOSD) $(LIBCOMMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
if LINUX
@@ -389,6 +440,11 @@ unittest_crc32c_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
unittest_crc32c_CXXFLAGS = $(UNITTEST_CXXFLAGS)
check_PROGRAMS += unittest_crc32c
+unittest_arch_SOURCES = test/test_arch.c
+unittest_arch_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_arch_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_arch
+
unittest_crypto_SOURCES = test/crypto.cc
unittest_crypto_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
unittest_crypto_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -511,6 +567,8 @@ unittest_texttable_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
unittest_texttable_CXXFLAGS = $(UNITTEST_CXXFLAGS)
check_PROGRAMS += unittest_texttable
+check_SCRIPTS += test/pybind/test_ceph_argparse.py
+
if WITH_RADOSGW
ceph_test_cors_SOURCES = test/test_cors.cc
ceph_test_cors_LDADD = \
diff --git a/src/test/ObjectMap/test_object_map.cc b/src/test/ObjectMap/test_object_map.cc
index 1b39c8068fb..23f220daf45 100644
--- a/src/test/ObjectMap/test_object_map.cc
+++ b/src/test/ObjectMap/test_object_map.cc
@@ -55,16 +55,16 @@ public:
}
void set_key(const string &objname, const string &key, const string &value) {
- set_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
void set_xattr(const string &objname, const string &key, const string &value) {
- set_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- void set_key(hobject_t hoid,
+ void set_key(ghobject_t hoid,
string key, string value) {
map<string, bufferlist> to_write;
bufferptr bp(value.c_str(), value.size());
@@ -74,7 +74,7 @@ public:
db->set_keys(hoid, to_write);
}
- void set_xattr(hobject_t hoid,
+ void set_xattr(ghobject_t hoid,
string key, string value) {
map<string, bufferlist> to_write;
bufferptr bp(value.c_str(), value.size());
@@ -85,11 +85,11 @@ public:
}
void set_header(const string &objname, const string &value) {
- set_header(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_header(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
value);
}
- void set_header(hobject_t hoid,
+ void set_header(ghobject_t hoid,
const string &value) {
bufferlist header;
header.append(bufferptr(value.c_str(), value.size() + 1));
@@ -97,11 +97,11 @@ public:
}
int get_header(const string &objname, string *value) {
- return get_header(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_header(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
value);
}
- int get_header(hobject_t hoid,
+ int get_header(ghobject_t hoid,
string *value) {
bufferlist header;
int r = db->get_header(hoid, &header);
@@ -115,11 +115,11 @@ public:
}
int get_xattr(const string &objname, const string &key, string *value) {
- return get_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- int get_xattr(hobject_t hoid,
+ int get_xattr(ghobject_t hoid,
string key, string *value) {
set<string> to_get;
to_get.insert(key);
@@ -135,11 +135,11 @@ public:
}
int get_key(const string &objname, const string &key, string *value) {
- return get_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- int get_key(hobject_t hoid,
+ int get_key(ghobject_t hoid,
string key, string *value) {
set<string> to_get;
to_get.insert(key);
@@ -155,11 +155,11 @@ public:
}
void remove_key(const string &objname, const string &key) {
- remove_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ remove_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key);
}
- void remove_key(hobject_t hoid,
+ void remove_key(ghobject_t hoid,
string key) {
set<string> to_remove;
to_remove.insert(key);
@@ -167,11 +167,11 @@ public:
}
void remove_xattr(const string &objname, const string &key) {
- remove_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ remove_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key);
}
- void remove_xattr(hobject_t hoid,
+ void remove_xattr(ghobject_t hoid,
string key) {
set<string> to_remove;
to_remove.insert(key);
@@ -179,20 +179,20 @@ public:
}
void clone(const string &objname, const string &target) {
- clone(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
- hobject_t(sobject_t(target, CEPH_NOSNAP)));
+ clone(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
+ ghobject_t(hobject_t(sobject_t(target, CEPH_NOSNAP))));
}
- void clone(hobject_t hoid,
- hobject_t hoid2) {
+ void clone(ghobject_t hoid,
+ ghobject_t hoid2) {
db->clone(hoid, hoid2);
}
void clear(const string &objname) {
- clear(hobject_t(sobject_t(objname, CEPH_NOSNAP)));
+ clear(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))));
}
- void clear(hobject_t hoid) {
+ void clear(ghobject_t hoid) {
db->clear(hoid);
}
@@ -543,7 +543,7 @@ int main(int argc, char **argv) {
}
TEST_F(ObjectMapTest, CreateOneObject) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)), 100, 0);
map<string, bufferlist> to_set;
string key("test");
string val("test_val");
@@ -579,8 +579,8 @@ TEST_F(ObjectMapTest, CreateOneObject) {
}
TEST_F(ObjectMapTest, CloneOneObject) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
- hobject_t hoid2(sobject_t("foo2", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)), 200, 0);
+ ghobject_t hoid2(hobject_t(sobject_t("foo2", CEPH_NOSNAP)), 201, 1);
tester.set_key(hoid, "foo", "bar");
tester.set_key(hoid, "foo2", "bar2");
@@ -640,8 +640,8 @@ TEST_F(ObjectMapTest, CloneOneObject) {
}
TEST_F(ObjectMapTest, OddEvenClone) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
- hobject_t hoid2(sobject_t("foo2", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ ghobject_t hoid2(hobject_t(sobject_t("foo2", CEPH_NOSNAP)));
for (unsigned i = 0; i < 1000; ++i) {
tester.set_key(hoid, "foo" + num_str(i), "bar" + num_str(i));
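These tests now key the object map by ghobject_t, which wraps an hobject_t and, judging by the constructors exercised here and the new SHARDS incompat feature plus `typedef uint8_t shard_id_t` in osd_types.h, adds a generation and a shard id. The two forms used above, side by side (the argument reading is an assumption, not confirmed by this diff):

// Assumed reading of the constructor: (hobject, generation, shard).
hobject_t  plain(sobject_t("foo", CEPH_NOSNAP));
ghobject_t head(plain);             // no generation/shard: the common case
ghobject_t old_gen(plain, 100, 0);  // generation 100 on shard 0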
diff --git a/src/test/ceph_compatset.cc b/src/test/ceph_compatset.cc
new file mode 100644
index 00000000000..2b57db01ab9
--- /dev/null
+++ b/src/test/ceph_compatset.cc
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <fstream>
+#include <iostream>
+#include <errno.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <ctype.h>
+#include <boost/scoped_ptr.hpp>
+#include <string>
+
+#include "include/types.h"
+#include "include/compat.h"
+
+//#undef assert
+//#define assert(foo) if (!(foo)) abort();
+
+#include "include/CompatSet.h"
+
+#include "gtest/gtest.h"
+#include <vector>
+
+TEST(CephCompatSet, AllSet) {
+ CompatSet::FeatureSet compat;
+ CompatSet::FeatureSet ro;
+ CompatSet::FeatureSet incompat;
+
+ EXPECT_THROW(compat.insert(CompatSet::Feature(0, "test")), FailedAssertion);
+ EXPECT_THROW(compat.insert(CompatSet::Feature(64, "test")), FailedAssertion);
+
+ for (int i = 1; i < 64; i++) {
+ stringstream cname;
+ cname << string("c") << i;
+ compat.insert(CompatSet::Feature(i,cname.str().c_str()));
+ stringstream roname;
+ roname << string("r") << i;
+ ro.insert(CompatSet::Feature(i,roname.str().c_str()));
+ stringstream iname;
+ iname << string("i") << i;
+ incompat.insert(CompatSet::Feature(i,iname.str().c_str()));
+ }
+ CompatSet tcs(compat, ro, incompat);
+
+ //cout << tcs << std::endl;
+
+ //Due to a workaround for a bug, bit 0 is always set even though it is
+ //not a legal feature.
+ EXPECT_EQ(tcs.compat.mask, (uint64_t)0xffffffffffffffff);
+ EXPECT_EQ(tcs.ro_compat.mask, (uint64_t)0xffffffffffffffff);
+ EXPECT_EQ(tcs.incompat.mask, (uint64_t)0xffffffffffffffff);
+
+ for (int i = 1; i < 64; i++) {
+ EXPECT_TRUE(tcs.compat.contains(i));
+ stringstream cname;
+ cname << string("c") << i;
+ EXPECT_TRUE(tcs.compat.contains(CompatSet::Feature(i,cname.str().c_str())));
+ tcs.compat.remove(i);
+
+ EXPECT_TRUE(tcs.ro_compat.contains(i));
+ stringstream roname;
+ roname << string("r") << i;
+ EXPECT_TRUE(tcs.ro_compat.contains(CompatSet::Feature(i,roname.str().c_str())));
+ tcs.ro_compat.remove(i);
+
+ EXPECT_TRUE(tcs.incompat.contains(i));
+ stringstream iname;
+ iname << string("i") << i;
+ EXPECT_TRUE(tcs.incompat.contains(CompatSet::Feature(i,iname.str().c_str())));
+ tcs.incompat.remove(i);
+ }
+ //Due to a workaround for a bug, bit 0 is always set even though it is
+ //not a legal feature.
+ EXPECT_EQ(tcs.compat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.compat.names.empty());
+ EXPECT_EQ(tcs.ro_compat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.ro_compat.names.empty());
+ EXPECT_EQ(tcs.incompat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.incompat.names.empty());
+}
+
+TEST(CephCompatSet, other) {
+ CompatSet s1, s2, s1dup;
+
+ s1.compat.insert(CompatSet::Feature(1, "c1"));
+ s1.compat.insert(CompatSet::Feature(2, "c2"));
+ s1.compat.insert(CompatSet::Feature(32, "c32"));
+ s1.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s1.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s2.compat.insert(CompatSet::Feature(1, "c1"));
+ s2.compat.insert(CompatSet::Feature(32, "c32"));
+ s2.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s2.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s1dup = s1;
+
+ //Check exact match
+ EXPECT_EQ(s1.compare(s1dup), 0);
+
+ //Check superset
+ EXPECT_EQ(s1.compare(s2), 1);
+
+ //Check missing features
+ EXPECT_EQ(s2.compare(s1), -1);
+
+ CompatSet diff = s2.unsupported(s1);
+ EXPECT_EQ(diff.compat.mask, (uint64_t)1<<2 | 1);
+ EXPECT_EQ(diff.ro_compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.incompat.mask, (uint64_t)1);
+
+ CompatSet s3 = s1;
+ s3.incompat.insert(CompatSet::Feature(4, "i4"));
+
+ diff = s1.unsupported(s3);
+ EXPECT_EQ(diff.compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.ro_compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.incompat.mask, (uint64_t)1<<4 | 1);
+}
+
+TEST(CephCompatSet, merge) {
+ CompatSet s1, s2, s1dup, s2dup;
+
+ s1.compat.insert(CompatSet::Feature(1, "c1"));
+ s1.compat.insert(CompatSet::Feature(2, "c2"));
+ s1.compat.insert(CompatSet::Feature(32, "c32"));
+ s1.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s1.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s1dup = s1;
+
+ s2.compat.insert(CompatSet::Feature(1, "c1"));
+ s2.compat.insert(CompatSet::Feature(32, "c32"));
+ s2.ro_compat.insert(CompatSet::Feature(1, "r1"));
+ s2.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s2.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s2dup = s2;
+
+ //Nothing to merge if they are the same
+ EXPECT_FALSE(s1.merge(s1dup));
+ EXPECT_FALSE(s2.merge(s2dup));
+
+ EXPECT_TRUE(s1.merge(s2));
+ EXPECT_EQ(s1.compat.mask, (uint64_t)1<<1 | (uint64_t)1<<2 | (uint64_t)1<<32 | 1);
+ EXPECT_EQ(s1.ro_compat.mask, (uint64_t)1<<1 | (uint64_t)1<<63 | 1);
+ EXPECT_EQ(s1.incompat.mask, (uint64_t)1<<1 | 1);
+
+ EXPECT_TRUE(s2.merge(s1dup));
+ EXPECT_EQ(s2.compat.mask, (uint64_t)1<<1 | (uint64_t)1<<2 | (uint64_t)1<<32 | 1);
+ EXPECT_EQ(s2.ro_compat.mask, (uint64_t)1<<1 | (uint64_t)1<<63 | 1);
+ EXPECT_EQ(s2.incompat.mask, (uint64_t)1<<1 | 1);
+}
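The expectations above pin down the mask layout: inserting Feature(i, name) sets bit i of the 64-bit mask, and bit 0 is always set as the workaround the comments mention, so removing every inserted feature leaves mask == 1 rather than 0. Distilled to a single check:

#include "include/CompatSet.h"
#include <assert.h>

int main() {
  CompatSet::FeatureSet fs;
  fs.insert(CompatSet::Feature(1, "c1"));
  fs.insert(CompatSet::Feature(2, "c2"));
  assert(fs.mask == (((uint64_t)1<<1) | ((uint64_t)1<<2) | 1));  // bit 0 workaround
  return 0;
}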
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 524d0edd8ba..2def60107dc 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -116,9 +116,10 @@
<date> := "YYYY-MM-DD[ hh:mm:ss]"
- --conf/-c Read configuration from the given configuration file
- --id/-i set ID portion of my name
- --name/-n set name (TYPE.ID)
- --version show version and quit
+ --conf/-c FILE read configuration from the given configuration file
+ --id/-i ID set ID portion of my name
+ --name/-n TYPE.ID set name
+ --cluster NAME set cluster name (default: ceph)
+ --version show version and quit
[1]
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index 1ad79385a7e..754e11f9357 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -76,4 +76,5 @@
--pretty-format make json or xml output more readable
--no-settle do not wait for udevadm to settle on map/unmap
--no-progress do not show progress for long-running commands
+ --read-only set device readonly when mapping image
--allow-shrink allow shrinking of an image when resizing
diff --git a/src/test/common/get_command_descriptions.cc b/src/test/common/get_command_descriptions.cc
new file mode 100644
index 00000000000..aff5575b8c4
--- /dev/null
+++ b/src/test/common/get_command_descriptions.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <signal.h>
+#include "mon/Monitor.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_mon
+
+static void usage(ostream &out)
+{
+ out << "usage: get_command_descriptions [options ...]" << std::endl;
+ out << "print on stdout the result of JSON formatted options\n";
+ out << "found in mon/MonCommands.h as produced by the\n";
+ out << "Monitor.cc::get_command_descriptions function.\n";
+ out << "Designed as a helper for ceph_argparse.py unit tests.\n";
+ out << "\n";
+ out << " --all all of mon/MonCommands.h \n";
+ out << " --pull585 reproduce the bug fixed by #585\n";
+ out << "\n";
+ out << "Examples:\n";
+ out << " get_command_descriptions --all\n";
+ out << " get_command_descriptions --pull585\n";
+}
+
+static void json_print(const MonCommand *mon_commands, int size)
+{
+ bufferlist rdata;
+ Formatter *f = new_formatter("json");
+ get_command_descriptions(mon_commands, size, f, &rdata);
+ delete f;
+ string data(rdata.c_str(), rdata.length());
+ dout(0) << data << dendl;
+}
+
+static void all()
+{
+#undef COMMAND
+ MonCommand mon_commands[] = {
+#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
+ {parsesig, helptext, modulename, req_perms, avail},
+#include <mon/MonCommands.h>
+ };
+
+ json_print(mon_commands, ARRAY_SIZE(mon_commands));
+}
+
+// syntax error https://github.com/ceph/ceph/pull/585
+static void pull585()
+{
+ MonCommand mon_commands[] = {
+ { "osd pool create "
+ "name=pool,type=CephPoolname "
+ "name=pg_num,type=CephInt,range=0 "
+ "name=pgp_num,type=CephInt,range=0,req=false" // !!! missing trailing space
+ "name=properties,type=CephString,n=N,req=false,goodchars=[A-Za-z0-9-_.=]",
+ "create pool", "osd", "rw", "cli,rest" }
+ };
+
+ json_print(mon_commands, ARRAY_SIZE(mon_commands));
+}
+
+int main(int argc, char **argv) {
+ vector<const char*> args;
+ argv_to_vec(argc, (const char **)argv, args);
+
+ global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ if (args.empty()) {
+ usage(cerr);
+ exit(1);
+ }
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ++i) {
+ string err;
+
+ if (*i == string("help") || *i == string("-h") || *i == string("--help")) {
+ usage(cout);
+ exit(0);
+ } else if (*i == string("--all")) {
+ all();
+ } else if (*i == string("--pull585")) {
+ pull585();
+ }
+ }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ * make get_command_descriptions &&
+ * ./get_command_descriptions --all --pull585"
+ * End:
+ */
+
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
new file mode 100644
index 00000000000..8e3661b2cc1
--- /dev/null
+++ b/src/test/common/test_bloom_filter.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "include/stringify.h"
+#include "common/bloom_filter.hpp"
+
+TEST(BloomFilter, Basic) {
+ bloom_filter bf(10, .1, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+}
+
+TEST(BloomFilter, Sweep) {
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ for (int n = 0; n < max; n++)
+ bf.insert("ok" + stringify(n));
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains("asdf" + stringify(n)))
+ hit++;
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+
+ }
+ }
+}
+
+TEST(BloomFilter, SweepInt) {
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ for (int n = 0; n < max; n++)
+ bf.insert(n);
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains(100000 + n))
+ hit++;
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+ ASSERT_TRUE(actual > fpp / 10);
+ }
+ }
+}
+
+
+TEST(BloomFilter, BinSweep) {
+ int total_max = 16384;
+ float total_fpp = .01;
+ std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
+ for (int bins = 1; bins < 16; ++bins) {
+ int max = total_max / bins;
+ float fpp = total_fpp / bins; // pow(total_fpp, bins);
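+ // splitting the target fpp across bins follows the union bound:
+ // P(any of n bins false-positives) <= n * per-bin fpp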
+
+ std::vector<bloom_filter*> ls;
+ bufferlist bl;
+ for (int i=0; i<bins; i++) {
+ ls.push_back(new bloom_filter(max, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert(10000 * (i+1) + j);
+ }
+ ::encode(*ls.front(), bl);
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the internal int hash is weak!!
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp
+ << " actual-fpp " << actual
+ << " total-size " << bl.length() << std::endl;
+ }
+}
+
+// disable these tests; doing dual insertions in consecutive filters
+// appears to be equivalent to doing a single insertion in a bloom
+// filter that is twice as big.
+#if 0
+
+// test the fpp over a sequence of bloom filters, each with unique
+// items inserted into it.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp
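+// (e.g. with per-filter fpp = .01 and 4 filters, roughly .04)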
+TEST(BloomFilter, Sequence) {
+
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl;
+ }
+}
+
+// test the fpp over a sequence of bloom filters, where actual values
+// are always inserted into a consecutive pair of filters. in order
+// to have a false positive, we need to falsely match two consecutive
+// filters.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp^2
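+// (e.g. with per-filter fpp = .01 and 128 filters, 128 * .01^2 = .0128)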
+TEST(BloomFilter, SequenceDouble) {
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ int run = 0;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ run++;
+ if (run >= 2) {
+ hit++;
+ break;
+ }
+ } else {
+ run = 0;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual
+ << " expected " << (fpp*fpp*(double)seq) << std::endl;
+ }
+}
+
+#endif
diff --git a/src/test/common/test_sloppy_crc_map.cc b/src/test/common/test_sloppy_crc_map.cc
new file mode 100644
index 00000000000..2650f4f960d
--- /dev/null
+++ b/src/test/common/test_sloppy_crc_map.cc
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+#include <gtest/gtest.h>
+
+void dump(const SloppyCRCMap& scm)
+{
+ Formatter *f = new_formatter("json-pretty");
+ f->open_object_section("map");
+ scm.dump(f);
+ f->close_section();
+ f->flush(cout);
+ delete f;
+}
+
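+// SloppyCRCMap::read() returns the number of crc-block mismatches it
+// detects (one per block; the block size here is 4 bytes).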
+TEST(SloppyCRCMap, basic) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("The quick brown fox jumped over a fence whose color I forget.");
+ b.append("asdf");
+
+ scm.write(0, a.length(), a);
+ if (0)
+ dump(scm);
+ ASSERT_EQ(0, scm.read(0, a.length(), a, &cout));
+
+ scm.write(12, b.length(), b);
+ if (0)
+ dump(scm);
+
+ ASSERT_EQ(0, scm.read(12, b.length(), b, &cout));
+ ASSERT_EQ(1, scm.read(0, a.length(), a, &cout));
+}
+
+TEST(SloppyCRCMap, truncate) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("asdf");
+ b.append("qwer");
+
+ scm.write(0, a.length(), a);
+ scm.write(4, a.length(), a);
+ ASSERT_EQ(0, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+ scm.truncate(4);
+ ASSERT_EQ(0, scm.read(4, 4, b, &cout));
+}
+
+TEST(SloppyCRCMap, zero) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("asdf");
+ b.append("qwer");
+
+ scm.write(0, a.length(), a);
+ scm.write(4, a.length(), a);
+ ASSERT_EQ(0, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+ scm.zero(4, 4);
+ ASSERT_EQ(1, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+
+ bufferptr bp(4);
+ bp.zero();
+ bufferlist c;
+ c.append(bp);
+ ASSERT_EQ(0, scm.read(0, 4, a, &cout));
+ ASSERT_EQ(0, scm.read(4, 4, c, &cout));
+ scm.zero(0, 15);
+ ASSERT_EQ(1, scm.read(0, 4, a, &cout));
+ ASSERT_EQ(0, scm.read(0, 4, c, &cout));
+}
+
+TEST(SloppyCRCMap, clone_range) {
+ SloppyCRCMap src(4);
+ SloppyCRCMap dst(4);
+
+ bufferlist a, b;
+ a.append("asdfghjkl");
+ b.append("qwertyui");
+
+ src.write(0, a.length(), a);
+ src.write(8, a.length(), a);
+ src.write(16, a.length(), a);
+
+ dst.write(0, b.length(), b);
+ dst.clone_range(0, 8, 0, src);
+ ASSERT_EQ(2, dst.read(0, 8, b, &cout));
+ ASSERT_EQ(0, dst.read(8, 8, b, &cout));
+
+ dst.write(16, b.length(), b);
+ ASSERT_EQ(2, dst.read(16, 8, a, &cout));
+ dst.clone_range(16, 8, 16, src);
+ ASSERT_EQ(0, dst.read(16, 8, a, &cout));
+
+ dst.write(16, b.length(), b);
+ ASSERT_EQ(1, dst.read(16, 4, a, &cout));
+ dst.clone_range(16, 8, 2, src);
+ ASSERT_EQ(0, dst.read(16, 4, a, &cout));
+
+ dst.write(0, b.length(), b);
+ dst.write(8, b.length(), b);
+ ASSERT_EQ(2, dst.read(0, 8, a, &cout));
+ ASSERT_EQ(2, dst.read(8, 8, a, &cout));
+ dst.clone_range(2, 8, 0, src);
+ ASSERT_EQ(0, dst.read(0, 8, a, &cout));
+ ASSERT_EQ(0, dst.read(8, 4, a, &cout));
+}
diff --git a/src/test/common/test_util.cc b/src/test/common/test_util.cc
index 16713077cfc..cb22047c600 100644
--- a/src/test/common/test_util.cc
+++ b/src/test/common/test_util.cc
@@ -21,6 +21,7 @@ TEST(util, unit_to_bytesize)
{
ASSERT_EQ(1234ll, unit_to_bytesize("1234", &cerr));
ASSERT_EQ(1024ll, unit_to_bytesize("1K", &cerr));
+ ASSERT_EQ(1024ll, unit_to_bytesize("1k", &cerr));
ASSERT_EQ(1048576ll, unit_to_bytesize("1M", &cerr));
ASSERT_EQ(1073741824ll, unit_to_bytesize("1G", &cerr));
ASSERT_EQ(1099511627776ll, unit_to_bytesize("1T", &cerr));
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index d9a16240d5a..3ee9e03ff7d 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -4,6 +4,9 @@ TYPE(CompatSet)
#include "include/filepath.h"
TYPE(filepath)
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+
#include "common/snap_types.h"
TYPE(SnapContext)
TYPE(SnapRealmInfo)
@@ -16,6 +19,9 @@ TYPE(LogEntryKey)
TYPE(LogEntry)
TYPE(LogSummary)
+#include "common/SloppyCRCMap.h"
+TYPE(SloppyCRCMap)
+
#include "msg/msg_types.h"
TYPE(entity_name_t)
TYPE(entity_addr_t)
@@ -80,6 +86,7 @@ TYPE(SequencerPosition)
#include "common/hobject.h"
TYPE(hobject_t)
+TYPE(ghobject_t)
#include "mon/AuthMonitor.h"
TYPE(AuthMonitor::Incremental)
diff --git a/src/test/filestore/FileStoreDiff.cc b/src/test/filestore/FileStoreDiff.cc
index b2419f5e298..40c0b32d30c 100644
--- a/src/test/filestore/FileStoreDiff.cc
+++ b/src/test/filestore/FileStoreDiff.cc
@@ -131,7 +131,7 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
bool ret = false;
int err;
- std::vector<hobject_t> b_objects, a_objects;
+ std::vector<ghobject_t> b_objects, a_objects;
err = b_store->collection_list(coll, b_objects);
if (err < 0) {
dout(0) << "diff_objects list on verify coll " << coll.to_str()
@@ -151,11 +151,11 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
ret = true;
}
- std::vector<hobject_t>::iterator b_it = b_objects.begin();
- std::vector<hobject_t>::iterator a_it = b_objects.begin();
+ std::vector<ghobject_t>::iterator b_it = b_objects.begin();
+ std::vector<ghobject_t>::iterator a_it = a_objects.begin();
for (; b_it != b_objects.end(); ++b_it, ++a_it) {
- hobject_t b_obj = *b_it, a_obj = *a_it;
- if (b_obj.oid.name != a_obj.oid.name) {
+ ghobject_t b_obj = *b_it, a_obj = *a_it;
+ if (b_obj.hobj.oid.name != a_obj.hobj.oid.name) {
dout(0) << "diff_objects name mismatch on A object "
<< coll << "/" << a_obj << " and B object "
<< coll << "/" << b_obj << dendl;
@@ -167,7 +167,7 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
err = b_store->stat(coll, b_obj, &b_stat);
if (err < 0) {
dout(0) << "diff_objects error stating B object "
- << coll.to_str() << "/" << b_obj.oid.name << dendl;
+ << coll.to_str() << "/" << b_obj.hobj.oid.name << dendl;
ret = true;
}
err = a_store->stat(coll, a_obj, &a_stat);
diff --git a/src/test/filestore/store_test.cc b/src/test/filestore/store_test.cc
index 92104960127..50450f467ff 100644
--- a/src/test/filestore/store_test.cc
+++ b/src/test/filestore/store_test.cc
@@ -51,9 +51,9 @@ public:
}
};
-bool sorted(const vector<hobject_t> &in) {
- hobject_t start;
- for (vector<hobject_t>::const_iterator i = in.begin();
+bool sorted(const vector<ghobject_t> &in) {
+ ghobject_t start;
+ for (vector<ghobject_t>::const_iterator i = in.begin();
i != in.end();
++i) {
if (start > *i) return false;
@@ -105,7 +105,7 @@ TEST_F(StoreTest, SimpleObjectTest) {
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- hobject_t hoid(sobject_t("Object 1", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -133,7 +133,7 @@ TEST_F(StoreTest, SimpleObjectLongnameTest) {
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- hobject_t hoid(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -157,7 +157,7 @@ TEST_F(StoreTest, ManyObjectTest) {
coll_t cid("blah");
string base = "";
for (int i = 0; i < 100; ++i) base.append("aaaaa");
- set<hobject_t> created;
+ set<ghobject_t> created;
{
ObjectStore::Transaction t;
t.create_collection(cid);
@@ -171,27 +171,27 @@ TEST_F(StoreTest, ManyObjectTest) {
ObjectStore::Transaction t;
char buf[100];
snprintf(buf, sizeof(buf), "%d", i);
- hobject_t hoid(sobject_t(string(buf) + base, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP)));
t.touch(cid, hoid);
created.insert(hoid);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
struct stat buf;
ASSERT_TRUE(!store->stat(cid, *i, &buf));
}
- set<hobject_t> listed;
- vector<hobject_t> objects;
+ set<ghobject_t> listed;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
cerr << "objects.size() is " << objects.size() << std::endl;
- for (vector<hobject_t> ::iterator i = objects.begin();
+ for (vector<ghobject_t> ::iterator i = objects.begin();
i != objects.end();
++i) {
listed.insert(*i);
@@ -199,11 +199,11 @@ TEST_F(StoreTest, ManyObjectTest) {
}
ASSERT_TRUE(listed.size() == created.size());
- hobject_t start, next;
+ ghobject_t start, next;
objects.clear();
r = store->collection_list_partial(
cid,
- hobject_t::get_max(),
+ ghobject_t::get_max(),
50,
60,
0,
@@ -234,13 +234,13 @@ TEST_F(StoreTest, ManyObjectTest) {
}
cerr << "listed.size() is " << listed.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
- for (set<hobject_t>::iterator i = listed.begin();
+ for (set<ghobject_t>::iterator i = listed.begin();
i != listed.end();
++i) {
ASSERT_TRUE(created.count(*i));
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
ObjectStore::Transaction t;
@@ -259,7 +259,7 @@ TEST_F(StoreTest, ManyObjectTest) {
class ObjectGenerator {
public:
- virtual hobject_t create_object(gen_type *gen) = 0;
+ virtual ghobject_t create_object(gen_type *gen) = 0;
virtual ~ObjectGenerator() {}
};
@@ -267,7 +267,7 @@ class MixedGenerator : public ObjectGenerator {
public:
unsigned seq;
MixedGenerator() : seq(0) {}
- hobject_t create_object(gen_type *gen) {
+ ghobject_t create_object(gen_type *gen) {
char buf[100];
snprintf(buf, sizeof(buf), "%u", seq);
@@ -283,7 +283,7 @@ public:
// hash
//boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5);
++seq;
- return hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, "");
+ return ghobject_t(hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, ""));
}
};
@@ -293,8 +293,8 @@ public:
static const unsigned max_objects = 3000;
coll_t cid;
unsigned in_flight;
- set<hobject_t> available_objects;
- set<hobject_t> in_use_objects;
+ set<ghobject_t> available_objects;
+ set<ghobject_t> in_use_objects;
ObjectGenerator *object_gen;
gen_type *rng;
ObjectStore *store;
@@ -307,9 +307,9 @@ public:
public:
SyntheticWorkloadState *state;
ObjectStore::Transaction *t;
- hobject_t hoid;
+ ghobject_t hoid;
C_SyntheticOnReadable(SyntheticWorkloadState *state,
- ObjectStore::Transaction *t, hobject_t hoid)
+ ObjectStore::Transaction *t, ghobject_t hoid)
: state(state), t(t), hoid(hoid) {}
void finish(int r) {
@@ -339,14 +339,14 @@ public:
return store->apply_transaction(t);
}
- hobject_t get_uniform_random_object() {
+ ghobject_t get_uniform_random_object() {
while (in_flight >= max_in_flight || available_objects.empty())
cond.Wait(lock);
boost::uniform_int<> choose(0, available_objects.size() - 1);
int index = choose(*rng);
- set<hobject_t>::iterator i = available_objects.begin();
+ set<ghobject_t>::iterator i = available_objects.begin();
for ( ; index > 0; --index, ++i) ;
- hobject_t ret = *i;
+ ghobject_t ret = *i;
available_objects.erase(i);
return ret;
}
@@ -375,7 +375,7 @@ public:
if (!can_create())
return -ENOSPC;
wait_for_ready();
- hobject_t new_obj = object_gen->create_object(rng);
+ ghobject_t new_obj = object_gen->create_object(rng);
in_use_objects.insert(new_obj);
available_objects.erase(new_obj);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -388,9 +388,9 @@ public:
Mutex::Locker locker(lock);
while (in_flight)
cond.Wait(lock);
- vector<hobject_t> objects;
- set<hobject_t> objects_set, objects_set2;
- hobject_t next, current;
+ vector<ghobject_t> objects;
+ set<ghobject_t> objects_set, objects_set2;
+ ghobject_t next, current;
while (1) {
cerr << "scanning..." << std::endl;
int r = store->collection_list_partial(cid, current, 50, 100,
@@ -403,7 +403,7 @@ public:
current = next;
}
ASSERT_EQ(objects_set.size(), available_objects.size());
- for (set<hobject_t>::iterator i = objects_set.begin();
+ for (set<ghobject_t>::iterator i = objects_set.begin();
i != objects_set.end();
++i) {
ASSERT_GT(available_objects.count(*i), (unsigned)0);
@@ -413,7 +413,7 @@ public:
ASSERT_EQ(r, 0);
objects_set2.insert(objects.begin(), objects.end());
ASSERT_EQ(objects_set2.size(), available_objects.size());
- for (set<hobject_t>::iterator i = objects_set2.begin();
+ for (set<ghobject_t>::iterator i = objects_set2.begin();
i != objects_set2.end();
++i) {
ASSERT_GT(available_objects.count(*i), (unsigned)0);
@@ -421,7 +421,7 @@ public:
}
int stat() {
- hobject_t hoid;
+ ghobject_t hoid;
{
Mutex::Locker locker(lock);
if (!can_unlink())
@@ -446,7 +446,7 @@ public:
Mutex::Locker locker(lock);
if (!can_unlink())
return -ENOENT;
- hobject_t to_remove = get_uniform_random_object();
+ ghobject_t to_remove = get_uniform_random_object();
ObjectStore::Transaction *t = new ObjectStore::Transaction;
t->remove(cid, to_remove);
++in_flight;
@@ -505,7 +505,7 @@ TEST_F(StoreTest, HashCollisionTest) {
}
string base = "";
for (int i = 0; i < 100; ++i) base.append("aaaaa");
- set<hobject_t> created;
+ set<ghobject_t> created;
for (int n = 0; n < 10; ++n) {
char nbuf[100];
sprintf(nbuf, "n%d", n);
@@ -515,7 +515,7 @@ TEST_F(StoreTest, HashCollisionTest) {
if (!(i % 5)) {
cerr << "Object n" << n << " "<< i << std::endl;
}
- hobject_t hoid(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf));
+ ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -525,21 +525,21 @@ TEST_F(StoreTest, HashCollisionTest) {
created.insert(hoid);
}
}
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
- set<hobject_t> listed(objects.begin(), objects.end());
+ set<ghobject_t> listed(objects.begin(), objects.end());
cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
objects.clear();
listed.clear();
- hobject_t current, next;
+ ghobject_t current, next;
while (1) {
r = store->collection_list_partial(cid, current, 50, 60,
0, &objects, &next);
ASSERT_EQ(r, 0);
ASSERT_TRUE(sorted(objects));
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
if (listed.count(*i))
@@ -555,13 +555,13 @@ TEST_F(StoreTest, HashCollisionTest) {
}
cerr << "listed.size() is " << listed.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
- for (set<hobject_t>::iterator i = listed.begin();
+ for (set<ghobject_t>::iterator i = listed.begin();
i != listed.end();
++i) {
ASSERT_TRUE(created.count(*i));
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
ObjectStore::Transaction t;
@@ -576,7 +576,7 @@ TEST_F(StoreTest, HashCollisionTest) {
TEST_F(StoreTest, OMapTest) {
coll_t cid("blah");
- hobject_t hoid("tesomap", "", CEPH_NOSNAP, 0, 0, "");
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
int r;
{
ObjectStore::Transaction t;
@@ -672,7 +672,7 @@ TEST_F(StoreTest, OMapTest) {
TEST_F(StoreTest, XattrTest) {
coll_t cid("blah");
- hobject_t hoid("tesomap", "", CEPH_NOSNAP, 0, 0, "");
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
bufferlist big;
for (unsigned i = 0; i < 10000; ++i) {
big.append('\0');
@@ -769,12 +769,12 @@ void colsplittest(
for (uint32_t i = 0; i < 2*num_objects; ++i) {
stringstream objname;
objname << "obj" << i;
- t.touch(cid, hobject_t(
+ t.touch(cid, ghobject_t(hobject_t(
objname.str(),
"",
CEPH_NOSNAP,
i<<common_suffix_size,
- 0, ""));
+ 0, "")));
}
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -788,14 +788,14 @@ void colsplittest(
}
ObjectStore::Transaction t;
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
ASSERT_EQ(objects.size(), num_objects);
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- ASSERT_EQ(!(i->hash & (1<<common_suffix_size)), 0u);
+ ASSERT_EQ(!(i->hobj.hash & (1<<common_suffix_size)), 0u);
t.remove(cid, *i);
}
@@ -803,10 +803,10 @@ void colsplittest(
r = store->collection_list(tid, objects);
ASSERT_EQ(r, 0);
ASSERT_EQ(objects.size(), num_objects);
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- ASSERT_EQ(i->hash & (1<<common_suffix_size), 0u);
+ ASSERT_EQ(i->hobj.hash & (1<<common_suffix_size), 0u);
t.remove(tid, *i);
}
@@ -848,12 +848,12 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Making objects" << std::endl;
for (int i = 0; i < 360; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
+ ghobject_t o;
if (i < 8) {
- o.hash = (i << 16) | 0xA1;
+ o.hobj.hash = (i << 16) | 0xA1;
t.touch(cid, o);
}
- o.hash = (i << 16) | 0xB1;
+ o.hobj.hash = (i << 16) | 0xB1;
t.touch(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -861,8 +861,8 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Removing half" << std::endl;
for (int i = 1; i < 8; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
t.remove(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -870,24 +870,24 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Checking" << std::endl;
for (int i = 1; i < 8; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
bool exists = store->exists(cid, o);
ASSERT_EQ(exists, false);
}
{
- hobject_t o;
- o.hash = 0xA1;
+ ghobject_t o;
+ o.hobj.hash = 0xA1;
bool exists = store->exists(cid, o);
ASSERT_EQ(exists, true);
}
std::cout << "Cleanup" << std::endl;
for (int i = 0; i < 360; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
t.remove(cid, o);
- o.hash = (i << 16) | 0xB1;
+ o.hobj.hash = (i << 16) | 0xB1;
t.remove(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
diff --git a/src/test/filestore/workload_generator.cc b/src/test/filestore/workload_generator.cc
index 496379d7ad1..704d93021e2 100644
--- a/src/test/filestore/workload_generator.cc
+++ b/src/test/filestore/workload_generator.cc
@@ -344,12 +344,12 @@ void WorkloadGenerator::do_destroy_collection(ObjectStore::Transaction *t,
{
m_nr_runs.set(0);
entry->m_osr.flush();
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
m_store->collection_list(entry->m_coll, ls);
dout(2) << __func__ << " coll " << entry->m_coll
<< " (" << ls.size() << " objects)" << dendl;
- for (vector<hobject_t>::iterator it = ls.begin(); it < ls.end(); ++it) {
+ for (vector<ghobject_t>::iterator it = ls.begin(); it < ls.end(); ++it) {
t->remove(entry->m_coll, *it);
}
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 803c8b1cc77..9abac9c412a 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -538,21 +538,25 @@ TEST(LibRadosMisc, BigAttrPP) {
bufferlist got;
- bl.clear();
- got.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size));
- ASSERT_EQ(0, ioctx.setxattr("foo", "one", bl));
- ASSERT_EQ((int)bl.length(), ioctx.getxattr("foo", "one", got));
- ASSERT_TRUE(bl.contents_equal(got));
+ if (g_conf->osd_max_attr_size) {
+ bl.clear();
+ got.clear();
+ bl.append(buffer::create(g_conf->osd_max_attr_size));
+ ASSERT_EQ(0, ioctx.setxattr("foo", "one", bl));
+ ASSERT_EQ((int)bl.length(), ioctx.getxattr("foo", "one", got));
+ ASSERT_TRUE(bl.contents_equal(got));
- bl.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size+1));
- ASSERT_EQ(-EFBIG, ioctx.setxattr("foo", "one", bl));
+ bl.clear();
+ bl.append(buffer::create(g_conf->osd_max_attr_size+1));
+ ASSERT_EQ(-EFBIG, ioctx.setxattr("foo", "one", bl));
+ } else {
+ cout << "osd_max_attr_size == 0; skipping test" << std::endl;
+ }
for (int i=0; i<1000; i++) {
bl.clear();
got.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size));
+ bl.append(buffer::create(MIN(g_conf->osd_max_attr_size, 1024)));
char n[10];
snprintf(n, sizeof(n), "a%d", i);
ASSERT_EQ(0, ioctx.setxattr("foo", n, bl));
@@ -643,6 +647,60 @@ TEST(LibRadosMisc, CopyPP) {
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
}
+TEST(LibRadosMisc, Dirty) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
+
+ {
+ ObjectWriteOperation op;
+ op.create(true);
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_TRUE(dirty);
+ ASSERT_EQ(0, r);
+ }
+ {
+ ObjectWriteOperation op;
+ op.undirty();
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_FALSE(dirty);
+ ASSERT_EQ(0, r);
+ }
+ {
+ ObjectWriteOperation op;
+ op.truncate(0); // still a write even though it is a no-op
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_TRUE(dirty);
+ ASSERT_EQ(0, r);
+ }
+
+ ioctx.close();
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/os/TestFlatIndex.cc b/src/test/os/TestFlatIndex.cc
index 6db4f6c4aa5..53d2bbe6376 100644
--- a/src/test/os/TestFlatIndex.cc
+++ b/src/test/os/TestFlatIndex.cc
@@ -49,8 +49,8 @@ TEST(FlatIndex, collection) {
uint64_t hash = 111;
uint64_t pool = 222;
const std::string object_name(10, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
- vector<hobject_t> ls;
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
+ vector<ghobject_t> ls;
ASSERT_DEATH(index.collection_list_partial(hoid, 0, 0, 0, &ls, &hoid), "0");
}
@@ -70,7 +70,7 @@ TEST(FlatIndex, created_unlink) {
CollectionIndex::IndexedPath indexed_path;
index->set_ref(index);
const std::string object_name(10, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
int exists;
EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
EXPECT_EQ(0, exists);
@@ -88,7 +88,7 @@ TEST(FlatIndex, created_unlink) {
CollectionIndex::IndexedPath indexed_path;
index->set_ref(index);
const std::string object_name(1024, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
int exists;
EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
EXPECT_EQ(0, exists);
@@ -110,10 +110,10 @@ TEST(FlatIndex, collection_list) {
const std::string filename("PATH/" + object_name + "_head");
EXPECT_EQ(0, ::close(::creat(filename.c_str(), 0600)));
std::tr1::shared_ptr<CollectionIndex> index(new FlatIndex(collection, base_path));
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
index->collection_list(&ls);
EXPECT_EQ((unsigned)1, ls.size());
- EXPECT_EQ(object_name, ls[0].oid.name);
+ EXPECT_EQ(object_name, ls[0].hobj.oid.name);
EXPECT_EQ(0, ::system("rm -fr PATH"));
}
diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc
index 3947329d995..02578eb4a71 100644
--- a/src/test/os/TestLFNIndex.cc
+++ b/src/test/os/TestLFNIndex.cc
@@ -45,10 +45,10 @@ public:
std::tr1::shared_ptr<CollectionIndex> dest
) { return 0; }
- void test_generate_and_parse(const hobject_t &hoid, const std::string &mangled_expected) {
+ void test_generate_and_parse(const ghobject_t &hoid, const std::string &mangled_expected) {
const std::string mangled_name = lfn_generate_object_name(hoid);
EXPECT_EQ(mangled_expected, mangled_name);
- hobject_t hoid_parsed;
+ ghobject_t hoid_parsed;
EXPECT_TRUE(lfn_parse_object_name(mangled_name, &hoid_parsed));
EXPECT_EQ(hoid, hoid_parsed);
}
@@ -58,34 +58,34 @@ protected:
virtual int _created(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
const string &mangled_name
) { return 0; }
virtual int _remove(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
const string &mangled_name
) { return 0; }
virtual int _lookup(
- const hobject_t &hoid,
+ const ghobject_t &hoid,
vector<string> *path,
string *mangled_name,
int *exists
) { return 0; }
virtual int _collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
) { return 0; }
virtual int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
) { return 0; }
};
@@ -101,9 +101,9 @@ TEST_F(TestHASH_INDEX_TAG, generate_and_parse_name) {
uint64_t hash = 0xABABABAB;
uint64_t pool = -1;
- test_generate_and_parse(hobject_t(object_t(".A/B_\\C.D"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t(".A/B_\\C.D"), key, CEPH_NOSNAP, hash, pool, "")),
"\\.A\\sB_\\\\C.D_head_ABABABAB");
- test_generate_and_parse(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "")),
"\\dA_head_ABABABAB");
}
@@ -123,11 +123,11 @@ TEST_F(TestHASH_INDEX_TAG_2, generate_and_parse_name) {
{
std::string name(".XA/B_\\C.D");
name[1] = '\0';
- hobject_t hoid(object_t(name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""));
test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB");
}
- test_generate_and_parse(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "")),
"\\dA_KEY_head_ABABABAB");
}
@@ -143,21 +143,37 @@ TEST_F(TestHOBJECT_WITH_POOL, generate_and_parse_name) {
const std::string key("KEY");
uint64_t hash = 0xABABABAB;
uint64_t pool = 0xCDCDCDCD;
+ int64_t gen = 0xefefefefef;
+ int8_t shard_id = 0xb;
{
std::string name(".XA/B_\\C.D");
name[1] = '\0';
- hobject_t hoid(object_t(name), key, CEPH_NOSNAP, hash, pool, "");
- hoid.nspace = "NSPACE";
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""));
+ hoid.hobj.nspace = "NSPACE";
test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB_NSPACE_cdcdcdcd");
}
{
- hobject_t hoid(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "");
- hoid.nspace = "NSPACE";
+ ghobject_t hoid(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""));
+ hoid.hobj.nspace = "NSPACE";
test_generate_and_parse(hoid, "\\dA_KEY_head_ABABABAB_NSPACE_cdcdcdcd");
}
+ {
+ std::string name(".XA/B_\\C.D");
+ name[1] = '\0';
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""), gen, shard_id);
+ hoid.hobj.nspace = "NSPACE";
+
+ test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB_NSPACE_cdcdcdcd_efefefefef_b");
+ }
+ {
+ ghobject_t hoid(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""), gen, shard_id);
+ hoid.hobj.nspace = "NSPACE";
+
+ test_generate_and_parse(hoid, "\\dA_KEY_head_ABABABAB_NSPACE_cdcdcdcd_efefefefef_b");
+ }
}
class TestLFNIndex : public TestWrapLFNIndex, public ::testing::Test {
@@ -167,12 +183,12 @@ public:
virtual void SetUp() {
::chmod("PATH", 0700);
- ::system("rm -fr PATH");
- ::mkdir("PATH", 0700);
+ ASSERT_EQ(0, ::system("rm -fr PATH"));
+ ASSERT_EQ(0, ::mkdir("PATH", 0700));
}
virtual void TearDown() {
- ::system("rm -fr PATH");
+ ASSERT_EQ(0, ::system("rm -fr PATH"));
}
};
@@ -185,7 +201,7 @@ TEST_F(TestLFNIndex, remove_object) {
{
std::string mangled_name;
int exists = 666;
- hobject_t hoid(sobject_t("ABC", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("ABC", CEPH_NOSNAP)));
EXPECT_EQ(0, ::chmod("PATH", 0000));
EXPECT_EQ(-EACCES, remove_object(path, hoid));
@@ -205,7 +221,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
EXPECT_EQ(0, exists);
@@ -226,7 +242,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// PATH/AAA..._0_long => does not match long object name
@@ -275,7 +291,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// PATH/AAA..._0_long => matches long object name
@@ -323,7 +339,7 @@ TEST_F(TestLFNIndex, get_mangled_name) {
{
std::string mangled_name;
int exists = 666;
- hobject_t hoid(sobject_t("ABC", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("ABC", CEPH_NOSNAP)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
EXPECT_NE(std::string::npos, mangled_name.find("ABC__head"));
@@ -343,7 +359,7 @@ TEST_F(TestLFNIndex, get_mangled_name) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// long version of the mangled name and no matching
@@ -441,6 +457,11 @@ int main(int argc, char **argv) {
}
}
-// Local Variables:
-// compile-command: "cd ../.. ; make unittest_lfnindex ; valgrind --tool=memcheck ./unittest_lfnindex # --gtest_filter=TestLFNIndex.* --log-to-stderr=true --debug-filestore=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ * make unittest_lfnindex &&
+ * valgrind --tool=memcheck ./unittest_lfnindex \
+ * # --gtest_filter=TestLFNIndex.* --log-to-stderr=true --debug-filestore=20"
+ * End:
+ */
diff --git a/src/test/osd/ErasureCodeExample.h b/src/test/osd/ErasureCodeExample.h
index 896e614c6b5..0fd55187559 100644
--- a/src/test/osd/ErasureCodeExample.h
+++ b/src/test/osd/ErasureCodeExample.h
@@ -19,45 +19,63 @@
#include <unistd.h>
#include <errno.h>
+#include <algorithm>
#include <sstream>
#include "osd/ErasureCodeInterface.h"
+#define FIRST_DATA_CHUNK 0
+#define SECOND_DATA_CHUNK 1
#define DATA_CHUNKS 2u
+
+#define CODING_CHUNK 2
#define CODING_CHUNKS 1u
+#define MINIMUM_TO_RECOVER 2u
+
class ErasureCodeExample : public ErasureCodeInterface {
public:
- useconds_t delay;
- ErasureCodeExample(const map<std::string,std::string> &parameters) :
- delay(0)
- {
- if (parameters.find("usleep") != parameters.end()) {
- std::istringstream ss(parameters.find("usleep")->second);
- ss >> delay;
- usleep(delay);
- }
- }
-
virtual ~ErasureCodeExample() {}
virtual int minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
set<int> *minimum) {
- if (available_chunks.size() < DATA_CHUNKS)
+ if (includes(available_chunks.begin(), available_chunks.end(),
+ want_to_read.begin(), want_to_read.end())) {
+ *minimum = want_to_read;
+ return 0;
+ } else if (available_chunks.size() >= MINIMUM_TO_RECOVER) {
+ *minimum = available_chunks;
+ return 0;
+ } else {
return -EIO;
- set<int>::iterator i;
- unsigned j;
- for (i = available_chunks.begin(), j = 0; j < DATA_CHUNKS; i++, j++)
- minimum->insert(*i);
- return 0;
+ }
}
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
const map<int, int> &available,
set<int> *minimum) {
+ //
+ // If one chunk is more expensive to fetch than the others,
+ // recover it instead. For instance, if the cost reflects the
+ // time it takes for a chunk to be retrieved from a remote
+ // OSD and if CPU is cheap, it could make sense to recover
+ // instead of fetching the chunk.
+ //
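+ // For instance, with costs {0: 1, 1: 2, 2: 1}, chunk 1 is dropped
+ // and is recovered from chunks 0 and 2 instead.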
+ map<int, int> c2c(available);
+ if (c2c.size() > DATA_CHUNKS) {
+ if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] &&
+ c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK])
+ c2c.erase(FIRST_DATA_CHUNK);
+ else if (c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+ c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK])
+ c2c.erase(SECOND_DATA_CHUNK);
+ else if (c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
+ c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK])
+ c2c.erase(CODING_CHUNK);
+ }
set <int> available_chunks;
- for (map<int, int>::const_iterator i = available.begin();
- i != available.end();
+ for (map<int, int>::const_iterator i = c2c.begin();
+ i != c2c.end();
i++)
available_chunks.insert(i->first);
return minimum_to_decode(want_to_read, available_chunks, minimum);
@@ -66,16 +84,28 @@ public:
virtual int encode(const set<int> &want_to_encode,
const bufferlist &in,
map<int, bufferlist> *encoded) {
+ //
+ // make sure all data chunks have the same length, allocating
+ // padding if necessary.
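+ // e.g. a 5 byte input gives chunk_length = (5 / 2) + 1 = 3, i.e.
+ // two 3 byte data chunks followed by one 3 byte coding chunk.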
+ //
unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
bufferlist out(in);
bufferptr pad(length - in.length());
pad.zero(0, DATA_CHUNKS);
out.push_back(pad);
+ //
+ // compute the coding chunk with first chunk ^ second chunk
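+ // e.g. 'A' (0x41) ^ 'B' (0x42) = 0x03, and 0x03 ^ 0x42 = 0x41 ('A')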
+ //
char *p = out.c_str();
- for (unsigned i = 0; i < chunk_length * DATA_CHUNKS; i++)
- p[i + 2 * chunk_length] =
- p[i + 0 * chunk_length] ^ p[i + 1 * chunk_length];
+ for (unsigned i = 0; i < chunk_length; i++)
+ p[i + CODING_CHUNK * chunk_length] =
+ p[i + FIRST_DATA_CHUNK * chunk_length] ^
+ p[i + SECOND_DATA_CHUNK * chunk_length];
+ //
+ // populate the bufferlist with bufferptr pointing
+ // to chunk boundaries
+ //
const bufferptr ptr = out.buffers().front();
for (set<int>::iterator j = want_to_encode.begin();
j != want_to_encode.end();
@@ -89,14 +119,30 @@ public:
virtual int decode(const set<int> &want_to_read,
const map<int, bufferlist> &chunks,
map<int, bufferlist> *decoded) {
-
+ //
+ // All chunks have the same size
+ //
unsigned chunk_length = (*chunks.begin()).second.length();
for (set<int>::iterator i = want_to_read.begin();
i != want_to_read.end();
i++) {
- if (chunks.find(*i) != chunks.end())
+ if (chunks.find(*i) != chunks.end()) {
+ //
+ // If the chunk is available, just copy the bufferptr pointer
+ // to the decoded argument.
+ //
(*decoded)[*i] = chunks.find(*i)->second;
- else {
+ } else if (chunks.size() != 2) {
+ //
+ // If a chunk is missing and there are not enough chunks
+ // to recover, abort.
+ //
+ return -ERANGE;
+ } else {
+ //
+ // No matter what the missing chunk is, XOR of the other
+ // two recovers it.
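+ // (coding = first ^ second, so first = coding ^ second and
+ // second = coding ^ first)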
+ //
bufferptr chunk(chunk_length);
map<int, bufferlist>::const_iterator k = chunks.begin();
const char *a = k->second.buffers().front().c_str();
diff --git a/src/test/osd/ErasureCodePluginExample.cc b/src/test/osd/ErasureCodePluginExample.cc
index 1543b1cdaed..6ae61c0a18d 100644
--- a/src/test/osd/ErasureCodePluginExample.cc
+++ b/src/test/osd/ErasureCodePluginExample.cc
@@ -14,6 +14,8 @@
*
*/
+#include <unistd.h>
+
#include "osd/ErasureCodePlugin.h"
#include "ErasureCodeExample.h"
@@ -22,7 +24,7 @@ public:
virtual int factory(const map<std::string,std::string> &parameters,
ErasureCodeInterfaceRef *erasure_code)
{
- *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample(parameters));
+ *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample());
return 0;
}
};
diff --git a/src/test/osd/ErasureCodePluginFailToInitialize.cc b/src/test/osd/ErasureCodePluginFailToInitialize.cc
new file mode 100644
index 00000000000..cded6eef556
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginFailToInitialize.cc
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name)
+{
+ return -ESRCH;
+}
diff --git a/src/test/osd/ErasureCodePluginFailToRegister.cc b/src/test/osd/ErasureCodePluginFailToRegister.cc
new file mode 100644
index 00000000000..ea980b722ae
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginFailToRegister.cc
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name)
+{
+ return 0;
+}
diff --git a/src/test/osd/ErasureCodePluginHangs.cc b/src/test/osd/ErasureCodePluginHangs.cc
new file mode 100644
index 00000000000..ea73786b526
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginHangs.cc
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name)
+{
+ sleep(1000);
+ return 0;
+}
diff --git a/src/test/osd/ErasureCodePluginMissingEntryPoint.cc b/src/test/osd/ErasureCodePluginMissingEntryPoint.cc
new file mode 100644
index 00000000000..fc60f866086
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginMissingEntryPoint.cc
@@ -0,0 +1 @@
+// missing int __erasure_code_init(char *plugin_name) {}
diff --git a/src/test/osd/Object.cc b/src/test/osd/Object.cc
index 408cc63ac02..d4be4df1bdd 100644
--- a/src/test/osd/Object.cc
+++ b/src/test/osd/Object.cc
@@ -9,10 +9,11 @@
ostream &operator<<(ostream &out, const ContDesc &rhs)
{
- return out << "ObjNum: " << rhs.objnum
- << " snap: " << rhs.cursnap
- << " seqnum: " << rhs.seqnum
- << " prefix: " << rhs.prefix;
+ return out << "(ObjNum " << rhs.objnum
+ << " snap " << rhs.cursnap
+ << " seq_num " << rhs.seqnum
+ //<< " prefix " << rhs.prefix
+ << ")";
}
void VarLenGenerator::get_ranges(const ContDesc &cont, interval_set<uint64_t> &out) {
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index 3a73ac33faf..a87ecebb4c1 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -403,8 +403,8 @@ public:
void update_object_full(const string &oid, const ObjectDesc &contents)
{
- pool_obj_cont.rbegin()->second.erase(oid);
- pool_obj_cont.rbegin()->second.insert(pair<string,ObjectDesc>(oid, contents));
+ pool_obj_cont[current_snap].erase(oid);
+ pool_obj_cont[current_snap].insert(pair<string,ObjectDesc>(oid, contents));
}
void update_object_version(const string &oid, uint64_t version)
@@ -416,7 +416,7 @@ public:
map<string,ObjectDesc>::iterator j = i->second.find(oid);
if (j != i->second.end()) {
j->second.version = version;
- cout << __func__ << " oid " << oid << " is version " << version << std::endl;
+ cout << __func__ << " oid " << oid << " v " << version << " " << j->second.most_recent() << std::endl;
break;
}
}
@@ -792,26 +792,12 @@ public:
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
- context->seq_num++;
-
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
interval_set<uint64_t> ranges;
context->cont_gen.get_ranges(cont, ranges);
std::cout << num << ": seq_num " << context->seq_num << " ranges " << ranges << std::endl;
+ context->seq_num++;
context->state_lock.Unlock();
- int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
- if (r) {
- cerr << " r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
- assert(0);
- }
-
waiting_on = ranges.num_intervals();
//cout << " waiting_on = " << waiting_on << std::endl;
ContentsGenerator::iterator gen_pos = context->cont_gen.get_iterator(cont);
@@ -922,23 +908,10 @@ public:
context->remove_object(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
interval_set<uint64_t> ranges;
context->state_lock.Unlock();
- int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
- if (r) {
- cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
- assert(0);
- }
-
- r = context->io_ctx.remove(context->prefix+oid);
+ int r = context->io_ctx.remove(context->prefix+oid);
if (r && !(r == -ENOENT && !present)) {
cerr << "r is " << r << " while deleting " << oid << " and present is " << present << std::endl;
assert(0);
@@ -1072,6 +1045,7 @@ public:
<< err << std::endl;
}
} else {
+ cout << num << ": expect " << old_value.most_recent() << std::endl;
assert(!old_value.deleted());
if (old_value.has_contents()) {
ContDesc to_check;
@@ -1081,8 +1055,8 @@ public:
context->errors++;
}
if (to_check != old_value.most_recent()) {
- cerr << num << ": Found incorrect object contents " << to_check
- << ", expected " << old_value.most_recent() << " oid " << oid << std::endl;
+ cerr << num << ": oid " << oid << " found incorrect object contents " << to_check
+ << ", expected " << old_value.most_recent() << std::endl;
context->errors++;
}
if (!old_value.check(result)) {
@@ -1272,17 +1246,8 @@ public:
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
-
TestWatchContext *ctx = context->get_watch_context(oid);
context->state_lock.Unlock();
- assert(!context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset));
int r;
if (!ctx) {
{
@@ -1352,15 +1317,7 @@ public:
context->roll_back(oid, roll_back_to);
uint64_t snap = context->snaps[roll_back_to];
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
context->state_lock.Unlock();
- assert(!context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset));
op.selfmanaged_snap_rollback(snap);
@@ -1403,6 +1360,7 @@ public:
string oid, oid_src;
ObjectDesc src_value;
librados::ObjectWriteOperation op;
+ librados::ObjectReadOperation rd_op;
librados::AioCompletion *comp;
librados::AioCompletion *comp_racing_read;
int snap;
@@ -1440,6 +1398,8 @@ public:
snap = -1;
}
context->find_object(oid_src, &src_value, snap);
+ if (!src_value.deleted())
+ context->update_object_full(oid, src_value);
string src = context->prefix+oid_src;
op.copy_from(src.c_str(), context->io_ctx, src_value.version);
@@ -1457,7 +1417,12 @@ public:
new TestOp::CallbackInfo(1));
comp_racing_read = context->rados.aio_create_completion((void*) read_cb_arg, &write_callback,
NULL);
- context->io_ctx.aio_stat(context->prefix+oid, comp_racing_read, NULL, NULL);
+ rd_op.stat(NULL, NULL, NULL);
+ context->io_ctx.aio_operate(context->prefix+oid, comp_racing_read, &rd_op,
+ librados::SNAP_HEAD,
+ librados::OPERATION_ORDER_READS_WRITES, // order wrt previous write/update
+ NULL);
+
}
void _finish(CallbackInfo *info)
@@ -1473,19 +1438,18 @@ public:
assert(comp->is_complete());
cout << num << ": finishing copy_from to " << context->prefix + oid << std::endl;
if ((r = comp->get_return_value())) {
- if (!(r == -ENOENT && src_value.deleted())) {
+ if (r == -ENOENT && src_value.deleted()) {
+ cout << num << ": got expected ENOENT (src dne)" << std::endl;
+ } else {
cerr << "Error: oid " << oid << " copy_from " << oid_src << " returned error code "
<< r << std::endl;
+ assert(0);
}
} else {
assert(!version || comp->get_version64() == version);
version = comp->get_version64();
- context->update_object_full(oid, src_value);
context->update_object_version(oid, comp->get_version64());
}
- context->oid_in_use.erase(oid_src);
- context->oid_not_in_use.insert(oid_src);
- context->kick();
} else if (info->id == 1) {
// racing read
assert(comp_racing_read->is_complete());
@@ -1500,11 +1464,14 @@ public:
assert(!version || comp_racing_read->get_version64() == version);
version = comp_racing_read->get_version64();
}
+ }
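+ // release both oids only once the copy_from and the racing
+ // read have completed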
+ if (++done == 2) {
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
+ context->oid_in_use.erase(oid_src);
+ context->oid_not_in_use.insert(oid_src);
context->kick();
}
- ++done;
}
bool finished()
diff --git a/src/test/osd/TestErasureCodeExample.cc b/src/test/osd/TestErasureCodeExample.cc
index 66f521d7863..f12e80c8cd0 100644
--- a/src/test/osd/TestErasureCodeExample.cc
+++ b/src/test/osd/TestErasureCodeExample.cc
@@ -20,24 +20,9 @@
#include "global/global_context.h"
#include "gtest/gtest.h"
-TEST(ErasureCodeExample, constructor)
-{
- map<std::string,std::string> parameters;
- {
- ErasureCodeExample example(parameters);
- EXPECT_EQ(0u, example.delay);
- }
- parameters["usleep"] = "10";
- {
- ErasureCodeExample example(parameters);
- EXPECT_EQ(10u, example.delay);
- }
-}
-
TEST(ErasureCodeExample, minimum_to_decode)
{
- map<std::string,std::string> parameters;
- ErasureCodeExample example(parameters);
+ ErasureCodeExample example;
set<int> available_chunks;
set<int> want_to_read;
want_to_read.insert(1);
@@ -65,16 +50,58 @@ TEST(ErasureCodeExample, minimum_to_decode)
EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
available_chunks,
&minimum));
+ EXPECT_EQ(1u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(1));
+ }
+}
+
+TEST(ErasureCodeExample, minimum_to_decode_with_cost)
+{
+ ErasureCodeExample example;
+ map<int,int> available;
+ set<int> want_to_read;
+ want_to_read.insert(1);
+ {
+ set<int> minimum;
+ EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ }
+ available[0] = 1;
+ available[2] = 1;
+ {
+ set<int> minimum;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
EXPECT_EQ(2u, minimum.size());
EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
+ {
+ set<int> minimum;
+ available[1] = 1;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(1u, minimum.size());
EXPECT_EQ(1u, minimum.count(1));
}
+ {
+ set<int> minimum;
+ available[1] = 2;
+ EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
+ available,
+ &minimum));
+ EXPECT_EQ(2u, minimum.size());
+ EXPECT_EQ(1u, minimum.count(0));
+ EXPECT_EQ(1u, minimum.count(2));
+ }
}
TEST(ErasureCodeExample, encode_decode)
{
- map<std::string,std::string> parameters;
- ErasureCodeExample example(parameters);
+ ErasureCodeExample example;
bufferlist in;
in.append("ABCDE");
@@ -142,5 +169,5 @@ int main(int argc, char **argv) {
}
// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
// End:
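The with-cost tests above pin down a simple contract: return the wanted chunks directly when each is available at the lowest cost seen, decode from the k cheapest available chunks otherwise, and fail with -EIO when fewer than k chunks exist. A sketch of one policy that satisfies exactly these assertions — a hypothetical helper, not the example plugin's actual code (k is 2 in ErasureCodeExample):

#include <errno.h>
#include <limits.h>
#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Hypothetical helper choosing the cheapest set of chunks to read;
// 'available' maps chunk id to retrieval cost.
int pick_minimum_with_cost(unsigned k,
                           const std::set<int> &want_to_read,
                           const std::map<int,int> &available,
                           std::set<int> *minimum)
{
  if (want_to_read.empty())
    return 0; // nothing to read, nothing to fetch
  // Direct read: every wanted chunk is available and costs no more
  // than the cheapest available chunk.
  int cheapest = INT_MAX;
  for (std::map<int,int>::const_iterator i = available.begin();
       i != available.end();
       ++i)
    cheapest = std::min(cheapest, i->second);
  bool direct = true;
  for (std::set<int>::const_iterator i = want_to_read.begin();
       i != want_to_read.end();
       ++i) {
    std::map<int,int>::const_iterator a = available.find(*i);
    if (a == available.end() || a->second > cheapest) {
      direct = false;
      break;
    }
  }
  if (direct) {
    minimum->insert(want_to_read.begin(), want_to_read.end());
    return 0;
  }
  // Decode path: need any k chunks, so take the k cheapest.
  if (available.size() < k)
    return -EIO;
  std::vector<std::pair<int,int> > by_cost; // (cost, chunk)
  for (std::map<int,int>::const_iterator i = available.begin();
       i != available.end();
       ++i)
    by_cost.push_back(std::make_pair(i->second, i->first));
  std::sort(by_cost.begin(), by_cost.end());
  for (unsigned n = 0; n < k; ++n)
    minimum->insert(by_cost[n].second);
  return 0;
}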
diff --git a/src/test/osd/TestErasureCodeJerasure.cc b/src/test/osd/TestErasureCodeJerasure.cc
index 266b1735659..a51cb853c86 100644
--- a/src/test/osd/TestErasureCodeJerasure.cc
+++ b/src/test/osd/TestErasureCodeJerasure.cc
@@ -14,6 +14,7 @@
*
*/
+#include <errno.h>
#include "global/global_init.h"
#include "osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h"
#include "common/ceph_argparse.h"
@@ -36,7 +37,8 @@ typedef ::testing::Types<
> JerasureTypes;
TYPED_TEST_CASE(ErasureCodeTest, JerasureTypes);
-TYPED_TEST(ErasureCodeTest, encode_decode) {
+TYPED_TEST(ErasureCodeTest, encode_decode)
+{
TypeParam jerasure;
map<std::string,std::string> parameters;
parameters["erasure-code-k"] = "2";
@@ -45,9 +47,19 @@ TYPED_TEST(ErasureCodeTest, encode_decode) {
parameters["erasure-code-packetsize"] = "8";
jerasure.init(parameters);
+#define LARGE_ENOUGH 2048
+ bufferptr in_ptr(LARGE_ENOUGH);
+ in_ptr.zero();
+ in_ptr.set_length(0);
+ const char *payload =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+ in_ptr.append(payload, strlen(payload));
bufferlist in;
- for (int i = 0; i < 5; i++)
- in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789");
+ in.push_front(in_ptr);
int want_to_encode[] = { 0, 1, 2, 3 };
map<int, bufferlist> encoded;
EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
@@ -56,7 +68,8 @@ TYPED_TEST(ErasureCodeTest, encode_decode) {
EXPECT_EQ(4u, encoded.size());
unsigned length = encoded[0].length();
EXPECT_EQ(0, strncmp(encoded[0].c_str(), in.c_str(), length));
- EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length, in.length() - length));
+ EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length,
+ in.length() - length));
// all chunks are available
@@ -70,7 +83,8 @@ TYPED_TEST(ErasureCodeTest, encode_decode) {
EXPECT_EQ(4u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
- EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length, in.length() - length));
+ EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+ in.length() - length));
}
// two chunks are missing
@@ -88,11 +102,189 @@ TYPED_TEST(ErasureCodeTest, encode_decode) {
EXPECT_EQ(4u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
- EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length, in.length() - length));
+ EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+ in.length() - length));
}
}
-int main(int argc, char **argv) {
+TYPED_TEST(ErasureCodeTest, minimum_to_decode)
+{
+ TypeParam jerasure;
+ map<std::string,std::string> parameters;
+ parameters["erasure-code-k"] = "2";
+ parameters["erasure-code-m"] = "2";
+ parameters["erasure-code-w"] = "7";
+ parameters["erasure-code-packetsize"] = "8";
+ jerasure.init(parameters);
+
+ //
+ // If trying to read nothing, the minimum is empty.
+ //
+ {
+ set<int> want_to_read;
+ set<int> available_chunks;
+ set<int> minimum;
+
+ EXPECT_EQ(0, jerasure.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ EXPECT_TRUE(minimum.empty());
+ }
+ //
+ // There is no way to read a chunk if none are available.
+ //
+ {
+ set<int> want_to_read;
+ set<int> available_chunks;
+ set<int> minimum;
+
+ want_to_read.insert(0);
+
+ EXPECT_EQ(-EIO, jerasure.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ }
+ //
+ // Reading a subset of the available chunks is always possible.
+ //
+ {
+ set<int> want_to_read;
+ set<int> available_chunks;
+ set<int> minimum;
+
+ want_to_read.insert(0);
+ available_chunks.insert(0);
+
+ EXPECT_EQ(0, jerasure.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ EXPECT_EQ(want_to_read, minimum);
+ }
+ //
+ // There is no way to read a missing chunk if fewer than k chunks
+ // are available.
+ //
+ {
+ set<int> want_to_read;
+ set<int> available_chunks;
+ set<int> minimum;
+
+ want_to_read.insert(0);
+ want_to_read.insert(1);
+ available_chunks.insert(0);
+
+ EXPECT_EQ(-EIO, jerasure.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ }
+ //
+ // When some chunks are unavailable, the minimum can be made of any
+ // k of the available chunks. For instance, to read 1 and 3 below, the
+ // minimum could be 2 and 3, which may seem better because it contains
+ // one of the chunks to be read. But it is no more efficient than
+ // retrieving 0 and 2 instead because, in both cases, the decode
+ // function must run the same recovery operation and use the same
+ // amount of CPU and memory.
+ //
+ {
+ set<int> want_to_read;
+ set<int> available_chunks;
+ set<int> minimum;
+
+ want_to_read.insert(1);
+ want_to_read.insert(3);
+ available_chunks.insert(0);
+ available_chunks.insert(2);
+ available_chunks.insert(3);
+
+ EXPECT_EQ(0, jerasure.minimum_to_decode(want_to_read,
+ available_chunks,
+ &minimum));
+ EXPECT_EQ(2u, minimum.size());
+ EXPECT_EQ(0u, minimum.count(3));
+ }
+}
+
+TEST(ErasureCodeTest, encode)
+{
+ ErasureCodeJerasureReedSolomonVandermonde jerasure;
+ map<std::string,std::string> parameters;
+ parameters["erasure-code-k"] = "2";
+ parameters["erasure-code-m"] = "2";
+ parameters["erasure-code-w"] = "8";
+ jerasure.init(parameters);
+
+ unsigned alignment = jerasure.get_alignment();
+ {
+ //
+ // When the input bufferlist is perfectly aligned, the returned
+ // encoded chunks point into it unmodified (zero copy).
+ //
+ bufferlist in;
+ map<int,bufferlist> encoded;
+ int want_to_encode[] = { 0, 1, 2, 3 };
+ in.append(string(alignment * 2, 'X'));
+ EXPECT_EQ(alignment * 2, in.length());
+ EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
+ in,
+ &encoded));
+ EXPECT_EQ(4u, encoded.size());
+ for(int i = 0; i < 4; i++)
+ EXPECT_EQ(alignment, encoded[i].length());
+ EXPECT_EQ(in.c_str(), encoded[0].c_str());
+ EXPECT_EQ(in.c_str() + alignment, encoded[1].c_str());
+ }
+
+ {
+ //
+ // When the input bufferlist is not properly aligned, it is padded
+ // with zeros. The returned encoded chunks still point to the
+ // beginning of the input bufferlist unmodified; only the trailing,
+ // padded chunk is newly allocated and copied.
+ //
+ bufferlist in;
+ map<int,bufferlist> encoded;
+ int want_to_encode[] = { 0, 1, 2, 3 };
+ int trail_length = 10;
+ in.append(string(alignment + trail_length, 'X'));
+ EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
+ in,
+ &encoded));
+ EXPECT_EQ(4u, encoded.size());
+ for(int i = 0; i < 4; i++)
+ EXPECT_EQ(alignment, encoded[i].length());
+ EXPECT_EQ(in.c_str(), encoded[0].c_str());
+ EXPECT_NE(in.c_str() + alignment, encoded[1].c_str());
+ char *last_chunk = encoded[1].c_str();
+ EXPECT_EQ('X', last_chunk[0]);
+ EXPECT_EQ('\0', last_chunk[trail_length]);
+ }
+
+ {
+ //
+ // When only the first chunk is required, the encoded map contains
+ // only the first chunk. Although the jerasure encode internally
+ // allocates a buffer because of the padding requirements and also
+ // computes the coding chunks, both are released before the method
+ // returns, as running the tests through valgrind confirms: no leak
+ // is reported.
+ //
+ bufferlist in;
+ map<int,bufferlist> encoded;
+ set<int> want_to_encode;
+ want_to_encode.insert(0);
+ int trail_length = 10;
+ in.append(string(alignment + trail_length, 'X'));
+ EXPECT_EQ(0, jerasure.encode(want_to_encode, in, &encoded));
+ EXPECT_EQ(1u, encoded.size());
+ EXPECT_EQ(alignment, encoded[0].length());
+ EXPECT_EQ(in.c_str(), encoded[0].c_str());
+ }
+}
+
+int main(int argc, char **argv)
+{
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
@@ -103,6 +295,12 @@ int main(int argc, char **argv) {
return RUN_ALL_TESTS();
}
-// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_jerasure && valgrind --tool=memcheck ./unittest_erasure_code_jerasure --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ * make unittest_erasure_code_jerasure &&
+ * valgrind --tool=memcheck --leak-check=full \
+ * ./unittest_erasure_code_jerasure \
+ * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
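The encode tests above depend on two properties: every chunk is get_alignment() bytes, and input that is not a multiple of the stripe is zero-padded rather than rejected — which is also why encode_decode() pre-allocates a LARGE_ENOUGH bufferptr, zeroes it, and only then appends the payload, so padding needs no reallocation. A small sketch of the padding arithmetic under those assumptions (hypothetical helpers; the plugin does the equivalent internally):

#include <string.h>

// Hypothetical helpers showing the arithmetic: the input is rounded up
// to a whole stripe of k chunks, each get_alignment() bytes, and the
// tail is zero-filled so the coding chunks are deterministic.
size_t padded_size(size_t in_len, size_t alignment, size_t k)
{
  size_t stripe = k * alignment;                    // bytes per stripe
  return ((in_len + stripe - 1) / stripe) * stripe; // round up
}

void pad_in_place(char *buf, size_t in_len, size_t padded)
{
  memset(buf + in_len, 0, padded - in_len);
}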
diff --git a/src/test/osd/TestErasureCodePlugin.cc b/src/test/osd/TestErasureCodePlugin.cc
new file mode 100644
index 00000000000..46ed4b1730d
--- /dev/null
+++ b/src/test/osd/TestErasureCodePlugin.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include "common/Thread.h"
+#include "global/global_init.h"
+#include "osd/ErasureCodePlugin.h"
+#include "common/ceph_argparse.h"
+#include "global/global_context.h"
+#include "gtest/gtest.h"
+
+class ErasureCodePluginRegistryTest : public ::testing::Test {
+protected:
+
+ class Thread_factory : public Thread {
+ public:
+ virtual void *entry() {
+ map<std::string,std::string> parameters;
+ parameters["erasure-code-directory"] = ".libs";
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+ ErasureCodeInterfaceRef erasure_code;
+ instance.factory("hangs", parameters, &erasure_code);
+ return NULL;
+ }
+ };
+
+};
+
+TEST_F(ErasureCodePluginRegistryTest, factory_mutex) {
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+
+ EXPECT_TRUE(instance.lock.TryLock());
+ instance.lock.Unlock();
+
+ //
+ // Test that the loading of a plugin is protected by a mutex.
+ //
+ useconds_t delay = 0;
+ const useconds_t DELAY_MAX = 20 * 1000 * 1000;
+ Thread_factory sleep_forever;
+ sleep_forever.create();
+ do {
+ cout << "Trying (1) with delay " << delay << "us\n";
+ if (delay > 0)
+ usleep(delay);
+ if (!instance.loading)
+ delay = (delay + 1) * 2;
+ } while (!instance.loading && delay < DELAY_MAX);
+ ASSERT_TRUE(delay < DELAY_MAX);
+
+ EXPECT_FALSE(instance.lock.TryLock());
+
+ EXPECT_EQ(0, pthread_cancel(sleep_forever.get_thread_id()));
+ EXPECT_EQ(0, sleep_forever.join());
+}
+
+TEST_F(ErasureCodePluginRegistryTest, all)
+{
+ map<std::string,std::string> parameters;
+ parameters["erasure-code-directory"] = ".libs";
+ ErasureCodeInterfaceRef erasure_code;
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+ EXPECT_FALSE(erasure_code);
+ EXPECT_EQ(-EIO, instance.factory("invalid", parameters, &erasure_code));
+ EXPECT_FALSE(erasure_code);
+ EXPECT_EQ(-ENOENT, instance.factory("missing_entry_point", parameters,
+ &erasure_code));
+ EXPECT_FALSE(erasure_code);
+ EXPECT_EQ(-ESRCH, instance.factory("fail_to_initialize", parameters,
+ &erasure_code));
+ EXPECT_FALSE(erasure_code);
+ EXPECT_EQ(-EBADF, instance.factory("fail_to_register", parameters,
+ &erasure_code));
+ EXPECT_FALSE(erasure_code);
+ EXPECT_EQ(0, instance.factory("example", parameters, &erasure_code));
+ EXPECT_TRUE(erasure_code);
+ ErasureCodePlugin *plugin = 0;
+ EXPECT_EQ(-EEXIST, instance.load("example", parameters, &plugin));
+}
+
+int main(int argc, char **argv) {
+ vector<const char*> args;
+ argv_to_vec(argc, (const char **)argv, args);
+
+ global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+// Local Variables:
+// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_plugin && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_plugin --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+// End:
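factory_mutex above probes the registry from a second thread and polls the loading flag to verify that a plugin load holds the registry lock for the whole dlopen, so a concurrent TryLock() must fail. A minimal sketch of the guarded-factory shape being tested — an illustration, not the registry's actual code:

#include <pthread.h>

// Sketch of the locking discipline under test: 'loading' is only ever
// true while 'lock' is held, so TryLock() from another thread must
// fail for the whole duration of a plugin load.
struct RegistrySketch {
  pthread_mutex_t lock;
  bool loading;

  RegistrySketch() : loading(false) {
    pthread_mutex_init(&lock, NULL);
  }
  int load_plugin() { return 0; } // stand-in for the real dlopen() path
  int factory() {
    pthread_mutex_lock(&lock);    // serialize: one load at a time
    loading = true;
    int r = load_plugin();        // may block for a long time ("hangs")
    loading = false;
    pthread_mutex_unlock(&lock);
    return r;
  }
};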
diff --git a/src/test/osd/TestErasureCodePluginExample.cc b/src/test/osd/TestErasureCodePluginExample.cc
deleted file mode 100644
index 67b41f2011a..00000000000
--- a/src/test/osd/TestErasureCodePluginExample.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
- *
- * Author: Loic Dachary <loic@dachary.org>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- */
-
-#include <errno.h>
-#include "common/Thread.h"
-#include "global/global_init.h"
-#include "osd/ErasureCodePlugin.h"
-#include "common/ceph_argparse.h"
-#include "global/global_context.h"
-#include "gtest/gtest.h"
-
-TEST(ErasureCodePluginRegistry, factory)
-{
- map<std::string,std::string> parameters;
- parameters["erasure-code-directory"] = ".libs";
- ErasureCodeInterfaceRef erasure_code;
- ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
- EXPECT_FALSE(erasure_code);
- EXPECT_EQ(0, instance.factory("example", parameters, &erasure_code));
- EXPECT_TRUE(erasure_code);
- ErasureCodePlugin *plugin = 0;
- EXPECT_EQ(-EEXIST, instance.load("example", parameters, &plugin));
-}
-
-int main(int argc, char **argv) {
- vector<const char*> args;
- argv_to_vec(argc, (const char **)argv, args);
-
- global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
- common_init_finish(g_ceph_context);
-
- ::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
-
-// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_plugin && ./unittest_erasure_code_plugin --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
-// End:
diff --git a/src/test/osd/TestErasureCodePluginJerasure.cc b/src/test/osd/TestErasureCodePluginJerasure.cc
index fe819c71a39..2f558937595 100644
--- a/src/test/osd/TestErasureCodePluginJerasure.cc
+++ b/src/test/osd/TestErasureCodePluginJerasure.cc
@@ -51,7 +51,8 @@ TEST(ErasureCodePlugin, factory)
}
}
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
@@ -62,6 +63,12 @@ int main(int argc, char **argv) {
return RUN_ALL_TESTS();
}
-// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_jerasure_plugin && valgrind --tool=memcheck ./unittest_erasure_code_jerasure_plugin --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ * make unittest_erasure_code_plugin_jerasure &&
+ * valgrind --tool=memcheck ./unittest_erasure_code_plugin_jerasure \
+ * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
+
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index 1b6bd073a12..be919161579 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -155,7 +155,9 @@ private:
case TEST_OP_COPY_FROM:
oid = *(rand_choose(context.oid_not_in_use));
- oid2 = *(rand_choose(context.oid_not_in_use));
+ do {
+ oid2 = *(rand_choose(context.oid_not_in_use));
+ } while (oid == oid2);
cout << "copy_from oid " << oid << " from oid " << oid2
<< " current snap is " << context.current_snap << std::endl;
return new CopyFromOp(m_op, &context, oid, oid2, m_stats);
@@ -282,8 +284,8 @@ int main(int argc, char **argv)
return 1;
}
- if (max_in_flight > objects) {
- cerr << "Error: max_in_flight must be less than the number of objects"
+ if (max_in_flight * 2 > objects) {
+ cerr << "Error: max_in_flight must be <= than the number of objects / 2"
<< std::endl;
return 1;
}
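The two changes above are linked: copy_from now draws two distinct names from oid_not_in_use, so one in-flight operation can hold up to two objects; with max_in_flight * 2 <= objects the free set always keeps at least two entries and the rejection loop is guaranteed to terminate. A tiny sketch of that picker (hypothetical, mirroring rand_choose):

#include <assert.h>
#include <cstdlib>
#include <iterator>
#include <set>
#include <string>

// Hypothetical picker: each of the other in-flight ops holds at most
// two oids, so objects >= 2 * max_in_flight keeps the free set at two
// or more entries and the do/while always finds a distinct second oid.
std::string pick_one(const std::set<std::string> &s)
{
  std::set<std::string>::const_iterator i = s.begin();
  std::advance(i, rand() % s.size());
  return *i;
}

void pick_two_distinct(const std::set<std::string> &not_in_use,
                       std::string *oid, std::string *oid2)
{
  assert(not_in_use.size() >= 2);
  *oid = pick_one(not_in_use);
  do {
    *oid2 = pick_one(not_in_use);
  } while (*oid == *oid2);
}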
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
new file mode 100755
index 00000000000..34bcf698e5a
--- /dev/null
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -0,0 +1,1061 @@
+#!/usr/bin/nosetests --nocapture
+# -*- mode:python; tab-width:4; indent-tabs-mode:t -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+
+from nose.tools import eq_ as eq
+from nose.tools import *
+
+from ceph_argparse import validate_command, parse_json_funcsigs
+
+import os
+import re
+import json
+
+def get_command_descriptions(what):
+ buffer = os.popen("./get_command_descriptions " + "--" + what
+ + " 2>&1 | grep cmd000").read()
+ return re.sub(r'^.*?(\{.*\})', r'\g<1>', buffer)
+
+def test_parse_json_funcsigs():
+ commands = get_command_descriptions("all")
+ cmd_json = parse_json_funcsigs(commands, 'cli')
+
+ # syntax error https://github.com/ceph/ceph/pull/585
+ commands = get_command_descriptions("pull585")
+ assert_raises(TypeError, parse_json_funcsigs, commands, 'cli')
+
+sigdict = parse_json_funcsigs(get_command_descriptions("all"), 'cli')
+
+
+class TestArgparse:
+
+ def assert_valid_command(self, args):
+ result = validate_command(sigdict, args)
+ assert_not_in(result, [None, {}])
+
+ def check_1_natural_arg(self, prefix, command):
+ self.assert_valid_command([prefix, command, '1'])
+ assert_equal({}, validate_command(sigdict, [prefix, command]))
+ assert_equal({}, validate_command(sigdict, [prefix, command, '-1']))
+ assert_equal({}, validate_command(sigdict, [prefix, command, '1',
+ '1']))
+
+ def check_0_or_1_natural_arg(self, prefix, command):
+ self.assert_valid_command([prefix, command, '1'])
+ self.assert_valid_command([prefix, command])
+ assert_equal({}, validate_command(sigdict, [prefix, command, '-1']))
+ assert_equal({}, validate_command(sigdict, [prefix, command, '1',
+ '1']))
+
+ def check_1_string_arg(self, prefix, command):
+ assert_equal({}, validate_command(sigdict, [prefix, command]))
+ self.assert_valid_command([prefix, command, 'string'])
+ assert_equal({}, validate_command(sigdict, [prefix,
+ command,
+ 'string',
+ 'toomany']))
+
+ def check_1_or_more_string_args(self, prefix, command):
+ assert_equal({}, validate_command(sigdict, [prefix,
+ command]))
+ self.assert_valid_command([prefix,
+ command,
+ 'string'])
+ self.assert_valid_command([prefix,
+ command,
+ 'string',
+ 'more string'])
+
+ def check_no_arg(self, prefix, command):
+ self.assert_valid_command([prefix,
+ command])
+ assert_equal({}, validate_command(sigdict, [prefix,
+ command,
+ 'toomany']))
+
+
+class TestPG(TestArgparse):
+
+ def test_stat(self):
+ self.assert_valid_command(['pg', 'stat'])
+
+ def test_getmap(self):
+ self.assert_valid_command(['pg', 'getmap'])
+
+ def test_send_pg_creates(self):
+ self.assert_valid_command(['pg', 'send_pg_creates'])
+
+ def test_dump(self):
+ self.assert_valid_command(['pg', 'dump'])
+ self.assert_valid_command(['pg', 'dump',
+ 'all',
+ 'summary',
+ 'sum',
+ 'delta',
+ 'pools',
+ 'osds',
+ 'pgs',
+ 'pgs_brief'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'dump', 'invalid']))
+
+ def test_dump_json(self):
+ self.assert_valid_command(['pg', 'dump_json'])
+ self.assert_valid_command(['pg', 'dump_json',
+ 'all',
+ 'summary',
+ 'sum',
+ 'pools',
+ 'osds',
+ 'pgs'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'dump_json',
+ 'invalid']))
+
+ def test_dump_pools_json(self):
+ self.assert_valid_command(['pg', 'dump_pools_json'])
+
+ def test_dump_pools_stuck(self):
+ self.assert_valid_command(['pg', 'dump_stuck'])
+ self.assert_valid_command(['pg', 'dump_stuck',
+ 'inactive',
+ 'unclean',
+ 'stale'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'dump_stuck',
+ 'invalid']))
+ self.assert_valid_command(['pg', 'dump_stuck',
+ 'inactive',
+ '1234'])
+
+ def one_pgid(self, command):
+ self.assert_valid_command(['pg', command, '1.1'])
+ assert_equal({}, validate_command(sigdict, ['pg', command]))
+ assert_equal({}, validate_command(sigdict, ['pg', command, '1']))
+
+ def test_map(self):
+ self.one_pgid('map')
+
+ def test_scrub(self):
+ self.one_pgid('scrub')
+
+ def test_deep_scrub(self):
+ self.one_pgid('deep-scrub')
+
+ def test_repair(self):
+ self.one_pgid('repair')
+
+ def test_debug(self):
+ self.assert_valid_command(['pg',
+ 'debug',
+ 'unfound_objects_exist'])
+ self.assert_valid_command(['pg',
+ 'debug',
+ 'degraded_pgs_exist'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'debug']))
+ assert_equal({}, validate_command(sigdict, ['pg', 'debug',
+ 'invalid']))
+
+ def test_force_create_pg(self):
+ self.one_pgid('force_create_pg')
+
+ def set_ratio(self, command):
+ self.assert_valid_command(['pg',
+ command,
+ '0.0'])
+ assert_equal({}, validate_command(sigdict, ['pg', command]))
+ assert_equal({}, validate_command(sigdict, ['pg',
+ command,
+ '2.0']))
+
+ def test_set_full_ratio(self):
+ self.set_ratio('set_full_ratio')
+
+ def test_set_nearfull_ratio(self):
+ self.set_ratio('set_nearfull_ratio')
+
+
+class TestAuth(TestArgparse):
+
+ def test_export(self):
+ self.assert_valid_command(['auth', 'export'])
+ self.assert_valid_command(['auth',
+ 'export',
+ 'string'])
+ assert_equal({}, validate_command(sigdict, ['auth',
+ 'export',
+ 'string',
+ 'toomany']))
+
+ def test_get(self):
+ self.check_1_string_arg('auth', 'get')
+
+ def test_get_key(self):
+ self.check_1_string_arg('auth', 'get-key')
+
+ def test_print_key(self):
+ self.check_1_string_arg('auth', 'print-key')
+ self.check_1_string_arg('auth', 'print_key')
+
+ def test_list(self):
+ self.check_no_arg('auth', 'list')
+
+ def test_import(self):
+ self.check_no_arg('auth', 'import')
+
+ def test_add(self):
+ self.check_1_or_more_string_args('auth', 'add')
+
+ def test_get_or_create_key(self):
+ self.check_1_or_more_string_args('auth', 'get-or-create-key')
+
+ def test_get_or_create(self):
+ self.check_1_or_more_string_args('auth', 'get-or-create')
+
+ def test_caps(self):
+ assert_equal({}, validate_command(sigdict, ['auth',
+ 'caps']))
+ assert_equal({}, validate_command(sigdict, ['auth',
+ 'caps',
+ 'string']))
+ self.assert_valid_command(['auth',
+ 'caps',
+ 'string',
+ 'more string'])
+
+ def test_del(self):
+ self.check_1_string_arg('auth', 'del')
+
+
+class TestMonitor(TestArgparse):
+
+ def test_compact(self):
+ self.assert_valid_command(['compact'])
+
+ def test_scrub(self):
+ self.assert_valid_command(['scrub'])
+
+ def test_fsid(self):
+ self.assert_valid_command(['fsid'])
+
+ def test_log(self):
+ assert_equal({}, validate_command(sigdict, ['log']))
+ self.assert_valid_command(['log', 'a logtext'])
+ self.assert_valid_command(['log', 'a logtext', 'and another'])
+
+ def test_injectargs(self):
+ assert_equal({}, validate_command(sigdict, ['injectargs']))
+ self.assert_valid_command(['injectargs', 'one'])
+ self.assert_valid_command(['injectargs', 'one', 'two'])
+
+ def test_status(self):
+ self.assert_valid_command(['status'])
+
+ def test_health(self):
+ self.assert_valid_command(['health'])
+ self.assert_valid_command(['health', 'detail'])
+ assert_equal({}, validate_command(sigdict, ['health', 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['health', 'detail',
+ 'toomany']))
+
+ def test_df(self):
+ self.assert_valid_command(['df'])
+ self.assert_valid_command(['df', 'detail'])
+ assert_equal({}, validate_command(sigdict, ['df', 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['df', 'detail',
+ 'toomany']))
+
+ def test_report(self):
+ self.assert_valid_command(['report'])
+ self.assert_valid_command(['report', 'tag1'])
+ self.assert_valid_command(['report', 'tag1', 'tag2'])
+
+ def test_quorum_status(self):
+ self.assert_valid_command(['quorum_status'])
+
+ def test_mon_status(self):
+ self.assert_valid_command(['mon_status'])
+
+ def test_sync_force(self):
+ self.assert_valid_command(['sync',
+ 'force',
+ '--yes-i-really-mean-it',
+ '--i-know-what-i-am-doing'])
+ self.assert_valid_command(['sync',
+ 'force',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['sync',
+ 'force'])
+ assert_equal({}, validate_command(sigdict, ['sync']))
+ assert_equal({}, validate_command(sigdict, ['sync',
+ 'force',
+ '--yes-i-really-mean-it',
+ '--i-know-what-i-am-doing',
+ 'toomany']))
+
+ def test_heap(self):
+ assert_equal({}, validate_command(sigdict, ['heap']))
+ assert_equal({}, validate_command(sigdict, ['heap', 'invalid']))
+ self.assert_valid_command(['heap', 'dump'])
+ self.assert_valid_command(['heap', 'start_profiler'])
+ self.assert_valid_command(['heap', 'stop_profiler'])
+ self.assert_valid_command(['heap', 'release'])
+ self.assert_valid_command(['heap', 'stats'])
+
+ def test_quorum(self):
+ assert_equal({}, validate_command(sigdict, ['quorum']))
+ assert_equal({}, validate_command(sigdict, ['quorum', 'invalid']))
+ self.assert_valid_command(['quorum', 'enter'])
+ self.assert_valid_command(['quorum', 'exit'])
+ assert_equal({}, validate_command(sigdict, ['quorum',
+ 'enter',
+ 'toomany']))
+
+ def test_tell(self):
+ assert_equal({}, validate_command(sigdict, ['tell']))
+ assert_equal({}, validate_command(sigdict, ['tell', 'invalid']))
+ for name in ('osd', 'mon', 'client', 'mds'):
+ assert_equal({}, validate_command(sigdict, ['tell', name]))
+ assert_equal({}, validate_command(sigdict, ['tell',
+ name + ".42"]))
+ self.assert_valid_command(['tell', name + ".42", 'something'])
+ self.assert_valid_command(['tell', name + ".42",
+ 'something',
+ 'something else'])
+
+
+class TestMDS(TestArgparse):
+
+ def test_stat(self):
+ self.check_no_arg('mds', 'stat')
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('mds', 'dump')
+
+ def test_tell(self):
+ self.assert_valid_command(['mds', 'tell',
+ 'someone',
+ 'something'])
+ self.assert_valid_command(['mds', 'tell',
+ 'someone',
+ 'something',
+ 'something else'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'tell']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'tell',
+ 'someone']))
+
+ def test_compat_show(self):
+ self.assert_valid_command(['mds', 'compat', 'show'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'compat']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'compat',
+ 'show', 'toomany']))
+
+ def test_stop(self):
+ self.assert_valid_command(['mds', 'stop', 'someone'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'stop']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'stop',
+ 'someone', 'toomany']))
+
+ def test_deactivate(self):
+ self.assert_valid_command(['mds', 'deactivate', 'someone'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'deactivate']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'deactivate',
+ 'someone', 'toomany']))
+
+ def test_set_max_mds(self):
+ self.check_1_natural_arg('mds', 'set_max_mds')
+
+ def test_setmap(self):
+ self.check_1_natural_arg('mds', 'setmap')
+
+ def test_set_state(self):
+ self.assert_valid_command(['mds', 'set_state', '1', '2'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state',
+ '1', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state',
+ '1', '21']))
+
+ def test_fail(self):
+ self.check_1_string_arg('mds', 'fail')
+
+ def test_rm(self):
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm', '1']))
+ for name in ('osd', 'mon', 'client', 'mds'):
+ self.assert_valid_command(['mds', 'rm', '1', name + '.42'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '-1', name + '.42']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '-1', name]))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '1', name + '.42',
+ 'toomany']))
+
+ def test_rmfailed(self):
+ self.check_1_natural_arg('mds', 'rmfailed')
+
+ def test_cluster_down(self):
+ self.check_no_arg('mds', 'cluster_down')
+
+ def test_cluster_up(self):
+ self.check_no_arg('mds', 'cluster_up')
+
+ def test_compat_rm_compat(self):
+ self.assert_valid_command(['mds', 'compat', 'rm_compat', '1'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat', '1', '1']))
+
+ def test_incompat_rm_incompat(self):
+ self.assert_valid_command(['mds', 'compat', 'rm_incompat', '1'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat', '1', '1']))
+
+ def test_mds_set(self):
+ self.assert_valid_command(['mds', 'set', 'allow_new_snaps'])
+ self.assert_valid_command(['mds', 'set', 'allow_new_snaps', 'sure'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'set',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'set',
+ 'allow_new_snaps',
+ 'sure',
+ 'toomany']))
+
+ def test_mds_unset(self):
+ self.assert_valid_command(['mds', 'unset', 'allow_new_snaps'])
+ self.assert_valid_command(['mds', 'unset', 'allow_new_snaps', 'sure'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'unset',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'unset',
+ 'allow_new_snaps',
+ 'sure',
+ 'toomany']))
+
+ def test_add_data_pool(self):
+ self.check_1_natural_arg('mds', 'add_data_pool')
+
+ def test_remove_data_pool(self):
+ self.check_1_natural_arg('mds', 'remove_data_pool')
+
+ def test_newfs(self):
+ self.assert_valid_command(['mds', 'newfs', '1', '2',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['mds', 'newfs', '1', '2'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'newfs']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'newfs', '1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '1',
+ '2',
+ '--yes-i-really-mean-it',
+ 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '-1',
+ '2',
+ '--yes-i-really-mean-it']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '1',
+ '-1',
+ '--yes-i-really-mean-it']))
+
+
+class TestMon(TestArgparse):
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('mon', 'dump')
+
+ def test_stat(self):
+ self.check_no_arg('mon', 'stat')
+
+ def test_getmap(self):
+ self.check_0_or_1_natural_arg('mon', 'getmap')
+
+ def test_add(self):
+ self.assert_valid_command(['mon', 'add', 'name', '1.2.3.4:1234'])
+ assert_equal({}, validate_command(sigdict, ['mon', 'add']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add', 'name']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add',
+ 'name',
+ '400.500.600.700']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add', 'name',
+ '1.2.3.4:1234',
+ 'toomany']))
+
+ def test_remove(self):
+ self.assert_valid_command(['mon', 'remove', 'name'])
+ assert_equal({}, validate_command(sigdict, ['mon', 'remove']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'remove',
+ 'name', 'toomany']))
+
+
+class TestOSD(TestArgparse):
+
+ def test_stat(self):
+ self.check_no_arg('osd', 'stat')
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('osd', 'dump')
+
+ def test_osd_tree(self):
+ self.check_0_or_1_natural_arg('osd', 'tree')
+
+ def test_osd_ls(self):
+ self.check_0_or_1_natural_arg('osd', 'ls')
+
+ def test_osd_getmap(self):
+ self.check_0_or_1_natural_arg('osd', 'getmap')
+
+ def test_osd_getcrushmap(self):
+ self.check_0_or_1_natural_arg('osd', 'getcrushmap')
+
+ def test_perf(self):
+ self.check_no_arg('osd', 'perf')
+
+ def test_getmaxosd(self):
+ self.check_no_arg('osd', 'getmaxosd')
+
+ def test_find(self):
+ self.check_1_natural_arg('osd', 'find')
+
+ def test_map(self):
+ self.assert_valid_command(['osd', 'map', 'poolname', 'objectname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'map']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'map', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'map',
+ 'poolname', 'objectname',
+ 'toomany']))
+
+ def test_scrub(self):
+ self.check_1_string_arg('osd', 'scrub')
+
+ def test_deep_scrub(self):
+ self.check_1_string_arg('osd', 'deep-scrub')
+
+ def test_repair(self):
+ self.check_1_string_arg('osd', 'repair')
+
+ def test_lspools(self):
+ self.assert_valid_command(['osd', 'lspools'])
+ self.assert_valid_command(['osd', 'lspools', '1'])
+ self.assert_valid_command(['osd', 'lspools', '-1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'lspools',
+ '1', 'toomany']))
+
+ def test_blacklist_ls(self):
+ self.assert_valid_command(['osd', 'blacklist', 'ls'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ 'ls', 'toomany']))
+
+ def test_crush_rule(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush', 'rule']))
+ for subcommand in ('list', 'ls', 'dump'):
+ self.assert_valid_command(['osd', 'crush', 'rule', subcommand])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', subcommand,
+ 'toomany']))
+
+ def test_crush_dump(self):
+ self.assert_valid_command(['osd', 'crush', 'dump'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'dump', 'toomany']))
+
+ def test_setcrushmap(self):
+ self.check_no_arg('osd', 'setcrushmap')
+
+ def test_crush_add_bucket(self):
+ self.assert_valid_command(['osd', 'crush', 'add-bucket',
+ 'name', 'type'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket', 'name',
+ 'type',
+ 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket', '!!!',
+ 'type']))
+
+ def check_crush_setter(self, setter):
+ self.assert_valid_command(['osd', 'crush', setter,
+ '*', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ 'osd.0', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ '0', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ '0', '2.3', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0']))
+ assert_in(validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0',
+ '-1.0']),
+ [None, {}])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0',
+ '1.0',
+ '!!!']))
+
+ def test_crush_set(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('set')
+
+ def test_crush_add(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('add')
+
+ def test_crush_create_or_move(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('create-or-move')
+
+ def test_crush_move(self):
+ self.assert_valid_command(['osd', 'crush', 'move',
+ 'AZaz09-_.', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', 'move',
+ '0', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', 'AZaz09-_.']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', '!!!',
+ 'AZaz09-_.=']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', 'AZaz09-_.',
+ '!!!']))
+
+ def test_crush_link(self):
+ self.assert_valid_command(['osd', 'crush', 'link',
+ 'name', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', 'link',
+ 'name', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'link']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'link',
+ 'name']))
+
+ def test_crush_rm(self):
+ for alias in ('rm', 'remove', 'unlink'):
+ self.assert_valid_command(['osd', 'crush', alias, 'AZaz09-_.'])
+ self.assert_valid_command(['osd', 'crush', alias,
+ 'AZaz09-_.', 'AZaz09-_.'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ alias]))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ alias,
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_crush_reweight(self):
+ self.assert_valid_command(['osd', 'crush', 'reweight',
+ 'AZaz09-_.', '2.3'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ 'AZaz09-_.']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ 'AZaz09-_.',
+ '-1.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ '!!!',
+ '2.3']))
+
+ def test_crush_tunables(self):
+ for tunable in ('legacy', 'argonaut', 'bobtail', 'optimal', 'default'):
+ self.assert_valid_command(['osd', 'crush', 'tunables',
+ tunable])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'tunables']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'default', 'toomany']))
+
+ def test_crush_rule_create_simple(self):
+ self.assert_valid_command(['osd', 'crush', 'rule', 'create-simple',
+ 'AZaz09-_.', 'AZaz09-_.', 'AZaz09-_.'])
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ '!!!',
+ 'AZaz09-_.',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ '|||',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ '+++']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_crush_rule_rm(self):
+ self.assert_valid_command(['osd', 'crush', 'rule', 'rm', 'AZaz09-_.'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm',
+ '!!!!']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_setmaxosd(self):
+ self.check_1_natural_arg('osd', 'setmaxosd')
+
+ def test_pause(self):
+ self.check_no_arg('osd', 'pause')
+
+ def test_unpause(self):
+ self.check_no_arg('osd', 'unpause')
+
+ def test_set_unset(self):
+ for action in ('set', 'unset'):
+ for flag in ('pause', 'noup', 'nodown', 'noout', 'noin',
+ 'nobackfill', 'norecover', 'noscrub', 'nodeep-scrub'):
+ self.assert_valid_command(['osd', action, flag])
+ assert_equal({}, validate_command(sigdict, ['osd', action]))
+ assert_equal({}, validate_command(sigdict, ['osd', action,
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', action,
+ 'pause', 'toomany']))
+
+ def test_cluster_snap(self):
+ assert_equal(None, validate_command(sigdict, ['osd', 'cluster_snap']))
+
+ def test_down(self):
+ self.check_1_or_more_string_args('osd', 'down')
+
+ def test_out(self):
+ self.check_1_or_more_string_args('osd', 'out')
+
+ def test_in(self):
+ self.check_1_or_more_string_args('osd', 'in')
+
+ def test_rm(self):
+ self.check_1_or_more_string_args('osd', 'rm')
+
+ def test_reweight(self):
+ self.assert_valid_command(['osd', 'reweight', '1', '0.1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1', '2.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '-1', '0.1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1', '0.1',
+ 'toomany']))
+
+ def test_lost(self):
+ self.assert_valid_command(['osd', 'lost', '1',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['osd', 'lost', '1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '1',
+ 'what?']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '-1',
+ '--yes-i-really-mean-it']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '1',
+ '--yes-i-really-mean-it',
+ 'toomany']))
+
+ def test_create(self):
+ uuid = '12345678123456781234567812345678'
+ self.assert_valid_command(['osd', 'create'])
+ self.assert_valid_command(['osd', 'create',
+ uuid])
+ assert_equal({}, validate_command(sigdict, ['osd', 'create',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'create',
+ uuid,
+ 'toomany']))
+
+ def test_blacklist(self):
+ for action in ('add', 'rm'):
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4/567'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4/567', '600.40'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4', '600.40'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ 'invalid',
+ '600.40']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ '1.2.3.4/567',
+ '-1.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ '1.2.3.4/567',
+ '600.40',
+ 'toomany']))
+
+ def test_pool_mksnap(self):
+ self.assert_valid_command(['osd', 'pool', 'mksnap',
+ 'poolname', 'snapname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap',
+ 'poolname', 'snapname',
+ 'toomany']))
+
+ def test_pool_rmsnap(self):
+ self.assert_valid_command(['osd', 'pool', 'rmsnap',
+ 'poolname', 'snapname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap',
+ 'poolname', 'snapname',
+ 'toomany']))
+
+ def test_pool_create(self):
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128',
+ 'foo=bar'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128',
+ 'foo=bar', 'baz=frob'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128',
+ 'foo=bar', 'baz=frob'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
+ 'poolname', '-1']))
+
+ def test_pool_delete(self):
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ '--yes-i-really-really-mean-it'])
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname', 'poolname'])
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'delete']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ 'not really']))
+ assert_equal({}, validate_command(sigdict,
+ ['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ '--yes-i-really-really-mean-it',
+ 'toomany']))
+
+ def test_pool_rename(self):
+ self.assert_valid_command(['osd', 'pool', 'rename',
+ 'poolname', 'othername'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename',
+ 'poolname', 'othername',
+ 'toomany']))
+
+ def test_pool_get(self):
+ for var in ('size', 'min_size', 'crash_replay_interval',
+ 'pg_num', 'pgp_num', 'crush_ruleset'):
+ self.assert_valid_command(['osd', 'pool', 'get', 'poolname', var])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname',
+ 'size', 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname',
+ 'invalid']))
+
+ def test_pool_set(self):
+ for var in ('size', 'min_size', 'crash_replay_interval',
+ 'pg_num', 'pgp_num', 'crush_ruleset'):
+ self.assert_valid_command(['osd', 'pool',
+ 'set', 'poolname', var, '-1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'size', 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'invalid', '-1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'size', '-1',
+ 'toomany']))
+
+ def test_pool_set_quota(self):
+ for field in ('max_objects', 'max_bytes'):
+ self.assert_valid_command(['osd', 'pool', 'set-quota',
+ 'poolname', field, '10K'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'max_objects']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'invalid',
+ '10K']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'max_objects',
+ '10K',
+ 'toomany']))
+
+ def test_reweight_by_utilization(self):
+ self.assert_valid_command(['osd', 'reweight-by-utilization'])
+ self.assert_valid_command(['osd', 'reweight-by-utilization', '100'])
+ assert_equal({}, validate_command(sigdict, ['osd',
+ 'reweight-by-utilization',
+ '50']))
+ assert_equal({}, validate_command(sigdict, ['osd',
+ 'reweight-by-utilization',
+ '100',
+ 'toomany']))
+
+ def test_thrash(self):
+ self.check_1_natural_arg('osd', 'thrash')
+
+ def test_tier_op(self):
+ for op in ('add', 'remove', 'set-overlay'):
+ self.assert_valid_command(['osd', 'tier', op,
+ 'poolname', 'othername'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op]))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op,
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op,
+ 'poolname',
+ 'othername',
+ 'toomany']))
+
+ def test_tier_cache_mode(self):
+ for mode in ('none', 'writeback', 'invalidate+forward', 'readonly'):
+ self.assert_valid_command(['osd', 'tier', 'cache-mode',
+ 'poolname', mode])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'cache-mode']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'cache-mode',
+ 'invalid']))
+
+ def test_tier_remove_overlay(self):
+ self.assert_valid_command(['osd', 'tier', 'remove-overlay',
+ 'poolname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'remove-overlay']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'remove-overlay',
+ 'poolname',
+ 'toomany']))
+
+
+class TestConfigKey(TestArgparse):
+
+ def test_get(self):
+ self.check_1_string_arg('config-key', 'get')
+
+ def test_put(self):
+ self.assert_valid_command(['config-key', 'put',
+ 'key'])
+ self.assert_valid_command(['config-key', 'put',
+ 'key', 'value'])
+ assert_equal({}, validate_command(sigdict, ['config-key', 'put']))
+ assert_equal({}, validate_command(sigdict, ['config-key', 'put',
+ 'key', 'value',
+ 'toomany']))
+
+ def test_del(self):
+ self.check_1_string_arg('config-key', 'del')
+
+ def test_exists(self):
+ self.check_1_string_arg('config-key', 'exists')
+
+ def test_list(self):
+ self.check_no_arg('config-key', 'list')
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 &&
+# PYTHONPATH=pybind nosetests --stop \
+# test/pybind/test_ceph_argparse.py # test_ceph_argparse.py:TestOSD.test_rm"
+# End:
diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc
index 49b8d10bdba..b4220bae307 100644
--- a/src/tools/ceph-filestore-dump.cc
+++ b/src/tools/ceph-filestore-dump.cc
@@ -52,6 +52,32 @@ enum {
END_OF_TYPES, //Keep at the end
};
+//#define INTERNAL_TEST
+//#define INTERNAL_TEST2
+
+#ifdef INTERNAL_TEST
+CompatSet get_test_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+#ifdef INTERNAL_TEST2
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+#endif
+
typedef uint8_t sectiontype_t;
typedef uint32_t mymagic_t;
typedef int64_t mysize_t;
@@ -69,7 +95,7 @@ const int fd_none = INT_MIN;
//can be added to the export format.
struct super_header {
static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
- static const uint32_t super_ver = 1;
+ static const uint32_t super_ver = 2;
static const uint32_t FIXED_LENGTH = 16;
uint32_t magic;
uint32_t version;
@@ -139,18 +165,25 @@ struct footer {
struct pg_begin {
pg_t pgid;
+ OSDSuperblock superblock;
- pg_begin(pg_t pg): pgid(pg) { }
+ pg_begin(pg_t pg, OSDSuperblock sb):
+ pgid(pg), superblock(sb) { }
pg_begin() { }
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ // New super_ver prevents decode from ver 1
+ ENCODE_START(2, 2, bl);
::encode(pgid, bl);
+ ::encode(superblock, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(pgid, bl);
+ if (struct_v > 1) {
+ ::decode(superblock, bl);
+ }
DECODE_FINISH(bl);
}
};
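pg_begin above shows the standard encoding-evolution pattern: bump the version in ENCODE_START (here 2, with oldest-compatible 2, so version-1 decoders refuse the new encoding) and guard the added field behind struct_v on decode. A condensed sketch of the same pattern with a hypothetical field, assuming the macros from include/encoding.h:

#include <stdint.h>
#include "include/encoding.h"

// 'extra' is a hypothetical field added in version 2 of the encoding.
struct example_t {
  uint32_t base;
  uint32_t extra;

  void encode(bufferlist &bl) const {
    ENCODE_START(2, 2, bl);   // version 2; oldest decoder that copes: 2
    ::encode(base, bl);
    ::encode(extra, bl);      // new in v2
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(2, bl);
    ::decode(base, bl);
    if (struct_v > 1)
      ::decode(extra, bl);    // absent from v1 encodings
    DECODE_FINISH(bl);
  }
};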
@@ -347,8 +380,8 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
OSD::make_snapmapper_oid());
SnapMapper mapper(&driver, 0, 0, 0);
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
int r = 0;
int64_t num = 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -358,13 +391,14 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
&objects, &next);
if (r < 0)
goto out;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i, ++num) {
+ assert(i->generation == ghobject_t::NO_GEN);
OSDriver::OSTransaction _t(driver.get_transaction(t));
cout << "remove " << *i << std::endl;
- int r = mapper.remove_oid(*i, &_t);
+ int r = mapper.remove_oid(i->hobj, &_t);
if (r != 0 && r != -ENOENT) {
assert(0);
}
@@ -621,18 +655,19 @@ int export_file(ObjectStore *store, coll_t cid, hobject_t &obj)
int export_files(ObjectStore *store, coll_t coll)
{
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
while (!next.is_max()) {
int r = store->collection_list_partial(coll, next, 200, 300, 0,
&objects, &next);
if (r < 0)
return r;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- r = export_file(store, coll, *i);
+ assert(i->generation == ghobject_t::NO_GEN);
+ r = export_file(store, coll, i->hobj);
if (r < 0)
return r;
}
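The listing now yields ghobject_t, which wraps the old hobject_t together with a generation (and, per the INCOMPAT_SHARDS feature handled below, a shard); asserting NO_GEN is how the tool declares it only handles plain objects before unwrapping .hobj for legacy code such as SnapMapper. A short sketch of that unwrap, assuming ghobject_t as declared in common/hobject.h:

#include <assert.h>
#include <vector>
#include "common/hobject.h"

// Accept only plain (no-generation) objects and hand the embedded
// hobject_t to legacy interfaces such as SnapMapper::remove_oid().
void for_each_plain_object(const std::vector<ghobject_t> &objects)
{
  for (std::vector<ghobject_t>::const_iterator i = objects.begin();
       i != objects.end();
       ++i) {
    assert(i->generation == ghobject_t::NO_GEN);
    const hobject_t &obj = i->hobj; // legacy identity, minus gen/shard
    (void)obj;
  }
}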
@@ -664,7 +699,7 @@ void write_super()
}
int do_export(ObjectStore *fs, coll_t coll, pg_t pgid, pg_info_t &info,
- epoch_t map_epoch, __u8 struct_ver)
+ epoch_t map_epoch, __u8 struct_ver, OSDSuperblock superblock)
{
PGLog::IndexedLog log;
pg_missing_t missing;
@@ -675,7 +710,7 @@ int do_export(ObjectStore *fs, coll_t coll, pg_t pgid, pg_info_t &info,
write_super();
- pg_begin pgb(pgid);
+ pg_begin pgb(pgid, superblock);
ret = write_section(TYPE_PG_BEGIN, pgb, file_fd);
if (ret)
return ret;
@@ -909,7 +944,7 @@ int get_pg_metadata(ObjectStore *store, coll_t coll, bufferlist &bl)
return 0;
}
-int do_import(ObjectStore *store)
+int do_import(ObjectStore *store, OSDSuperblock sb)
{
bufferlist ebl;
pg_info_t info;
@@ -943,7 +978,16 @@ int do_import(ObjectStore *store)
pg_begin pgb;
pgb.decode(ebliter);
pg_t pgid = pgb.pgid;
-
+
+ if (debug) {
+ cout << "Exported features: " << pgb.superblock.compat_features << std::endl;
+ }
+ if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+ cout << "Export has incompatible features set "
+ << pgb.superblock.compat_features << std::endl;
+ return 1;
+ }
+
log_oid = OSD::make_pg_log_oid(pgid);
biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
@@ -1017,7 +1061,7 @@ int main(int argc, char **argv)
("pgid", po::value<string>(&pgidstr),
"PG id, mandatory")
("type", po::value<string>(&type),
- "Type one of info, log, export, or import, mandatory")
+ "Type one of info, log, remove, export, or import, mandatory")
("file", po::value<string>(&file),
"path of file to export or import")
("debug", "Enable diagnostic output to stderr")
@@ -1170,14 +1214,67 @@ int main(int argc, char **argv)
return 1;
}
+ bool fs_sharded_objects = fs->get_allow_sharded_objects();
+
int ret = 0;
vector<coll_t> ls;
vector<coll_t>::iterator it;
+ CompatSet supported;
+
+#ifdef INTERNAL_TEST
+ supported = get_test_compat_set();
+#else
+ supported = OSD::get_osd_compat_set();
+#endif
+
+ bufferlist bl;
+ OSDSuperblock superblock;
+ bufferlist::iterator p;
+ ret = fs->read(coll_t::META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
+ if (ret < 0) {
+ cout << "Failure to read OSD superblock error= " << r << std::endl;
+ goto out;
+ }
+
+ p = bl.begin();
+ ::decode(superblock, p);
+
+#ifdef INTERNAL_TEST2
+ fs->set_allow_sharded_objects();
+ assert(fs->get_allow_sharded_objects());
+ fs_sharded_objects = true;
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+
+ if (debug && file_fd != STDOUT_FILENO) {
+ cout << "Supported features: " << supported << std::endl;
+ cout << "On-disk features: " << superblock.compat_features << std::endl;
+ }
+ if (supported.compare(superblock.compat_features) == -1) {
+ cout << "On-disk OSD incompatible features set "
+ << superblock.compat_features << std::endl;
+ ret = EINVAL;
+ goto out;
+ }
+
+ // If an OSD crashed while transitioning to sharded objects and had not
+ // yet completed set_allow_sharded_objects(), the superblock and the
+ // FileStore will disagree. This utility does not attempt to finish
+ // that transition.
+ if (superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS) != fs_sharded_objects) {
+ // An OSD should never have called set_allow_sharded_objects() before
+ // updating its own OSD features.
+ if (fs_sharded_objects)
+ cout << "FileStore sharded but OSD not set, Corruption?" << std::endl;
+ else
+ cout << "Found incomplete transition to sharded objects" << std::endl;
+ ret = EINVAL;
+ goto out;
+ }
if (type == "import") {
try {
- ret = do_import(fs);
+ ret = do_import(fs, superblock);
}
catch (const buffer::error &e) {
cout << "do_import threw exception error " << e.what() << std::endl;
@@ -1260,7 +1357,7 @@ int main(int argc, char **argv)
cerr << "struct_v " << (int)struct_ver << std::endl;
if (type == "export") {
- ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver);
+ ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock);
} else if (type == "info") {
formatter->open_object_section("info");
info.dump(formatter);
diff --git a/src/tools/ceph-osdomap-tool.cc b/src/tools/ceph-osdomap-tool.cc
index aedc4c824e7..bde4b28b45f 100644
--- a/src/tools/ceph-osdomap-tool.cc
+++ b/src/tools/ceph-osdomap-tool.cc
@@ -115,30 +115,30 @@ int main(int argc, char **argv) {
i->value().hexdump(std::cout);
}
} else if (cmd == "dump-objects") {
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = omap.list_objects(&objects);
if (r < 0) {
std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
goto done;
}
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
std::cout << *i << std::endl;
}
r = 0;
} else if (cmd == "dump-objects-with-keys") {
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = omap.list_objects(&objects);
if (r < 0) {
std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
goto done;
}
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
std::cout << "Object: " << *i << std::endl;
- ObjectMap::ObjectMapIterator j = omap.get_iterator(*i);
+ ObjectMap::ObjectMapIterator j = omap.get_iterator(i->hobj);
for (j->seek_to_first(); j->valid(); j->next()) {
std::cout << j->key() << std::endl;
j->value().hexdump(std::cout);
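
For reference, the iteration pattern dump-objects-with-keys uses above, pulled out into a standalone helper; a sketch only, assuming an already-opened ObjectMap (dump_keys is a hypothetical name):

    void dump_keys(ObjectMap &omap, const ghobject_t &g) {
      ObjectMap::ObjectMapIterator it = omap.get_iterator(g.hobj);
      for (it->seek_to_first(); it->valid(); it->next()) {
        std::cout << it->key() << std::endl;   // key on one line...
        it->value().hexdump(std::cout);        // ...then the value as hex
      }
    }

Note that get_iterator() still takes the wrapped hobject_t, which is why the hunk unwraps with i->hobj rather than passing the ghobject_t through.
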
diff --git a/src/tools/dupstore.cc b/src/tools/dupstore.cc
index e17eb2201a7..c8b8ece31c8 100644
--- a/src/tools/dupstore.cc
+++ b/src/tools/dupstore.cc
@@ -27,7 +27,7 @@ int dupstore(ObjectStore* src, ObjectStore* dst)
if (dst->mount() < 0) return 1;
// objects
- hash_map<hobject_t, coll_t> did_object;
+ hash_map<ghobject_t, coll_t> did_object;
// collections
vector<coll_t> collections;
@@ -54,11 +54,11 @@ int dupstore(ObjectStore* src, ObjectStore* dst)
dst->apply_transaction(t);
}
- vector<hobject_t> o;
+ vector<ghobject_t> o;
src->collection_list(*p, o);
int numo = o.size();
int j = 1;
- for (vector<hobject_t>::iterator q = o.begin(); q != o.end(); ++q) {
+ for (vector<ghobject_t>::iterator q = o.begin(); q != o.end(); ++q) {
ObjectStore::Transaction t;
if (did_object.count(*q))
t.collection_add(*p, did_object[*q], *q);
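
One subtlety in the dupstore hunk: hash_map<ghobject_t, coll_t> only compiles if a hash functor for ghobject_t is in scope. A hedged sketch of the shape such a specialization takes for the GNU hash_map used here; the mixing function below is illustrative, not the real one, which ships alongside ghobject_t:

    namespace __gnu_cxx {
      template<> struct hash<ghobject_t> {
        size_t operator()(const ghobject_t &g) const {
          // combine the wrapped object's hash with the generation
          return hash<hobject_t>()(g.hobj) ^ (size_t)g.generation;
        }
      };
    }
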
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index e8386959349..ad8eaa3e1a4 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -98,6 +98,7 @@ void usage(ostream& out)
" rmomapkey <obj-name> <key>\n"
" getomapheader <obj-name>\n"
" setomapheader <obj-name> <val>\n"
+" tmap-to-omap <obj-name> convert tmap keys/values to omap\n"
" listwatchers <obj-name> list the watchers of this object\n"
"\n"
"IMPORT AND EXPORT\n"
@@ -1813,8 +1814,15 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
bufferlist::iterator p = outdata.begin();
bufferlist header;
map<string, bufferlist> kv;
- ::decode(header, p);
- ::decode(kv, p);
+ try {
+ ::decode(header, p);
+ ::decode(kv, p);
+ }
+ catch (buffer::error& e) {
+ cerr << "error decoding tmap " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
cout << "header (" << header.length() << " bytes):\n";
header.hexdump(cout);
cout << "\n";
@@ -1841,6 +1849,50 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
}
}
+ else if (strcmp(nargs[0], "tmap-to-omap") == 0) {
+ if (!pool_name || nargs.size() < 2)
+ usage_exit();
+ string oid(nargs[1]);
+
+ bufferlist bl;
+ int r = io_ctx.tmap_get(oid, bl);
+ if (r < 0) {
+ ret = r;
+ cerr << "error reading tmap " << pool_name << "/" << oid
+ << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ bufferlist hdr;
+ map<string, bufferlist> kv;
+ bufferlist::iterator p = bl.begin();
+ try {
+ ::decode(hdr, p);
+ ::decode(kv, p);
+ }
+ catch (buffer::error& e) {
+ cerr << "error decoding tmap " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!p.end()) {
+ cerr << "error decoding tmap (stray trailing data) in " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ librados::ObjectWriteOperation wr;
+ wr.omap_set_header(hdr);
+ wr.omap_set(kv);
+ wr.truncate(0); // delete the old tmap data
+ r = io_ctx.operate(oid, &wr);
+ if (r < 0) {
+ ret = r;
+ cerr << "error writing tmap data as omap on " << pool_name << "/" << oid
+ << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = 0;
+ }
+
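
The new subcommand is invoked as rados -p <pool> tmap-to-omap <obj-name>. The same conversion, driven from a standalone librados client, looks roughly like the following; a sketch assuming the C++ librados API and an already-opened IoCtx, with the try/catch around the decodes omitted for brevity:

    #include <rados/librados.hpp>
    #include <map>
    #include <string>

    int tmap_to_omap(librados::IoCtx &io, const std::string &oid) {
      bufferlist bl;
      int r = io.tmap_get(oid, bl);        // fetch the legacy tmap blob
      if (r < 0)
        return r;
      bufferlist hdr;
      std::map<std::string, bufferlist> kv;
      bufferlist::iterator p = bl.begin();
      ::decode(hdr, p);                    // tmap = header + sorted key/value map
      ::decode(kv, p);
      librados::ObjectWriteOperation wr;
      wr.omap_set_header(hdr);             // replay header and keys as omap
      wr.omap_set(kv);
      wr.truncate(0);                      // drop the now-redundant tmap bytes
      return io.operate(oid, &wr);
    }

The truncate(0) in the same write op is what makes the conversion atomic from the client's point of view: either the object ends up with omap data and no tmap bytes, or the op fails as a unit.
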
else if (strcmp(nargs[0], "mkpool") == 0) {
int auid = 0;
__u8 crush_rule = 0;
@@ -2235,8 +2287,9 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
}
} else {
- cerr << "unrecognized command " << nargs[0] << std::endl;
- usage_exit();
+ cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl;
+ ret = -EINVAL;
+ goto out;
}
if (ret < 0)
diff --git a/src/vstart.sh b/src/vstart.sh
index c112bfc9138..def480779de 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -249,11 +249,11 @@ if [ -n "$ip" ]; then
IP="$ip"
else
echo hostname $HOSTNAME
- RAW_IP=`hostname --ip-address`
+ RAW_IP=`hostname -I`
# filter out IPv6 and localhost addresses
IP="$(echo "$RAW_IP"|tr ' ' '\012'|grep -v :|grep -v '^127\.'|head -n1)"
# if that left nothing, then try to use the raw thing, it might work
- if [ -z "IP" ]; then IP="$RAW_IP"; fi
+ if [ -z "$IP" ]; then IP="$RAW_IP"; fi
echo ip $IP
fi
echo "ip $IP"
@@ -339,6 +339,7 @@ $DAEMONOPTS
$COSDDEBUG
$extra_conf
[mon]
+ mon pg warn min per osd = 10
$DAEMONOPTS
$CMONDEBUG
$extra_conf