86 files changed, 2543 insertions, 1269 deletions
diff --git a/.gitignore b/.gitignore
index 99560459d1e..73a8d86bd14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,3 +53,4 @@ core
 .project
 .cproject
 /build-doc
+/doc/object_store.png
diff --git a/admin/build-doc b/admin/build-doc
index aee1a48345c..622d7c84031 100755
--- a/admin/build-doc
+++ b/admin/build-doc
@@ -10,6 +10,7 @@ if [ ! -e build-doc/doxygen/xml ]; then
 fi
 
 dia --filter=png-libart --export=doc/overview.png.tmp doc/overview.dia
+
 mv -- doc/overview.png.tmp doc/overview.png
 
 cd build-doc
diff --git a/doc/appendix/differences-from-posix.rst b/doc/appendix/differences-from-posix.rst
new file mode 100644
index 00000000000..f327e786dc9
--- /dev/null
+++ b/doc/appendix/differences-from-posix.rst
@@ -0,0 +1,17 @@
+========================
+ Differences from POSIX
+========================
+
+.. todo:: delete http://ceph.newdream.net/wiki/Differences_from_POSIX
+
+Ceph does have a few places where it diverges from strict POSIX semantics for various reasons:
+
+- Sparse files propagate incorrectly to tools like df. They will only
+  use up the required space, but in df will increase the "used" space
+  by the full file size. We do this because actually keeping track of
+  the space a large, sparse file uses is very expensive.
+- In shared simultaneous writer situations, a write that crosses
+  object boundaries is not necessarily atomic. This means that you
+  could have writer A write "aa|aa" and writer B write "bb|bb"
+  simultaneously (where | is the object boundary), and end up with
+  "aa|bb" rather than the proper "aa|aa" or "bb|bb".
diff --git a/doc/appendix/index.rst b/doc/appendix/index.rst
new file mode 100644
index 00000000000..a98bf899a2a
--- /dev/null
+++ b/doc/appendix/index.rst
@@ -0,0 +1,10 @@
+============
+ Appendices
+============
+
+.. toctree::
+   :glob:
+   :numbered:
+   :titlesonly:
+
+   *
diff --git a/doc/architecture.rst b/doc/architecture.rst
index f66cf3cd9b7..05a3fdfa493 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -2,167 +2,180 @@
  Architecture of Ceph
 ======================
 
-- Introduction to Ceph Project
-
-  - High-level overview of project benefits for users (few paragraphs, mention each subproject)
-  - Introduction to sub-projects (few paragraphs to a page each)
-
-    - RADOS
-    - RGW
-    - RBD
-    - Ceph
-
-  - Example scenarios Ceph projects are/not suitable for
-  - (Very) High-Level overview of Ceph
-
-    This would include an introduction to basic project terminology,
-    the concept of OSDs, MDSes, and Monitors, and things like
-    that. What they do, some of why they're awesome, but not how they
-    work.
-
-- Discussion of MDS terminology, daemon types (active, standby,
-  standby-replay)
-
-.. todo:: write me
-
-=================================
- Library architecture
-=================================
-Ceph is structured into libraries which are built and then combined together to
-make executables and other libraries.
-
-- libcommon: a collection of utilities which are available to nearly every ceph
-  library and executable. In general, libcommon should not contain global
-  variables, because it is intended to be linked into libraries such as
-  libceph.so.
-
-- libglobal: a collection of utilities focused on the needs of Ceph daemon
-  programs. In here you will find pidfile management functions, signal
-  handlers, and so forth.
-
-.. todo:: document other libraries
-
-=================================
- Configuration Management System
-=================================
-The configuration management system exists to provide every daemon with the
-proper configuration information. The configuration can be viewed as a set of
-key-value pairs.
-
-How can the configuration be set? Well, there are several sources:
- - the ceph configuration file, usually named ceph.conf
- - command line arguments::
-    --debug-ms=1
-    --debug-pg=10
-    etc.
- - arguments injected at runtime by using injectargs
-
-======================================================
- The Configuration File
-======================================================
-Most configuration settings originate in the Ceph configuration file.
-
-How do we find the configuration file? Well, in order, we check:
- - the default locations
- - the environment variable CEPH_CONF
- - the command line argument -c
-
-Each stanza of the configuration file describes the key-value pairs that will be in
-effect for a particular subset of the daemons. The "global" stanza applies to
-everything. The "mon", "osd", and "mds" stanzas specify settings to take effect
-for all monitors, all osds, and all mds servers, respectively.  A stanza of the
-form mon.$name, osd.$name, or mds.$name gives settings for the monitor, OSD, or
-MDS of that name, respectively. Configuration values that appear later in the
-file win over earlier ones.
-
-A sample configuration file can be found in src/sample.ceph.conf.
-
-======================================================
- Metavariables
-======================================================
-The configuration system supports certain "metavariables." If these occur
-inside a configuration value, they are expanded into something else-- similar to
-how bash shell expansion works.
-
-There are a few different metavariables:
- - $host: expands to the current hostname
- - $type: expands to one of "mds", "osd", or "mon"
- - $id: expands to the daemon identifier. For osd.0, this would be "0"; for mds.a, it would be "a"
- - $num: same as $id
- - $name: expands to $type.$id
-
-======================================================
- Interfacing with the Configuration Management System
-======================================================
-There are two ways for Ceph code to get configuration values. One way is to
-read it directly from a variable named "g_conf," or equivalently,
-"g_ceph_ctx->_conf." The other is to register an observer that will called
-every time the relevant configuration values changes.  This observer will be
-called soon after the initial configuration is read, and every time after that
-when one of the relevant values changes. Each observer tracks a set of keys
-and is invoked only when one of the relevant keys changes.
-
-The interface to implement is found in common/config_obs.h.
-
-The observer method should be preferred in new code because
- - It is more flexible, allowing the code to do whatever reinitialization needs
-   to be done to implement the new configuration value.
- - It is the only way to create a std::string configuration variable that can
-   be changed by injectargs.
- - Even for int-valued configuration options, changing the values in one thread
-   while another thread is reading them can lead to subtle and
-   impossible-to-diagnose bugs.
-
-For these reasons, reading directly from g_conf should be considered deprecated
-and not done in new code.  Do not ever alter g_conf.
-
-=================================
- Debug Logs
-=================================
-The main debugging tool for Ceph is the dout and derr logging functions.
-Collectively, these are referred to as "dout logging."
-
-Dout has several log faculties, which can be set at various log
-levels using the configuration management system. So it is possible to enable
-debugging just for the messenger, by setting debug_ms to 10, for example.
-
-Dout is implemented mainly in common/DoutStreambuf.cc
-
-The dout macro avoids even generating log messages which are not going to be
-used, by enclosing them in an "if" statement. What this means is that if you
-have the debug level set at 0, and you run this code
-
-``dout(20) << "myfoo() = " << myfoo() << dendl;``
-
-
-myfoo() will not be called here.
-
-Unfortunately, the performance of debug logging is relatively low. This is
-because there is a single, process-wide mutex which every debug output
-statement takes, and every debug output statement leads to a write() system
-call or a call to syslog(). There is also a computational overhead to using C++
-streams to consider. So you will need to be parsimonius in your logging to get
-the best performance.
-
-Sometimes, enabling logging can hide race conditions and other bugs by changing
-the timing of events. Keep this in mind when debugging.
-
-=================================
- CephContext
-=================================
-A CephContext represents a single view of the Ceph cluster. It comes complete
-with a configuration, a set of performance counters (PerfCounters), and a
-heartbeat map. You can find more information about CephContext in
-src/common/ceph_context.h.
-
-Generally, you will have only one CephContext in your application, called
-g_ceph_context. However, in library code, it is possible that the library user
-will initialize multiple CephContexts. For example, this would happen if he
-called rados_create more than once.
-
-A ceph context is required to issue log messages. Why is this? Well, without
-the CephContext, we would not know which log messages were disabled and which
-were enabled.  The dout() macro implicitly references g_ceph_context, so it
-can't be used in library code.  It is fine to use dout and derr in daemons, but
-in library code, you must use ldout and lderr, and pass in your own CephContext
-object. The compiler will enforce this restriction.
+Ceph is a distributed network storage and file system with distributed
+metadata management and POSIX semantics.
+
+RADOS is a reliable object store, used by Ceph, but also directly
+accessible.
+
+``radosgw`` is an S3-compatible RESTful HTTP service for object
+storage, using RADOS storage.
+
+RBD is a Linux kernel feature that exposes RADOS storage as a block
+device. Qemu/KVM also has a direct RBD client, that avoids the kernel
+overhead.
+
+
+.. _monitor:
+
+Monitor cluster
+===============
+
+``cmon`` is a lightweight daemon that provides a consensus for
+distributed decisionmaking in a Ceph/RADOS cluster.
+
+It also is the initial point of contact for new clients, and will hand
+out information about the topology of the cluster, such as the
+``osdmap``.
+
+You normally run 3 ``cmon`` daemons, on 3 separate physical machines,
+isolated from each other; for example, in different racks or rows.
+
+You could run just 1 instance, but that means giving up on high
+availability.
+
+You may use the same hosts for ``cmon`` and other purposes.
+
+``cmon`` processes talk to each other using a Paxos_\-style
+protocol. They discover each other via the ``[mon.X] mon addr`` fields
+in ``ceph.conf``.
+
+.. todo:: What about ``monmap``? Fact check.
+
+Any decision requires the majority of the ``cmon`` processes to be
+healthy and communicating with each other. For this reason, you never
+want an even number of ``cmon``\s; there is no unambiguous majority
+subgroup for an even number.
+
+.. _Paxos: http://en.wikipedia.org/wiki/Paxos_algorithm
+
+.. todo:: explain monmap
+
+
+.. _rados:
+
+
+RADOS
+=====
+
+``cosd`` is the storage daemon that provides the RADOS service. It
+uses ``cmon`` for cluster membership, services object read/write/etc
+request from clients, and peers with other ``cosd``\s for data
+replication.
+
+The data model is fairly simple on this level. There are multiple
+named pools, and within each pool there are named objects, in a flat
+namespace (no directories). Each object has both data and metadata.
+
+The data for an object is a single, potentially big, series of
+bytes. Additionally, the series may be sparse, it may have holes that
+contain binary zeros, and take up no actual storage.
+
+The metadata is an unordered set of key-value pairs. It's semantics
+are completely up to the client; for example, the Ceph filesystem uses
+metadata to store file owner etc.
+
+.. todo:: Verify that metadata is unordered.
+
+Underneath, ``cosd`` stores the data on a local filesystem. We
+recommend using Btrfs_, but any POSIX filesystem that has extended
+attributes should work (see :ref:`xattr`).
+
+.. _Btrfs: http://en.wikipedia.org/wiki/Btrfs
+
+.. todo:: write about access control
+
+.. todo:: explain osdmap
+
+.. todo:: explain plugins ("classes")
+
+
+.. _cephfs:
+
+Ceph filesystem
+===============
+
+The Ceph filesystem service is provided by a daemon called
+``cmds``. It uses RADOS to store all the filesystem metadata
+(directories, file ownership, access modes, etc), and directs clients
+to access RADOS directly for the file contents.
+
+The Ceph filesystem aims for POSIX compatibility, except for a few
+chosen differences. See :doc:`/appendix/differences-from-posix`.
+
+``cmds`` can run as a single process, or it can be distributed out to
+multiple physical machines, either for high availability or for
+scalability.
+
+For high availability, the extra ``cmds`` instances can be `standby`,
+ready to take over the duties of any failed ``cmds`` that was
+`active`. This is easy because all the data, including the journal, is
+stored on RADOS. The transition is triggered automatically by
+``cmon``.
+
+For scalability, multiple ``cmds`` instances can be `active`, and they
+will split the directory tree into subtrees (and shards of a single
+busy directory), effectively balancing the load amongst all `active`
+servers.
+
+Combinations of `standby` and `active` etc are possible, for example
+running 3 `active` ``cmds`` instances for scaling, and one `standby`.
+
+To control the number of `active` ``cmds``\es, see :doc:`/ops/grow/mds`.
+
+.. topic:: Status as of 2011-09:
+
+   Multiple `active` ``cmds`` operation is stable under normal
+   circumstances, but some failure scenarios may still cause
+   operational issues.
+
+.. todo:: document `standby-replay`
+
+.. todo:: mds.0 vs mds.alpha etc details
+
+
+
+``radosgw``
+===========
+
+``radosgw`` is a FastCGI service that provides a RESTful_ HTTP API to
+store objects and metadata. It layers on top of RADOS with its own
+data formats, and maintains it's own user database, authentication,
+access control, and so on.
+
+.. _RESTful: http://en.wikipedia.org/wiki/RESTful
+
+
+Rados Block Device (RBD)
+========================
+
+In virtual machine scenarios, RBD is typically used via the ``rbd``
+network storage driver in Qemu/KVM, where the host machine uses
+``librbd`` to provide a block device service to the guest.
+
+Alternatively, as no direct ``librbd`` support is available in Xen,
+the Linux kernel can act as the RBD client and provide a real block
+device on the host machine, that can then be accessed by the
+virtualization. This is done with the command-line tool ``rbd`` (see
+:doc:`/ops/rbd`).
+
+The latter is also useful in non-virtualized scenarios.
+
+Internally, RBD stripes the device image over multiple RADOS objects,
+each typically located on a separate ``cosd``, allowing it to perform
+better than a single server could.
+
+
+Client
+======
+
+.. todo:: cephfs, cfuse, librados, libceph, librbd
+
+
+.. todo:: Summarize how much Ceph trusts the client, for what parts (security vs reliability).
+
+
+TODO
+====
+
+.. todo:: Example scenarios Ceph projects are/not suitable for
diff --git a/doc/conf.py b/doc/conf.py
index b74229608a1..afac33f816a 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -22,6 +22,7 @@ html_sidebars = {
 # ugly kludge until breathe is distutils-friendly
 import sys; sys.path.append('../build-doc/breathe')
 extensions = [
+    'sphinx.ext.graphviz',
     'sphinx.ext.todo',
     'breathe',
     ]
diff --git a/doc/dev/config.rst b/doc/dev/config.rst
new file mode 100644
index 00000000000..59f1b92989c
--- /dev/null
+++ b/doc/dev/config.rst
@@ -0,0 +1,77 @@
+=================================
+ Configuration Management System
+=================================
+
+The configuration management system exists to provide every daemon with the
+proper configuration information. The configuration can be viewed as a set of
+key-value pairs.
+
+How can the configuration be set? Well, there are several sources:
+ - the ceph configuration file, usually named ceph.conf
+ - command line arguments::
+    --debug-ms=1
+    --debug-pg=10
+    etc.
+ - arguments injected at runtime by using injectargs
+
+
+The Configuration File
+======================
+
+Most configuration settings originate in the Ceph configuration file.
+
+How do we find the configuration file? Well, in order, we check:
+ - the default locations
+ - the environment variable CEPH_CONF
+ - the command line argument -c
+
+Each stanza of the configuration file describes the key-value pairs that will be in
+effect for a particular subset of the daemons. The "global" stanza applies to
+everything. The "mon", "osd", and "mds" stanzas specify settings to take effect
+for all monitors, all osds, and all mds servers, respectively.  A stanza of the
+form mon.$name, osd.$name, or mds.$name gives settings for the monitor, OSD, or
+MDS of that name, respectively. Configuration values that appear later in the
+file win over earlier ones.
+
+A sample configuration file can be found in src/sample.ceph.conf.
+
+
+Metavariables
+=============
+
+The configuration system supports certain "metavariables." If these occur
+inside a configuration value, they are expanded into something else-- similar to
+how bash shell expansion works.
+
+There are a few different metavariables:
+ - $host: expands to the current hostname
+ - $type: expands to one of "mds", "osd", or "mon"
+ - $id: expands to the daemon identifier. For osd.0, this would be "0"; for mds.a, it would be "a"
+ - $num: same as $id
+ - $name: expands to $type.$id
+
+
+Interfacing with the Configuration Management System
+====================================================
+
+There are two ways for Ceph code to get configuration values. One way is to
+read it directly from a variable named "g_conf," or equivalently,
+"g_ceph_ctx->_conf." The other is to register an observer that will called
+every time the relevant configuration values changes.  This observer will be
+called soon after the initial configuration is read, and every time after that
+when one of the relevant values changes. Each observer tracks a set of keys
+and is invoked only when one of the relevant keys changes.
+
+The interface to implement is found in common/config_obs.h.
+
+The observer method should be preferred in new code because
+ - It is more flexible, allowing the code to do whatever reinitialization needs
+   to be done to implement the new configuration value.
+ - It is the only way to create a std::string configuration variable that can
+   be changed by injectargs.
+ - Even for int-valued configuration options, changing the values in one thread
+   while another thread is reading them can lead to subtle and
+   impossible-to-diagnose bugs.
+
+For these reasons, reading directly from g_conf should be considered deprecated
+and not done in new code.  Do not ever alter g_conf.
diff --git a/doc/dev/context.rst b/doc/dev/context.rst
new file mode 100644
index 00000000000..1a2b2cbfb00
--- /dev/null
+++ b/doc/dev/context.rst
@@ -0,0 +1,20 @@
+=============
+ CephContext
+=============
+
+A CephContext represents a single view of the Ceph cluster. It comes complete
+with a configuration, a set of performance counters (PerfCounters), and a
+heartbeat map. You can find more information about CephContext in
+src/common/ceph_context.h.
+
+Generally, you will have only one CephContext in your application, called
+g_ceph_context. However, in library code, it is possible that the library user
+will initialize multiple CephContexts. For example, this would happen if he
+called rados_create more than once.
+
+A ceph context is required to issue log messages. Why is this? Well, without
+the CephContext, we would not know which log messages were disabled and which
+were enabled.  The dout() macro implicitly references g_ceph_context, so it
+can't be used in library code.  It is fine to use dout and derr in daemons, but
+in library code, you must use ldout and lderr, and pass in your own CephContext
+object. The compiler will enforce this restriction.
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
new file mode 100644
index 00000000000..69fda91850c
--- /dev/null
+++ b/doc/dev/index.rst
@@ -0,0 +1,22 @@
+==================================
+ Internal developer documentation
+==================================
+
+.. note:: If you're looking for how to use Ceph as a library from your
+   own software, please see :doc:`/api/index`.
+
+You can start a development mode Ceph cluster, after compiling the source, with::
+
+	cd src
+	install -d -m0755 out dev/osd0
+	./vstart.sh -n -x -l
+	# check that it's there
+	./ceph health
+
+.. todo:: vstart is woefully undocumented and full of sharp sticks to poke yourself with.
+
+
+.. toctree::
+   :glob:
+
+   *
diff --git a/doc/dev/libs.rst b/doc/dev/libs.rst
new file mode 100644
index 00000000000..bdebdb3b000
--- /dev/null
+++ b/doc/dev/libs.rst
@@ -0,0 +1,18 @@
+======================
+ Library architecture
+======================
+
+Ceph is structured into libraries which are built and then combined together to
+make executables and other libraries.
+
+- libcommon: a collection of utilities which are available to nearly every ceph
+  library and executable. In general, libcommon should not contain global
+  variables, because it is intended to be linked into libraries such as
+  libceph.so.
+
+- libglobal: a collection of utilities focused on the needs of Ceph daemon
+  programs. In here you will find pidfile management functions, signal
+  handlers, and so forth.
+
+.. todo:: document other libraries
+
diff --git a/doc/dev/logs.rst b/doc/dev/logs.rst
new file mode 100644
index 00000000000..f4bef84c2ca
--- /dev/null
+++ b/doc/dev/logs.rst
@@ -0,0 +1,31 @@
+============
+ Debug Logs
+============
+
+The main debugging tool for Ceph is the dout and derr logging functions.
+Collectively, these are referred to as "dout logging."
+
+Dout has several log faculties, which can be set at various log
+levels using the configuration management system. So it is possible to enable
+debugging just for the messenger, by setting debug_ms to 10, for example.
+
+Dout is implemented mainly in common/DoutStreambuf.cc
+
+The dout macro avoids even generating log messages which are not going to be
+used, by enclosing them in an "if" statement. What this means is that if you
+have the debug level set at 0, and you run this code
+
+``dout(20) << "myfoo() = " << myfoo() << dendl;``
+
+
+myfoo() will not be called here.
+
+Unfortunately, the performance of debug logging is relatively low. This is
+because there is a single, process-wide mutex which every debug output
+statement takes, and every debug output statement leads to a write() system
+call or a call to syslog(). There is also a computational overhead to using C++
+streams to consider. So you will need to be parsimonius in your logging to get
+the best performance.
+
+Sometimes, enabling logging can hide race conditions and other bugs by changing
+the timing of events. Keep this in mind when debugging.
diff --git a/doc/dev/object-store.rst b/doc/dev/object-store.rst
new file mode 100644
index 00000000000..698070c5e7f
--- /dev/null
+++ b/doc/dev/object-store.rst
@@ -0,0 +1,67 @@
+====================================
+ Object Store Architecture Overview
+====================================
+
+.. graphviz::
+
+  /*
+   * Rough outline of object store module dependencies
+   */
+
+  digraph object_store {
+    size="7,7";
+    node [color=lightblue2, style=filled, fontname="Serif"];
+
+    "testrados" -> "librados"
+    "testradospp" -> "librados"
+
+    "rbd" -> "librados"
+
+    "radostool" -> "librados"
+
+    "radosgw_admin" -> "rgw"
+
+    "rgw" -> "librados"
+
+    "radosacl" -> "librados"
+
+    "librados" -> "objecter"
+
+    "ObjectCacher" -> "Filer"
+
+    "dumpjournal" -> "Journaler"
+
+    "Journaler" -> "Filer"
+
+    "SyntheticClient" -> "Filer"
+    "SyntheticClient" -> "objecter"
+
+    "Filer" -> "objecter"
+
+    "objecter" -> "OSDMap"
+
+    "cosd" -> "PG"
+    "cosd" -> "ObjectStore"
+
+    "crushtool" -> "CrushWrapper"
+
+    "OSDMap" -> "CrushWrapper"
+
+    "OSDMapTool" -> "OSDMap"
+
+    "PG" -> "ReplicatedPG"
+    "PG" -> "ObjectStore"
+    "PG" -> "OSDMap"
+
+    "ReplicatedPG" -> "ObjectStore"
+    "ReplicatedPG" -> "OSDMap"
+
+    "ObjectStore" -> "FileStore"
+
+    "FileStore" -> "ext3"
+    "FileStore" -> "ext4"
+    "FileStore" -> "btrfs"
+  }
+
+
+.. todo:: write more here
diff --git a/doc/index.rst b/doc/index.rst
index c108dffeeb4..d3ad0c669ec 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -90,9 +90,11 @@ Table of Contents
    architecture
    ops/index
    api/index
+   Internals <dev/index>
    man/index
    papers
    glossary
+   appendix/index
 
 
 Indices and tables
diff --git a/doc/ops/autobuilt.rst b/doc/ops/autobuilt.rst
new file mode 100644
index 00000000000..add9b292a92
--- /dev/null
+++ b/doc/ops/autobuilt.rst
@@ -0,0 +1,64 @@
+=============================
+ Autobuilt unstable packages
+=============================
+
+We automatically build Debian and Ubuntu packages for any branches or
+tags that appear in the |ceph.git|_. We build packages for the `amd64`
+and `i386` architectures (`arch list`_), for the following
+distributions (`distro list`_):
+
+- ``natty`` (Ubuntu 11.04)
+- ``squeeze`` (Debian 6.0)
+
+.. |ceph.git| replace::
+   ``ceph.git`` repository
+.. _`ceph.git`: https://github.com/NewDreamNetwork/ceph
+
+.. _`arch list`: http://ceph.newdream.net/debian-snapshot-amd64/master/dists/natty/main/
+.. _`distro list`: http://ceph.newdream.net/debian-snapshot-amd64/master/dists/
+
+The current status of autobuilt packages can be found at
+http://ceph.newdream.net/gitbuilder-deb-amd64/ .
+
+If you wish to use these packages, you need to modify the
+:ref:`earlier instructions <install-debs>` as follows:
+
+.. warning:: The following commands make your computer trust any code
+   that makes it into ``ceph.git``, including work in progress
+   branches and versions of code with possible security issues (that
+   were fixed afterwards). Use at your own risk!
+
+Whenever we say *DISTRO* below, replace it with the codename of your
+operating system.
+
+Whenever we say *BRANCH* below, replace it with the version of the
+code you want to run, e.g. ``master``, ``stable`` or ``v0.34`` (`branch list`_ [#broken-links]_).
+
+.. _`branch list`: http://ceph.newdream.net/debian-snapshot-amd64/
+
+Run these commands on all nodes::
+
+	wget -q -O- https://raw.github.com/NewDreamNetwork/ceph/master/keys/autobuild.asc \
+	| sudo apt-key add -
+
+	sudo tee /etc/apt/sources.list.d/ceph.list <<EOF
+	deb http://ceph.newdream.net/debian-snapshot-amd64/BRANCH/ DISTRO main
+	deb-src http://ceph.newdream.net/debian-snapshot-amd64/BRANCH/ DISTRO main
+	EOF
+
+	sudo apt-get update
+	sudo apt-get install ceph
+
+From here on, you can follow the usual set up instructions in
+:doc:`/ops/install`.
+
+
+
+.. rubric:: Footnotes
+
+.. [#broken-links] Technical issues with how that part of the URL
+   space is HTTP reverse proxied means that the links in the generated
+   directory listings are broken. Please don't click on the links,
+   instead edit the URL bar manually, for now.
+
+   .. todo:: Fix the gitbuilder reverse proxy to not break relative URLs.
diff --git a/doc/ops/filesystem.rst b/doc/ops/filesystem.rst
index 6a9ddeca7a6..ad302dff69c 100644
--- a/doc/ops/filesystem.rst
+++ b/doc/ops/filesystem.rst
@@ -3,3 +3,22 @@
 =======================================
 
 .. todo:: Benefits of each, limits on non-btrfs ones, performance data when we have them, etc
+
+
+.. _btrfs:
+
+Btrfs
+-----
+
+.. todo:: what does btrfs give you (the journaling thing)
+
+
+ext4/ext3
+---------
+
+.. _xattr:
+
+Enabling extended attributes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. todo:: how to enable xattr on ext4/3
diff --git a/doc/ops/index.rst b/doc/ops/index.rst
index 3221394d374..915a02dcfc5 100644
--- a/doc/ops/index.rst
+++ b/doc/ops/index.rst
@@ -14,3 +14,5 @@
    monitor
    grow/index
    data-placement
+   autobuilt
+   misc
diff --git a/doc/ops/install.rst b/doc/ops/install.rst
index 692ac926bec..d0b589d56fc 100644
--- a/doc/ops/install.rst
+++ b/doc/ops/install.rst
@@ -2,13 +2,195 @@
  Installing a Ceph cluster
 ===========================
 
+For development and really early stage testing, see :doc:`/dev/index`.
+
+For installing the latest development builds, see
+:doc:`/ops/autobuilt`.
+
+Installing any complex distributed software can be a lot of work. We
+support two automated ways of installing Ceph: using Chef_, or with
+the ``mkcephfs`` shell script.
+
+.. _Chef: http://wiki.opscode.com/display/chef
+
+.. topic:: Status as of 2011-09
+
+  This section hides a lot of the tedious underlying details. If you
+  need to, or wish to, roll your own deployment automation, or are
+  doing it manually, you'll have to dig into a lot more intricate
+  details.  We are working on simplifying the installation, as that
+  also simplifies our Chef cookbooks.
+
+
+Installing Ceph using Chef
+==========================
+
+(Try saying that fast 10 times.)
+
+.. topic:: Status as of 2011-09
+
+  While we have Chef cookbooks in use internally, they are not yet
+  ready to handle unsupervised installation of a full cluster. Stay
+  tuned for updates.
+
 .. todo:: write me
 
-Authentication is optional but very much recommended.
 
-Basically, everything somebody needs to go through to build a new
-cluster when not cheating via vstart or teuthology, but without
-mentioning all the design tradeoffs and options like journaling
-locations or filesystems
+Installing Ceph using ``mkcephfs``
+==================================
+
+Pick a host that has the Ceph software installed -- it does not have
+to be a part of your cluster, but it does need to have *matching
+versions* of the ``mkcephfs`` command and other Ceph tools
+installed. This will be your `admin host`.
+
+
+Installing the packages
+-----------------------
+
+
+.. _install-debs:
+
+Debian/Ubuntu
+~~~~~~~~~~~~~
+
+We regularly build Debian and Ubuntu packages for the `amd64` and
+`i386` architectures, for the following distributions:
+
+- ``sid`` (Debian unstable)
+- ``squeeze`` (Debian 6.0)
+- ``lenny`` (Debian 5.0)
+- ``oneiric`` (Ubuntu 11.11)
+- ``natty`` (Ubuntu 11.04)
+- ``maverick`` (Ubuntu 10.10)
+
+.. todo:: http://ceph.newdream.net/debian/dists/ also has ``lucid``
+   (Ubuntu 10.04), should that be removed?
+
+Whenever we say *DISTRO* below, replace that with the codename of your
+operating system.
+
+Run these commands on all nodes::
+
+	wget -q -O- https://raw.github.com/NewDreamNetwork/ceph/master/keys/release.asc \
+	| sudo apt-key add -
+
+	sudo tee /etc/apt/sources.list.d/ceph.list <<EOF
+	deb http://ceph.newdream.net/debian/ DISTRO main
+	deb-src http://ceph.newdream.net/debian/ DISTRO main
+	EOF
+
+	sudo apt-get update
+	sudo apt-get install ceph
+
+
+.. todo:: For older distributions, you may need to make sure your apt-get may read .bz2 compressed files. This works for Debian Lenny 5.0.3: ``apt-get install bzip2``
+
+.. todo:: Ponder packages; ceph.deb currently pulls in gceph (ceph.deb
+   Recommends: ceph-client-tools ceph-fuse libceph1 librados2 librbd1
+   btrfs-tools gceph) (other interesting: ceph-client-tools ceph-fuse
+   libceph-dev librados-dev librbd-dev obsync python-ceph radosgw)
+
+
+.. todo:: Other operating system support.
+
+
+Creating a ``ceph.conf`` file
+-----------------------------
+
+On the `admin host`, create a file with a name like
+``mycluster.conf``.
+
+Here's a template for a 3-node cluster, where all three machines run a
+:ref:`monitor <monitor>` and an :ref:`object store <rados>`, and the
+first one runs the :ref:`Ceph filesystem daemon <cephfs>`. Replace the
+hostnames and IP addresses with your own, and add/remove hosts as
+appropriate. All hostnames *must* be short form (no domain).
+
+.. literalinclude:: mycluster.conf
+   :language: ini
+
+Note how the ``host`` variables dictate what node runs what
+services. See :doc:`/ops/config` for more information.
+
+.. todo:: More specific link for host= convention.
+
+.. todo:: Point to cluster design docs, once they are ready.
+
+.. todo:: At this point, either use 1 or 3 mons, point to :doc:`grow/mon`
+
+
+Running ``mkcephfs``
+--------------------
+
+Verify that you can manage the nodes from the host you intend to run
+``mkcephfs`` on:
+
+- Make sure you can SSH_ from the `admin host` into all the nodes
+  using the short hostnames (``myserver`` not
+  ``myserver.mydept.example.com``), with no user specified
+  [#ssh_config]_.
+- Make sure you can run ``sudo`` without passphrase prompts on all
+  nodes [#sudo]_.
+
+.. _SSH: http://openssh.org/
+
+If you are not using :ref:`Btrfs <btrfs>`, enable :ref:`extended
+attributes <xattr>`.
+
+On each node, make sure the directory ``/srv/osd.N`` (with the
+appropriate ``N``) exists, and the right filesystem is mounted. If you
+are not using a separate filesystem for the file store, just run
+``sudo mkdir /srv/osd.N`` (with the right ``N``).
+
+Then, using the right path to the ``mycluster.conf`` file you prepared
+earlier, run::
+
+	mkcephfs -a -c mycluster.conf -k mycluster.keyring
+
+This will place an `admin key` into ``mycluster.keyring``. This will
+be used to manage the cluster. Treat it like a ``root`` password to
+your filesystem.
+
+.. todo:: Link to explanation of `admin key`.
+
+That should SSH into all the nodes, and set up Ceph for you.
+
+It does **not** copy the configuration, or start the services. Let's
+do that::
+
+	ssh myserver01 sudo tee /etc/ceph/ceph.conf <mycluster.conf
+	ssh myserver02 sudo tee /etc/ceph/ceph.conf <mycluster.conf
+	ssh myserver03 sudo tee /etc/ceph/ceph.conf <mycluster.conf
+	...
+
+	ssh myserver01 sudo /etc/init.d/ceph start
+	ssh myserver02 sudo /etc/init.d/ceph start
+	ssh myserver03 sudo /etc/init.d/ceph start
+	...
+
+After a little while, the cluster should come up and reach a healthy
+state. We can check that::
+
+	ceph -k mycluster.keyring -c mycluster.conf health
+	2011-09-06 12:33:51.561012 mon <- [health]
+	2011-09-06 12:33:51.562164 mon2 -> 'HEALTH_OK' (0)
+
+.. todo:: Document "healthy"
+
+.. todo:: Improve output.
+
+
+
+.. rubric:: Footnotes
+
+.. [#ssh_config] Something like this in your ``~/.ssh_config`` may
+   help -- unfortunately you need an entry per node::
+
+	Host myserverNN
+	     Hostname myserverNN.dept.example.com
+	     User ubuntu
+
+.. [#sudo] The relevant ``sudoers`` syntax looks like this::
 
-At this point, either use 1 or 3 mons, point to :doc:`grow/mon`
+	%admin ALL=(ALL) NOPASSWD:ALL
diff --git a/doc/ops/misc.rst b/doc/ops/misc.rst
new file mode 100644
index 00000000000..084debc80e7
--- /dev/null
+++ b/doc/ops/misc.rst
@@ -0,0 +1,14 @@
+===============
+ Miscellaneous
+===============
+
+.. todo:: This section should not exist. Try to reorganize, when
+   document is otherwise more ready.
+
+
+Disabling encryption
+====================
+
+Authentication is optional but very much recommended.
+
+.. todo:: write me
diff --git a/doc/ops/mycluster.conf b/doc/ops/mycluster.conf
new file mode 100644
index 00000000000..454eca63bfb
--- /dev/null
+++ b/doc/ops/mycluster.conf
@@ -0,0 +1,37 @@
+[global]
+	auth supported = cephx
+	keyring = /etc/ceph/$name.keyring
+
+[mon]
+	mon data = /srv/mon.$id
+
+[mds]
+
+[osd]
+	osd data = /srv/osd.$id
+	osd journal = /srv/osd.$id.journal
+	osd journal size = 1000
+
+[mon.a]
+	host = myserver01
+	mon addr = 10.0.0.101:6789
+
+[mon.b]
+	host = myserver02
+	mon addr = 10.0.0.102:6789
+
+[mon.c]
+	host = myserver03
+	mon addr = 10.0.0.103:6789
+
+[osd.0]
+	host = myserver01
+
+[osd.1]
+	host = myserver02
+
+[osd.2]
+	host = myserver03
+
+[mds.a]
+	host = myserver01
diff --git a/keys/autobuild.asc b/keys/autobuild.asc
new file mode 100644
index 00000000000..2a1d17dc9ef
--- /dev/null
+++ b/keys/autobuild.asc
@@ -0,0 +1,41 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1.4.9 (GNU/Linux)
+
+mQGiBE1Rr28RBADCxdpLV3ea9ocpS/1+UCvHqD5xjmlw/9dmji4qrUX0+IhPMNuA
+GBBt2CRaR7ygMF5S0NFXooegph0/+NT0KisLIuhUI3gde4SWb5jsb8hpGUse9MC5
+DN39P46zZSpepIMlQuQUkge8W/H2qBu10RcwQhs7o2fZ1zK9F3MmRCkBqwCggpap
+GsOgE2IlWjcztmE6xcPO0wED/R4BxTaQM+jxIjylnHgn9PYy6795yIc/ZoYjNnIh
+QyjqbLWnyzeTmjPBwcXNljKqzEoA/Cjb2gClxHXrYAw7bGu7wKbnqhzdghSx7ab+
+HwIoy/v6IQqv+EXZgYHonqQwqtgfAHp5ON2gWu03cHoGkXfmA4qZIoowqMolZhGo
+cF30A/9GotDdnMlqh8bFBOCMuxfRow7H8RpfL0fX7VHA0knAZEDk2rNFeebL5QKH
+GNJm9Wa6JSVj1NUIaz4LHyravqXi4MXzlUqauhLHw1iG+qwZlPM04z+1Dj6A+2Hr
+b5UxI/I+EzmO5OYa38YWOqybNVBH0wO+sMCpdBq0LABa8X29LbRPQ2VwaCBhdXRv
+bWF0ZWQgcGFja2FnZSBidWlsZCAoQ2VwaCBhdXRvbWF0ZWQgcGFja2FnZSBidWls
+ZCkgPHNhZ2VAbmV3ZHJlYW0ubmV0PohmBBMRAgAmBQJNUa9vAhsDBQkDwmcABgsJ
+CAcDAgQVAggDBBYCAwECHgECF4AACgkQbq6uIgPDlRpR0QCfZnYE8vEDX4JL3sZj
+5LvMsXruULIAnjHBAYvdlu5iMowoEMQDJlNNdscxuQQNBE1Rr28QEACKG04kxGY1
+cwGoInHVP6z1+8oqGiaiYWFflYRtSiwoUVtl30T1sMOSzoEvmauc+rmBBfsyaBb8
+DLDUIgGKv1FCOY/tfqnOyQXotPjgaLeCtK5A5Z5D212wbskf5fRHAxiychwKURiE
+eesRa7EWrF6ohFxOTy9NOlFi7ctusShw6Q2kUtN7bQCX9hJdYs7PYQXvCXvW8DNt
+7IitF7MpgMHNcj0wik6p38I4s7pqK6mqP4AXVVSWbJKr/LSz8bI8KhWRAT7erVAZ
+f6FElR2xZVr3c4zsE2HFpnZTsM5y/nj8fUkgKGl8OfBuUoh+MCVfnPmE6sgWfDTK
+kwWtUcmL6V9UQ1INUJ3sk+XBY9SMNbOn04su9FjQyNEMI/3VK7yuyKBRAN7IIVgP
+2ch499m6+YFV9ZkG3JSTovNiqSpQouW7YPkS+8mxlPo03LQcU5bHeacBl0T8Xjlv
+qu6q279EliHul4huKL0+myPN4DtmOTh/kwgSy3BGCBdS+wfAJSZcuKI7pk7pHGCd
+UjNMHQZmPFbwzp33bVLd16gnAx0OW5DOn6l0VfgIQNSJ2rn7WZ5jdyg/Flp2VlWV
+tAHFLzkCa+LvQ5twSuzrV/VipSr3xz3pTDLY+ZxDztvrgA6AST8+sdq6uQTYjwUQ
+V0wzanvp9hkC5eqRY6YlzcgMkWFv8DCIEwADBQ//ZQaeVmG6T5vyfXf2JrCipmI4
+MAdO+ezEtWE82wgixlCvvm26UmUejCYgtD6DmwY/7/bIjvJDhUwP0+hAHHOpR62g
+ncoMtbMryHpm3FvYH58JNk5gx8ZA322WEc2GCRCQzrMQoMKBcpZY/703GpQ4l3RZ
+7/25gq7ANohV5zeddFQftc05PMBBJLU3U+lrnahJS1WaOXNQzS6oVj9jNda1jkgc
+Qni6QssSIMT6rAPsVbGJhe9mxr2VWdQ90QlubpszIeSJuqqJxLwqH8XHXZmQOYxm
+yVP9a3pFqWDmsNxDA8ttYnMIc+nUAgCDJ84ScwQ1GvoCUD1b1cFNzvvhEHsNb4D/
+XbdrFcFGwEkeyivUsojdq2YnGjYSgauqyNWbeEgBrWzUe5USYysmziL/KAubcUjI
+beRGxyPS6iQ2kbvfEJJPgocWTfLs5j61FObO+MVlj+PEmxWbcsIRv/pnG2V2FPJ8
+evhzgvp7cG9imZPM6dWHzc/ZFdi3Bcs51RtStsvPqXv4icKIi+01h1MLHNBqwuUk
+IiiK7ooMlvnp+DiEsVSuYYKBdGTi+4+nduuYL2g8CTNJKZuC46dY7EcE3lRYZlxl
+7dwN3jfLPRlnNscs34dwhZa+b70Flia0U1DNF4jrIFFBSHD3TqMg0Z6kxp1Tfxpe
+GOLOqnBWrr0GKehu9CGITwQYEQIADwUCTVGvbwIbDAUJA8JnAAAKCRBurq4iA8OV
+GqKjAJ9QA7mNQs0Rko5VGYA+xjPokf0yVACfQMEFVHxT/k9+awAbBFLR3D0jjJ4=
+=PYuQ
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/keys/release.asc b/keys/release.asc
new file mode 100644
index 00000000000..a4e8441c01d
--- /dev/null
+++ b/keys/release.asc
@@ -0,0 +1,140 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1.4.9 (GNU/Linux)
+
+mQGiBEtYu+8RBAC3y30BJKKV67fz6UhB0cQm1mdfYl1gBMPQv5FuX9sai7ZuzAWh
+83uGXByCp6foBp33xbusVqtM9YBrswBUfR0K5SJiWrp07e/vUz6MIORakPhQ4ggK
+M+pAvg+myQa9FdiAdFN1Pm7QXzziLSJ6hkNtmytkSxNQ1p/srx4PT2WHGwCg6bTk
+i7mUeTw+l3MQtudEAXRwb2sD/isdW5KdWtok9UwncQlfsdv8C/coHeGLHBxvIEdh
+BGHUD6DnWEIb4XbCaIBXpHu0aztm9PLE160YN8dYRMqnuwNmU7RWp0b97g641xi1
+YW/+ShBVrfUjHlYjO7vZpIJlO6rnTQB6bLuKrZ0ZG5KdMnQjbURnCzMtFPMdp0IV
+P9EgA/0YAxL1AM4aTeSYLfyDSRHe5Nvv8Y0wPV8cHKQXacgP5042riyjCxY9vSkQ
+yiLqKvC+ZBr93vRnLdS4UnDiOlEW1vqhbT8pRhAM7n8ngGRLWbrG7nhRo27eJVPr
+0r+srwCVR+udfBS1RTnzHtXMDrpLY9GS59r9x/UquwHHsfyU1bQdU2FnZSBXZWls
+IDxzYWdlQG5ld2RyZWFtLm5ldD6IYQQTEQgAIQIbAwIeAQIXgAUCS1i+bgULCQgH
+AwUVCgkICwUWAgMBAAAKCRDaRCDtKImVyHpGAJ9ThO25++a5RK/HloILHOZfC2nc
+BgCfVpv7PqHlvJcNLGETMjK9OjdzK12IRgQQEQgABgUCS1i/yQAKCRDOlFtnvv7s
+ZKPLAKDL0hF9FEGmRCC4E7K2Og1E7ju76ACfSOFFdsso0wsvmLIzPeOuwixZBEaI
+YQQTEQgAIQUCS1i77wIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRDaRCDt
+KImVyAJcAKCPXc3aC7PAHuG+tJGc+ozvobkpJgCg22rH3JRr9mtYdX6RzIJCde24
+jliIRgQQEQIABgUCS4NJBAAKCRD0mo+yK5g/WDl+AKCopuwEut14Uz7FksJ/LUVT
+rveKWACfbUCPrKIo7hpHHXm0w2eBsTRs/TOJAhwEEAECAAYFAkuDSRAACgkQk0OA
+6Dag0KrGAQ/+IHS8fUApyCuomo318MPpHXQ1wJlnysMgmtvTJDTpsWM51iCYB3fG
+Op2WHlddkRJtZkrK0Yxf6koK+HaBHj4vlQgDkGwdd9N1xqQz8N0ztrwtDOwlffdx
+TVzoqaAymSvb37Eptcet0Ru+aQB6hmyFkj02SNfJ4ncPTFgabDKpXMtsZAcbBw5I
+8UHXYw25XruE4M9yF4UGilySZ9wPHm5Na/F3XM/gAJS9O6+3uiB5TbXlPv1jnFbJ
++3bSb+2zz97TrqE2zvfvFd5kiUXbyYkHNvt/HJi8MQFPEeBmq6Le+dQ41jtpa0Zs
+dqm6EoJATeVRuzrTOk+7+FVlPZB5ZKQWErOavWln6VDaBogE0IH2KIdxuaC7LWLG
+mBcNQZGvq8Qosx60jLGYCtzdftr7pZzYl5QRsqPL7pAiiKds15HJzu4beSewPe9u
+QZ1MFQ249rHn170TNDE2LsUQp4JeO5E00MExyxP6EYB3Vo6mFWoSf377GrIHbVkd
+5ePPmUN3KAsK7omUwTKQpsbtcoz8oyMOgSFIkYA/SyI18riRKviL2eJQ2jI3a0M7
+G3SB2lVueyF/gf7E/+jNjeWMImd7glQ8s7v+Mbiuq7lszLljwgPsahCiVgvDS1pN
+PDx8E2/VDE437RZpCmLae/JTwL61BsY9F4bB0Q4z9sEQ7IFcx1xHdfiIRgQQEQIA
+BgUCS4NNKQAKCRANjRNR/daw26C8AKC6+goG/2EJ31oIg7nL/Qf1BwbHiwCfZbXY
+AqCRBDWQmky8Upqc48WCgv2JAhwEEwECAAYFAkuDgOUACgkQqQNBJhIwH6LhuhAA
+lFgek4UpgvuM5+3wysojdaD207WSadLAUU2uv+lSSGFaOFtEk0u41LbTSUjiFwkJ
+22s7qCb4eCWPO8VKpmjlp9SWyxDn3OMRR34idrle8bEPeIJBiwkUBWO3QpRfkrbL
+IuK5g9hHMKtUlKLXiFxc63RXNdekaygDlJuuvjLiN6IYvvDJK9S1Iy+PlodPjckR
+J3GtxOmD2k0TFqXrfEO6qdzNVM6XQJ01irBrl1r8PYdMdk+sRjJy6GXO0KBjFNl7
+JZw71hEyvVEzquQqy5F6xvfr1Um5Y7UHpjra9rA33qn7T2IHSAAV9RF3j1y4Bmsc
+o5BnMi1TmQa9zOC5oAszDPFxVfbxpHeX7v7QAiUdaCf9uw49fyOd8k0PO7Notasn
+DnZPMwqINBHaTXlIOtVYiXV3FUlARxpLrqZ7JwdcIXiqRCqRqv7NlDoob1vBE5Ew
+mLnAAdH6Lj/QIMkeEJzhR6k2KRPXq4SB/ggUmuzw/jAWdAyUr3kOXkPAwOKLMLh0
+FB3DK5QkbWkYGGm4zYBmVBSpcUt5dLUqX9O0qhKA1F287y8eF/E7ArCaoToVNAZx
+6swnNdQNgOqKASRikLANJB0Qof1iSVh6dMbJUR7D8iHuowcSPDP/1JGNn7VFDeo/
+QsQOaAtL+nhJlFpqIcwaTTXO7KTN0sHGAxlsZQtkppKIRgQTEQIABgUCS4S7NwAK
+CRAjcDdYmlP8Ku40AJwL1Hh/05el/VYQe/JgDB17UJVawACfX9vGPaJIvj5lpYic
+OGn4VnNlQJ2IRgQTEQIABgUCS4OP4AAKCRAR6PE9C66WSJotAJ9cKKaTbaHjcjvM
+YOnAhhuXcqD9KwCeMwxZTlfAVMEGyWcq5NOvnqAfzLqIRgQTEQIABgUCS4SxkwAK
+CRAhGTfVLP6ynenjAJ0XOHfVA1uAulVfHNtR4F8NGb6+pwCdEuC9RiBjNSV+vdCQ
+lFsFganlnNmIRgQTEQIABgUCS4TAdQAKCRD/XLkM/+lAh7PHAJ0caUFxh9JVKxNU
+pSXT3usvz/49wgCfYHTNkZYaTyML+zSZ3kpf2/UiPPGJARwEEwECAAYFAkuEy90A
+CgkQtlATpJN1Xgh0NAf/drKpNHibzd43ipM1KGA1CJI9QCU+pH8P37hf7KbpDBMz
+XKzsYzijuqUtlt+bokKpHj17OUpdGhvk3/gdHTBqjoj+2S4SZOrfhbbczCP9YgxX
+fHtPWVpxQDliND3B+N2LnwrOGBVtfL2QlHY5pfTevzOZnvo2ivNQ/5NqVjWc1zxe
+2yIhDFNaTfgr+GP8wO3G8vzY+kdPVCZA1SdBRCvMzJakFICpwXex/yi1CR89DtMV
+5eqD5pgtbW3z8J75hF+x/FOs87EIwtUUBqq5XKE81VDPF5d1gO/AQ5Aw1JYvFgL/
+dFl7BAc4yMB6n61Hnz7ykTtUuzcYDPNF8tNxjJ5BBIkBHAQQAQIABgUCS4YYvAAK
+CRCGiaJaiNv9F+PwCAC/1mCxwgnQ/rqEBCCIZ7YnIGDdoyLB+samhYNGbYRMX9HS
+8i6/bX/G6U9pbsRhHXdLPUrU5qpFfjec7RJg/+QJReHmCKbG+Ed4SvZDTlR/R/VB
+vOEyitV1+jnPyPJYNjvRC7E6IPL8n8iRPsGf5Pj/mAwPQIugu23TM/zcUGaWpkJe
+V7hElpXQsJx9pzvi/MC51L/wE0N+TClZGMoIIuCJrfkWrgZ/P4rtdlnH8fdTkEgj
+P0KGMZ1mevk4LLrzUqR9+TAmEzOnSNK2BmV49Dz7cGNyUBJrT7uTdor3KVUNHSjH
+C3kLm1gHxiHXrHeSUU/lwG1OQvijAGTqzM7qI3KfiEYEExECAAYFAkuGaakACgkQ
+O/Dos2Fb9DSB8wCfWwOU744GNjWzho7/dJbetsZFq74AnjFJcAOm6AhnS56UdJMx
+QCJBPK5WiEYEExECAAYFAkuHPcwACgkQ9e/J5tUGicVt2ACfUkYQe1ge6/Auq8nE
+KCkig2SCZJoAoJtdSPKPDLb5oyhnyl9VzRnAnSmmiEYEExECAAYFAkuHskcACgkQ
+LBV88STLCDnYvQCfRB7Bfd54qXnhoGKE/DCZ5pqNEj0An34Xc/H1gri/J74ajomK
+b1ZgI/dgtBtTYWdlIFdlaWwgPHNhZ2VAa2VybmVsLm9yZz6IYQQTEQgAIQUCS1jH
+/gIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRDaRCDtKImVyIX/AJwNXg/7
+uBgx6wnd0cfZtYuxFKNwmwCfQBEAL6Ml9lams5KOTNlk6Rp1kKeIRgQQEQIABgUC
+S4NJBAAKCRD0mo+yK5g/WCZgAKDCGALmU3MINVnZ7xWhgeX2u447rACeKZ2gkcYf
+Ctv/6BhsIs+mQMDVQl6JAhwEEAECAAYFAkuDSRAACgkQk0OA6Dag0KpWGg/6A7f1
+yveGxH1vOdJHWFEW6OoMSpAEEhmV61Xl/ExPyqn+0Z8nHa9mKrkv8HP53Grfk7Vz
+QjoZPYWprIJsljuFSig+Dr5UaW8gyiFtzbICZXRmGI4yR1ZI0gGbWfx59Ypavd4U
+gWym7SCFOzSvwh9VyymB0DANmEp6MHH7KMnDOnbxu9TbDg2ShWVyR+DKNg9KZUM/
+AsT+vNJubKwo7wd9v6cJxVnZ1RO3Iyy3119Q0N0FP28YoijaWrIpguoBPgAeqkLn
+KgIVme+QP7cMQFxBqBwtSu7Jo5Jjo33CvmfXftUCV/JQCkbcbPR5thhh6wt/4VfM
+/VvQO3N03joR+brNEshS9YqdrVSl1IYdUJ9HR098Nkgjx7ZZrSYuCF3CeaxXsFgv
+xKPmZ34ZalnvvJsp2SE2vYS0NqH56fQ+pxCacO5u/OUXBvzFn/Ih/C7hGdPwUWnz
+t4lRo44Hb2uXGInSVIDOgxszvnToOy55yZe/sAUYE/HIjmjvGga3Qx4UH6kT0k/Q
+3YncOC5Y2eckPefwNvvtcv8oet2/d1zX81/R1CMCJJnuVDvI570gFGZd2/yHP8uH
+vP4aBFuQfoCeKsWx13+WQ4FCinwK1clnVeMxAY9CufO6OdSPy2rg8PECBaM8S3zV
+0MLO2kozQU7GCmTTZ3r7GcXCR2xeayFahHC5ZwGIRgQQEQIABgUCS4NNIAAKCRAN
+jRNR/daw232FAKCcVRTO27X2OIm+UFiIDfuo32ACRwCgvS0gMza5jH4H0vikeyWp
+c7Nn566JAhwEEwECAAYFAkuDgOAACgkQqQNBJhIwH6L9Mg/8DJ2F/CjNWqiI5DAC
+aoKEF4xszO/gqd15hnGe/OEgszMU5etm9EAKAW4a+2i2XgzdmxvRKudJS69Nze1D
+y1yzOB6688raRTNc3bd33Sd/vtAht1hwjI1UitxXr/71jxU+lEiXizFyU4gF1Omc
+LuyOj8Ap2m4eR6GlLu/MCTTLoBYGPGpiMQQ4kA6A1wRYQkemBnYozn2wUw12EuUs
+znaLClexhG2h/V6JPXdn/lTfeMQdLG/+qzVdN64ZvuLAGZHcmMjrdsrcSSvVLfYP
+5Hn43NO0W+mntQGyCgvllYUeVGhXHGnwRZnPiIm+vtkRtHmyv/T6ru4XUknGr7n8
+6LF4CEN2hCwMrFeb5cdYCPEnEgpON9f0KU+E96wxHVH844iXg5fDa98P2MpDEEQB
+Psvh11VzPpUQwdrPxfus+uf26PhOAxsyNAOt4VMtYtT6LxADf4tQmkh3OkmSe/1Q
+0oAm2M96I1Pf8iDdSG1xRISxd7zPbQZefNXwqMTJEt7YdqVJPTtAejmIcRFV5ypn
+XLvteeBYFZO5YWGOOeLedYOVRB7KVitYmWQRTQk8zZJyvKuF+/nmuseuczT31hlu
+JxJAjjbZ0PxSSkrNuwqiLqj8bMMKkXC2423g4SEJF6VzNKwUtKuGlKNLxCyOfNFQ
+IX4AlD/HgaWHd1giZFJMhwm94aGIRgQTEQIABgUCS4OP3QAKCRAR6PE9C66WSGDp
+AJ0dHSfxYPlUBB3f2b/iIAK4kbxmnwCbB7wXSFR1Rse5AnXOHa3adu3lnZaJARwE
+EAECAAYFAkuGGLwACgkQhomiWojb/ReoCAgA4SHL0WIarFR0ssaXh0lNO5DEY9xr
+qio/H0RN32BtkLSoYTxfQqwRIWTacfjOEX6NUq5oypE/vEnbFhDOPHqf+PBKp97D
+wNjS/07KyhrCA8LDbCQRHpmRNDimPwqFdOahXEctoYKFu+GCqfvism1AIHv26ZKA
+hSHjkMTfmDJeQbnvWgSRHU5bM2q+eo3Mt42fRbs+Z+QPYsSs0vDxEmcEkwP26oLT
+4UAueDVHlbyKEv3CwqUJu2uXYAmYOXD3AlaCaPfD0833Wk/wcTdwK9sj9P0O8/lv
+NQ8TSFkmbk9jkkKerxm2IbO5iz9STWDjauO2UvNWXeS5OV+fvAfargsyPrkEDQRL
+WLvvEBAAp215RD7U20GFb0VqZ1EmBJJlCFJALyxmPpxAjtabxZ5jjIwSSbcVSPt5
+mi3IvLpT56X5jMkUQXj+g5NcPFcquNJn4x8p3rTreiS6x3M/vuWYbZPMeCtACfQI
+bRDiix7eFX5aY1C2AwQAludAYPMmuiWS6HC8OttSJSrusJeHbSlVPsK46xX3c706
+HHJLXQuDP/in496KOZMVmWH8xhqAohdHrJyMNO4rj71R7pigHZT5YryuJu4fW95K
+9LrMkWG1J49CcLKiz4jiDfVSEZDBvDZZ9XkHqFbIKBVSWJnJr/k76myalTMpZGTq
+3RfdZ/soIeNU+yGscCGKKZvh2845oy6LJNotRMI0oy40bcfjzA7BTTipeZ9ChPRs
+H8RoQH1eoPREwLOqtk77gXKDgQgpWqQeRDt5+vF8lapwcnIpt95LKAM2epp7FZZ0
+Tv7lywU2SeFTSi5Gi7jYtWTtXzwCepSl+VXZ5ai3ZuQiCV0b38Ijds56SKkfw/ZU
+y8cZzs55SOAdamBTB0hB+6IVAKW0tVSTVCEBYLr+xu+Cfpj8tWEsFVs5e7t3EwBS
+sUF3WTLD6rjPNtSwRUhwxRkh41kKzQhUBteyK+ruk0Z5mNBxOphKqIGeMB5ry32S
+i/nnQiKNHaqX7t0RrTOlbxf8n0XvfUoEr4quRGrU1TzAGdoetF8ABR0P/j5FU5Iu
+rlzhJS9SiIu6pYqkBfk+fKvtEFGXYQfIsJQSiCmj6YHQs1WVvnqc8TeiwwvH0J8D
+fwDplkhUJ0VNz5KtehN8/uRhmJSpMc0Up0x/VajN2q7k5BNal0GC1/rOrYFTsdyi
+YvGl50D4dG5M7VmrYmaXTKZ1HeC0vTqcH12a0lGEB9IYjma1Y6MWxUr79wWPS361
+lsxuwdXpKoHKQfT8LUYrXMLmIuuCtZvqPHJ0HblIp9UBNwTfqL/8g3A3KyEdT6iM
+lF3cUPsigqISPVLFocQ3Q2LnypZkE3XoqM/JsK4sFfFwg4XDaIsORt+YG66vsZEt
+STwM44wxyZm5pgjloKsdY+UwP1w/OBPhhNZq/QPCsL+pKbdbt3Nx9CO84GiyoxhQ
+zVLKGmTFCvilyW3nauojV4WM+Evu3dVS56tq+oNk9ttOH+5UsLcvMiaA/hS3q/O2
+jf7wS8dHOdTn1elHqAJPUbe/HlReTrAU1ll4rujKWi8ZQabd4RoOTjfIPxAOHszO
+pmFwHrpuL/7jH8TGcNK/6GbVeOVFlFU40klJIJ5O5He91IqSFyZoTMqFqloeGxuC
+JJV+uVNEPEyM5LCLdag8BvCnfhZZbEiUd2A5th2Pc+yKwxZ3s/Np7HfYkkug5JZ5
+75g6DNinZfPFGga5g6DWgkdPyoxrS4zhU4qoiEkEGBEIAAkFAktYu+8CGwwACgkQ
+2kQg7SiJlchkaACglP00AlVap07y7/0Ul7XNS+C/seUAnRp6f33KAyusDGes/tMH
+xG6RQNuruQINBEtYvpkBEACvQ/WYV1nJRqyq/ffX7xIAxTGS4t80ra2bOaAURPNX
+aRDOPK15Toz4/Ct89eiEohA8xLsrpssC5cEx7oS5g/XBfYDhOXnwtNf/LAgp44GY
+jhTjP/Oc+1z3Q3SJvgmCJbEgKtAlzE7nXYcagjvNUIiQa9FgJ5M58vN284Evfm5C
+omIbduiJEAWMjJk8is8gNoX8zCdX0TIoaznOiJ1g3dsuUygZ/oZ4UFx6Ph5Lsaaq
+XGg6iBHA730Wk3xxOG4ndlkWuLbvKd0vb2I/umQOEcuPoxR6B1NLB2f6GulEY9/P
+JCfdPX3NbZgJoI8nS7GqDaJB45DXiuAU02W1kHU9vTQOKp6UJKUiff4NF7zpefef
+UNpMCWeyYKGyS3U4Bl17pEI1n9iH1oNyrLQljDlAGjhIAlT/oF/Qbm7nNl/iTO30
+HMXIuXy8yFfMtjolEhOiwRCCK+ooQ000BGwM17yppVNa1i6lDiiHvG3FdVoV1HF1
+hOgHXS4j6fbWmSOwKewKHK2+GBJBxppuedLzuxCtxmQOwFBhAoiJokCXDo1nGE85
++/dJdyN/dZyBCQ3NyC8A7BB6nK3uUWO/b4wwWZjpxpyrHs6+SgrKIkj12rnaQuAI
+SNbVD++y/QJcUVXlqsJkVj4v04aWAwdNWlOrYT770xUzc8xUmPbsd89QgLfqyMFo
+hQARAQABiEkEGBEIAAkFAktYvpkCGwwACgkQ2kQg7SiJlcgRgwCghJWbAL8wv7Q0
+tS2vGOq9DWh1N28An2yMBPSf/WaHuo+mtsKqMk3enf6R
+=6gus
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/man/mkcephfs.8 b/man/mkcephfs.8
index f066d789462..51fed771f90 100644
--- a/man/mkcephfs.8
+++ b/man/mkcephfs.8
@@ -61,7 +61,7 @@ executing commands via SSH.
 Use the given conf file instead of the default \fI/etc/ceph/ceph.conf\fP.
 .TP
 \fB\-k\fI /path/to/keyring\fR
-When \fB\-a\fR is used, we can special a location to copy the
+When \fB\-a\fR is used, we can specify a location to copy the
 client.admin keyring, which is used to administer the cluster.  The
 default is \fI/etc/ceph/keyring\fP (or whatever is specified in the
 config file).
diff --git a/qa/workunits/false.sh b/qa/workunits/false.sh
new file mode 100644
index 00000000000..8a961b329d5
--- /dev/null
+++ b/qa/workunits/false.sh
@@ -0,0 +1,3 @@
+#!/bin/sh -ex
+
+false
+\ No newline at end of file
diff --git a/src/Makefile.am b/src/Makefile.am
index e53b68def91..5139235121e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -317,12 +317,12 @@ my_radosgw_src = \
 	rgw/rgw_access.cc \
 	rgw/rgw_op.cc \
 	rgw/rgw_rest.cc \
-	rgw/rgw_rest_os.cc \
+	rgw/rgw_rest_swift.cc \
 	rgw/rgw_rest_s3.cc \
 	rgw/rgw_common.cc \
 	rgw/rgw_cache.cc \
-	rgw/rgw_os.cc \
-	rgw/rgw_os_auth.cc \
+	rgw/rgw_swift.cc \
+	rgw/rgw_swift_auth.cc \
 	rgw/rgw_formats.cc \
 	rgw/rgw_log.cc \
 	rgw/rgw_multi.cc \
@@ -885,6 +885,7 @@ libmds_a_SOURCES = \
 	mds/Dumper.cc \
 	mds/Resetter.cc \
 	mds/MDS.cc \
+	mds/flock.cc \
 	mds/locks.c \
 	mds/journal.cc \
 	mds/Server.cc \
@@ -1118,8 +1119,8 @@ noinst_HEADERS = \
 	include/rbd/librbd.h\
 	include/rbd/librbd.hpp\
 	logrotate.conf\
-	mds/flock.h\
 	mds/inode_backtrace.h\
+	mds/flock.h\
 	mds/locks.c\
 	mds/locks.h\
         mds/Anchor.h\
@@ -1333,11 +1334,11 @@ noinst_HEADERS = \
 	rgw/rgw_log.h\
 	rgw/rgw_multi.h\
 	rgw/rgw_op.h\
-	rgw/rgw_os.h\
-	rgw/rgw_os_auth.h\
+	rgw/rgw_swift.h\
+	rgw/rgw_swift_auth.h\
 	rgw/rgw_rados.h\
 	rgw/rgw_rest.h\
-	rgw/rgw_rest_os.h\
+	rgw/rgw_rest_swift.h\
 	rgw/rgw_rest_s3.h\
 	rgw/rgw_tools.h\
 	rgw/rgw_bucket.h\
diff --git a/src/client/Client.cc b/src/client/Client.cc
index bb2cfb90c43..c5fa15abf79 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -2948,11 +2948,13 @@ void Client::handle_cap_import(Inode *in, MClientCaps *m)
 		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
 		 CEPH_CAP_FLAG_AUTH);
   
-  // reflush any/all caps
-  if (in->cap_snaps.size())
-    flush_snaps(in, true);
-  if (in->flushing_caps)
-    flush_caps(in, mds);
+  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
+    // reflush any/all caps (if we are now the auth_cap)
+    if (in->cap_snaps.size())
+      flush_snaps(in, true);
+    if (in->flushing_caps)
+      flush_caps(in, mds);
+  }
 
   if (m->get_mseq() > in->exporting_mseq) {
     ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
@@ -3279,7 +3281,7 @@ void Client::unmount()
 
   // NOTE: i'm assuming all caches are already flushing (because all files are closed).
 
-  //clean up any unclosed files
+  // clean up any unclosed files
   if (!fd_map.empty())
     std::cerr << "Warning: Some files were not closed prior to unmounting;\n"
 	      << "Ceph is closing them now.\n";
@@ -3287,7 +3289,8 @@ void Client::unmount()
     int fd = fd_map.begin()->first;
     assert(fd_map.count(fd));
     Fh *fh = fd_map[fd];
-    _release(fh);
+    lderr(cct) << " destroying lost open file " << fh << " on " << *fh->inode << dendl;
+    _release_fh(fh);
     fd_map.erase(fd);
   }
 
@@ -4823,6 +4826,29 @@ Fh *Client::_create_fh(Inode *in, int flags, int cmode)
   return f;
 }
 
+int Client::_release_fh(Fh *f)
+{
+  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
+  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
+  Inode *in = f->inode;
+  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
+
+  if (in->snapid == CEPH_NOSNAP) {
+    if (in->put_open_ref(f->mode)) {
+      _flush(in);
+      check_caps(in, false);
+    }
+  } else {
+    assert(in->snap_cap_refs > 0);
+    in->snap_cap_refs--;
+  }
+
+  put_inode( in );
+  delete f;
+
+  return 0;
+}
+
 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid) 
 {
   int cmode = ceph_flags_to_mode(flags);
@@ -4866,9 +4892,6 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid)
   return result;
 }
 
-
-
-
 int Client::close(int fd)
 {
   ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
@@ -4878,36 +4901,12 @@ int Client::close(int fd)
 
   assert(fd_map.count(fd));
   Fh *fh = fd_map[fd];
-  _release(fh);
+  _release_fh(fh);
   fd_map.erase(fd);
   ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
   return 0;
 }
 
-int Client::_release(Fh *f)
-{
-  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
-  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
-  Inode *in = f->inode;
-  ldout(cct, 5) << "_release " << f << " mode " << f->mode << " on " << *in << dendl;
-
-  if (in->snapid == CEPH_NOSNAP) {
-    if (in->put_open_ref(f->mode)) {
-      _flush(in);
-      check_caps(in, false);
-    }
-  } else {
-    assert(in->snap_cap_refs > 0);
-    in->snap_cap_refs--;
-  }
-
-  put_inode( in );
-  delete f;
-
-  return 0;
-}
-
-
 
 // ------------
 // read, write
@@ -6735,7 +6734,7 @@ int Client::ll_release(Fh *fh)
   tout(cct) << "ll_release" << std::endl;
   tout(cct) << (unsigned long)fh << std::endl;
 
-  _release(fh);
+  _release_fh(fh);
   return 0;
 }
 
diff --git a/src/client/Client.h b/src/client/Client.h
index c094082c92f..f4a5f1404f7 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -467,6 +467,7 @@ private:
   void _ll_drop_pins();
 
   Fh *_create_fh(Inode *in, int flags, int cmode);
+  int _release_fh(Fh *fh);
 
   int _read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl);
   int _read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl);
@@ -491,7 +492,6 @@ private:
   int _removexattr(Inode *in, const char *nm, int uid=-1, int gid=-1);
   int _open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid=-1, int gid=-1);
   int _create(Inode *in, const char *name, int flags, mode_t mode, Inode **inp, Fh **fhp, int uid=-1, int gid=-1);
-  int _release(Fh *fh);
   loff_t _lseek(Fh *fh, loff_t offset, int whence);
   int _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl);
   int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf);
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 9ea7e16d59c..aeaceec3f10 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -317,7 +317,7 @@ static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent,
 static void ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
 {
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
-  Fh *fh;
+  Fh *fh = NULL;
   int r = client->ll_open(fino_vino(ino), fi->flags, &fh, ctx->uid, ctx->gid);
   if (r == 0) {
     fi->fh = (long)fh;
@@ -443,7 +443,7 @@ static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name,
   const struct fuse_ctx *ctx = fuse_req_ctx(req);
   struct fuse_entry_param fe;
   memset(&fe, 0, sizeof(fe));
-  Fh *fh;
+  Fh *fh = NULL;
   int r = client->ll_create(fino_vino(parent), name, mode, fi->flags, &fe.attr, &fh, ctx->uid, ctx->gid);
   if (r == 0) {
     fi->fh = (long)fh;
diff --git a/src/common/config.cc b/src/common/config.cc
index c75e066b51f..f618d9d2e40 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -433,6 +433,11 @@ struct config_option config_optionsp[] = {
   OPTION(rgw_socket_path, OPT_STR, NULL),   // path to unix domain socket, if not specified, rgw will not run as external fcgi
   OPTION(rgw_op_thread_timeout, OPT_INT, 10*60),
   OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 60*60),
+  OPTION(rgw_thread_pool_size, OPT_INT, 100),
+  OPTION(rgw_maintenance_tick_interval, OPT_DOUBLE, 10.0),
+  OPTION(rgw_pools_preallocate_max, OPT_INT, 100),
+  OPTION(rgw_pools_preallocate_threshold, OPT_INT, 70),
+  OPTION(rgw_log_nonexistent_bucket, OPT_BOOL, false),
 
   // see config.h
   OPTION(internal_safe_to_start_threads, OPT_BOOL, false),
diff --git a/src/common/config.h b/src/common/config.h
index 1d4bfbfc9be..b072edbff32 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -567,6 +567,11 @@ public:
   string rgw_socket_path;
   int rgw_op_thread_timeout;
   int rgw_op_thread_suicide_timeout;
+  int rgw_thread_pool_size;
+  double rgw_maintenance_tick_interval;
+  int rgw_pools_preallocate_max;
+  int rgw_pools_preallocate_threshold;
+  bool rgw_log_nonexistent_bucket;
 
   // This will be set to true when it is safe to start threads.
   // Once it is true, it will never change.
diff --git a/src/cosd.cc b/src/cosd.cc
index 82fa291901c..e2ce3cc347d 100644
--- a/src/cosd.cc
+++ b/src/cosd.cc
@@ -202,15 +202,13 @@ int main(int argc, const char **argv)
     exit(0);
   }
 
-  int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal);
-  if (err < 0) {
-    derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
-	 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
-    exit(1);
-  }
   if (convertfilestore) {
-    derr << "Converted Filestore " << g_conf->osd_data << dendl;
-    exit(0);
+    int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal);
+    if (err < 0) {
+      derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
+	   << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
+      exit(1);
+    }
   }
   
   string magic;
@@ -302,6 +300,14 @@ int main(int argc, const char **argv)
   // Leave stderr open in case we need to report errors.
   global_init_daemonize(g_ceph_context, CINIT_FLAG_NO_CLOSE_STDERR);
   common_init_finish(g_ceph_context);
+
+  int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal);
+  if (err < 0) {
+    derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
+	 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
+    exit(1);
+  }
+
   MonClient mc(g_ceph_context);
   if (mc.build_initial_monmap() < 0)
     return -1;
diff --git a/src/doc/object_store.dot b/src/doc/object_store.dot
deleted file mode 100644
index 08328f07a24..00000000000
--- a/src/doc/object_store.dot
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Rough outline of object store module dependencies
- *
- * build with
- * dot -Tpng < ./object_store.dot > object_store.png
- */
-
-digraph object_store {
-	size="16,16";
-	node [color=lightblue2, style=filled, fontname="Serif"];
-
-	"testrados" -> "librados"
-	"testradospp" -> "librados"
-
-	"rbd" -> "librados"
-
-	"radostool" -> "librados"
-
-	"radosgw_admin" -> "rgw"
-
-	"rgw" -> "librados"
-
-	"radosacl" -> "librados"
-
-	"librados" -> "objecter"
-
-	"ObjectCacher" -> "Filer"
-
-	"dumpjournal" -> "Journaler"
-
-	"Journaler" -> "Filer"
-
-	"SyntheticClient" -> "Filer"
-	"SyntheticClient" -> "objecter"
-
-	"Filer" -> "objecter"
-
-        "objecter" -> "OSDMap"
-
-        "cosd" -> "PG"
-        "cosd" -> "ObjectStore"
-
-        "crushtool" -> "CrushWrapper"
-
-        "OSDMap" -> "CrushWrapper"
-
-        "OSDMapTool" -> "OSDMap"
-
-        "PG" -> "ReplicatedPG"
-        "PG" -> "ObjectStore"
-        "PG" -> "OSDMap"
-
-        "ReplicatedPG" -> "ObjectStore"
-        "ReplicatedPG" -> "OSDMap"
-
-        "ObjectStore" -> "FileStore"
-
-        "FileStore" -> "ext3"
-        "FileStore" -> "ext4"
-        "FileStore" -> "btrfs"
-}
diff --git a/src/include/object.h b/src/include/object.h
index bc56445fd1e..706caa6bffc 100644
--- a/src/include/object.h
+++ b/src/include/object.h
@@ -261,14 +261,15 @@ namespace __gnu_cxx {
 
 struct hobject_t {
   object_t oid;
+  string key;
   snapid_t snap;
   uint32_t hash;
 
   hobject_t() : snap(0), hash(0) {}
-  hobject_t(object_t oid, snapid_t snap, uint32_t hash) : 
-    oid(oid), snap(snap), hash(hash) {}
-  hobject_t(const sobject_t &soid, uint32_t hash) : 
-    oid(soid.oid), snap(soid.snap), hash(hash) {}
+  hobject_t(object_t oid, const string &key, snapid_t snap, uint32_t hash) : 
+    oid(oid), key(key), snap(snap), hash(hash) {}
+  hobject_t(const sobject_t &soid, const string &key, uint32_t hash) : 
+    oid(soid.oid), key(key), snap(soid.snap), hash(hash) {}
 
   /* Do not use when a particular hash function is needed */
   explicit hobject_t(const sobject_t &o) :
@@ -279,9 +280,11 @@ struct hobject_t {
   void swap(hobject_t &o) {
     hobject_t temp(o);
     o.oid = oid;
+    o.key = key;
     o.snap = snap;
     o.hash = hash;
     oid = temp.oid;
+    key = temp.key;
     snap = temp.snap;
     hash = temp.hash;
   }
@@ -291,8 +294,9 @@ struct hobject_t {
   }
 
   void encode(bufferlist& bl) const {
-    __u8 version = 0;
+    __u8 version = 1;
     ::encode(version, bl);
+    ::encode(key, bl);
     ::encode(oid, bl);
     ::encode(snap, bl);
     ::encode(hash, bl);
@@ -300,6 +304,8 @@ struct hobject_t {
   void decode(bufferlist::iterator& bl) {
     __u8 version;
     ::decode(version, bl);
+    if (version >= 1)
+      ::decode(key, bl);
     ::decode(oid, bl);
     ::decode(snap, bl);
     ::decode(hash, bl);
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 8389f98972a..886e04e8636 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -37,8 +37,7 @@ extern "C" {
 typedef void *rbd_snap_t;
 typedef void *rbd_image_t;
 
-typedef int (*librbd_copy_progress_fn_t)(uint64_t offset, uint64_t src_size,
-					 void *data);
+typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr);
 
 typedef struct {
   uint64_t id;
@@ -73,7 +72,7 @@ int rbd_resize(rbd_image_t image, uint64_t size);
 int rbd_stat(rbd_image_t image, rbd_image_info_t *info, size_t infosize);
 int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx, const char *destname);
 int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
-	    const char *destname, librbd_copy_progress_fn_t fn, void *data);
+			   const char *destname, librbd_progress_fn_t fn, void *data);
 
 /* snapshots */
 int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps, int *max_snaps);
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index 956f072576e..6a685cccf8b 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -41,12 +41,12 @@ namespace librbd {
 
   typedef rbd_image_info_t image_info_t;
 
-class ProgressContext
-{
-public:
-  virtual ~ProgressContext();
-  virtual int update_progress(uint64_t offset, uint64_t src_size) = 0;
-};
+  class ProgressContext
+  {
+  public:
+    virtual ~ProgressContext();
+    virtual int update_progress(uint64_t offset, uint64_t total) = 0;
+  };
 
 class RBD
 {
diff --git a/src/librbd.cc b/src/librbd.cc
index 3846cb7aeae..0a31e278c1f 100644
--- a/src/librbd.cc
+++ b/src/librbd.cc
@@ -1053,7 +1053,7 @@ public:
 class CProgressContext : public librbd::ProgressContext
 {
 public:
-  CProgressContext(librbd_copy_progress_fn_t fn, void *data)
+  CProgressContext(librbd_progress_fn_t fn, void *data)
     : m_fn(fn), m_data(data)
   {
   }
@@ -1062,7 +1062,7 @@ public:
     return m_fn(offset, src_size, m_data);
   }
 private:
-  librbd_copy_progress_fn_t m_fn;
+  librbd_progress_fn_t m_fn;
   void *m_data;
 };
 
@@ -1761,7 +1761,7 @@ extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p, const char *des
 }
 
 extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
-	      const char *destname, librbd_copy_progress_fn_t fn, void *data)
+	      const char *destname, librbd_progress_fn_t fn, void *data)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librados::IoCtx dest_io_ctx;
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index aa306adfcb8..528d1f8b970 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -129,6 +129,10 @@ ostream& operator<<(ostream& out, CInode& in)
   if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
   if (in.is_frozen_inode()) out << " FROZEN";
 
+  inode_t *pi = in.get_projected_inode();
+  if (pi->is_truncating())
+    out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
+
   // anchors
   if (in.is_anchored())
     out << " anc";
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index e07479e2fe4..5422f5b4992 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -5232,6 +5232,8 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
 	   << pi->truncate_from << " -> " << pi->truncate_size
 	   << " on " << *in << dendl;
 
+  assert(pi->is_truncating());
+
   in->auth_pin(this);
 
   SnapRealm *realm = in->find_snaprealm();
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
new file mode 100644
index 00000000000..b97aa013118
--- /dev/null
+++ b/src/mds/flock.cc
@@ -0,0 +1,488 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <errno.h>
+
+#include "common/debug.h"
+#include "mdstypes.h"
+#include "mds/flock.h"
+
+bool ceph_lock_state_t::is_waiting(ceph_filelock &fl)
+{
+  multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start);
+  while (p != waiting_locks.end()) {
+    if (p->second.start > fl.start)
+      return false;
+    if (p->second.length == fl.length &&
+        p->second.client == fl.client &&
+        p->second.pid == fl.pid &&
+        p->second.pid_namespace == fl.pid_namespace)
+      return true;
+    ++p;
+  }
+  return false;
+}
+
+void ceph_lock_state_t::remove_waiting(ceph_filelock& fl)
+{
+  multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start);
+  while (p != waiting_locks.end()) {
+    if (p->second.start > fl.start)
+      return;
+    if (p->second.length == fl.length &&
+        p->second.client == fl.client &&
+        p->second.pid == fl.pid &&
+        p->second.pid_namespace == fl.pid_namespace) {
+      waiting_locks.erase(p);
+      return;
+    }
+    ++p;
+  }
+}
+
+bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock, bool wait_on_fail)
+{
+  dout(15) << "add_lock " << new_lock << dendl;
+  bool ret = false;
+  list<multimap<uint64_t, ceph_filelock>::iterator>
+    overlapping_locks, self_overlapping_locks, neighbor_locks;
+
+  // first, get any overlapping locks and split them into owned-by-us and not
+  if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) {
+    dout(15) << "got overlapping lock, splitting by owner" << dendl;
+    split_by_owner(new_lock, overlapping_locks, self_overlapping_locks);
+  }
+  if (!overlapping_locks.empty()) { //overlapping locks owned by others :(
+    if (CEPH_LOCK_EXCL == new_lock.type) {
+      //can't set, we want an exclusive
+      dout(15) << "overlapping lock, and this lock is exclusive, can't set"
+              << dendl;
+      if (wait_on_fail) {
+        waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
+      }
+    } else { //shared lock, check for any exclusive locks blocking us
+      if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
+        dout(15) << " blocked by exclusive lock in overlapping_locks" << dendl;
+        if (wait_on_fail) {
+          waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
+        }
+      } else {
+        //yay, we can insert a shared lock
+        dout(15) << "inserting shared lock" << dendl;
+        adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
+        held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
+        ret = true;
+      }
+    }
+  } else { //no overlapping locks except our own
+    adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
+    dout(15) << "no conflicts, inserting " << new_lock << dendl;
+    held_locks.insert(pair<uint64_t, ceph_filelock>
+                      (new_lock.start, new_lock));
+    ret = true;
+  }
+  if (ret)
+    ++client_held_lock_counts[(client_t)new_lock.client];
+  else if (wait_on_fail)
+    ++client_waiting_lock_counts[(client_t)new_lock.client];
+  return ret;
+}
+
+void ceph_lock_state_t::look_for_lock(ceph_filelock& testing_lock)
+{
+  list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
+    self_overlapping_locks;
+  if (get_overlapping_locks(testing_lock, overlapping_locks)) {
+    split_by_owner(testing_lock, overlapping_locks, self_overlapping_locks);
+  }
+  if (!overlapping_locks.empty()) { //somebody else owns overlapping lock
+    if (CEPH_LOCK_EXCL == testing_lock.type) { //any lock blocks it
+      testing_lock = (*overlapping_locks.begin())->second;
+    } else {
+      ceph_filelock *blocking_lock;
+      if ((blocking_lock = contains_exclusive_lock(overlapping_locks))) {
+        testing_lock = *blocking_lock;
+      } else { //nothing blocking!
+        testing_lock.type = CEPH_LOCK_UNLOCK;
+      }
+    }
+    return;
+  }
+  //if we get here, only our own locks block
+  testing_lock.type = CEPH_LOCK_UNLOCK;
+}
+
+void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
+                 list<ceph_filelock>& activated_locks)
+{
+  list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
+    self_overlapping_locks, crossed_waiting_locks;
+  if (get_overlapping_locks(removal_lock, overlapping_locks)) {
+    dout(15) << "splitting by owner" << dendl;
+    split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks);
+  } else dout(15) << "attempt to remove lock at " << removal_lock.start
+                 << " but no locks there!" << dendl;
+  bool remove_to_end = (0 == removal_lock.length);
+  bool old_lock_to_end;
+  uint64_t removal_start = removal_lock.start;
+  uint64_t removal_end = removal_start + removal_lock.length - 1;
+  uint64_t old_lock_end;
+  __s64 old_lock_client = 0;
+  ceph_filelock *old_lock;
+
+  dout(15) << "examining " << self_overlapping_locks.size()
+          << " self-overlapping locks for removal" << dendl;
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = self_overlapping_locks.begin();
+       iter != self_overlapping_locks.end();
+       ++iter) {
+    dout(15) << "self overlapping lock " << (*iter)->second << dendl;
+    old_lock = &(*iter)->second;
+    old_lock_to_end = (0 == old_lock->length);
+    old_lock_end = old_lock->start + old_lock->length - 1;
+    old_lock_client = old_lock->client;
+    if (remove_to_end) {
+      if (old_lock->start < removal_start) {
+        old_lock->length = removal_start - old_lock->start;
+      } else {
+        dout(15) << "erasing " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      }
+    } else if (old_lock_to_end) {
+      ceph_filelock append_lock = *old_lock;
+      append_lock.start = removal_end+1;
+      held_locks.insert(pair<uint64_t, ceph_filelock>
+                        (append_lock.start, append_lock));
+      ++client_held_lock_counts[(client_t)old_lock->client];
+      if (old_lock->start >= removal_start) {
+        dout(15) << "erasing " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else old_lock->length = removal_start - old_lock->start;
+    } else {
+      if (old_lock_end  > removal_end) {
+        ceph_filelock append_lock = *old_lock;
+        append_lock.start = removal_end + 1;
+        append_lock.length = old_lock_end - append_lock.start + 1;
+        held_locks.insert(pair<uint64_t, ceph_filelock>
+                          (append_lock.start, append_lock));
+        ++client_held_lock_counts[(client_t)old_lock->client];
+      }
+      if (old_lock->start < removal_start) {
+        old_lock->length = removal_start - old_lock->start;
+      } else {
+        dout(15) << "erasing " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      }
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+}
+
+bool ceph_lock_state_t::remove_all_from (client_t client)
+{
+  bool cleared_any = false;
+  if (client_held_lock_counts.count(client)) {
+    remove_all_from(client, held_locks);
+    client_held_lock_counts.erase(client);
+    cleared_any = true;
+  }
+  if (client_waiting_lock_counts.count(client)) {
+    remove_all_from(client, waiting_locks);
+    client_waiting_lock_counts.erase(client);
+  }
+  return cleared_any;
+}
+
+void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks,
+                  ceph_filelock& new_lock,
+                  list<multimap<uint64_t, ceph_filelock>::iterator>
+                  neighbor_locks)
+{
+  dout(15) << "adjust_locks" << dendl;
+  bool new_lock_to_end = (0 == new_lock.length);
+  bool old_lock_to_end;
+  uint64_t new_lock_start = new_lock.start;
+  uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
+  uint64_t old_lock_start, old_lock_end;
+  __s64 old_lock_client = 0;
+  ceph_filelock *old_lock;
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = old_locks.begin();
+       iter != old_locks.end();
+       ++iter) {
+    old_lock = &(*iter)->second;
+    dout(15) << "adjusting lock: " << *old_lock << dendl;
+    old_lock_to_end = (0 == old_lock->length);
+    old_lock_start = old_lock->start;
+    old_lock_end = old_lock->start + old_lock->length - 1;
+    new_lock_start = new_lock.start;
+    new_lock_end = new_lock.start + new_lock.length - 1;
+    old_lock_client = old_lock->client;
+    if (new_lock_to_end || old_lock_to_end) {
+      //special code path to deal with a length set at 0
+      dout(15) << "one lock extends forever" << dendl;
+      if (old_lock->type == new_lock.type) {
+        //just unify them in new lock, remove old lock
+        dout(15) << "same lock type, unifying" << dendl;
+        new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start :
+          old_lock_start;
+        new_lock.length = 0;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else { //not same type, have to keep any remains of old lock around
+        dout(15) << "shrinking old lock" << dendl;
+        if (new_lock_to_end) {
+          if (old_lock_start < new_lock_start) {
+            old_lock->length = new_lock_start - old_lock_start;
+          } else {
+            held_locks.erase(*iter);
+            --client_held_lock_counts[old_lock_client];
+          }
+        } else { //old lock extends past end of new lock
+          ceph_filelock appended_lock = *old_lock;
+          appended_lock.start = new_lock_end + 1;
+          held_locks.insert(pair<uint64_t, ceph_filelock>
+                            (appended_lock.start, appended_lock));
+          ++client_held_lock_counts[(client_t)old_lock->client];
+          if (old_lock_start < new_lock_start) {
+            old_lock->length = new_lock_start - old_lock_start;
+          } else {
+            held_locks.erase(*iter);
+            --client_held_lock_counts[old_lock_client];
+          }
+        }
+      }
+    } else {
+      if (old_lock->type == new_lock.type) { //just merge them!
+        dout(15) << "merging locks, they're the same type" << dendl;
+        new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start :
+          new_lock_start;
+        int new_end = (new_lock_end > old_lock_end) ? new_lock_end :
+          old_lock_end;
+        new_lock.length = new_end - new_lock.start + 1;
+        dout(15) << "erasing lock " << (*iter)->second << dendl;
+        held_locks.erase(*iter);
+        --client_held_lock_counts[old_lock_client];
+      } else { //we'll have to update sizes and maybe make new locks
+        dout(15) << "locks aren't same type, changing sizes" << dendl;
+        if (old_lock_end > new_lock_end) { //add extra lock after new_lock
+          ceph_filelock appended_lock = *old_lock;
+          appended_lock.start = new_lock_end + 1;
+          appended_lock.length = old_lock_end - appended_lock.start + 1;
+          held_locks.insert(pair<uint64_t, ceph_filelock>
+                            (appended_lock.start, appended_lock));
+          ++client_held_lock_counts[(client_t)old_lock->client];
+        }
+        if (old_lock_start < new_lock_start) {
+          old_lock->length = new_lock_start - old_lock_start;
+        } else { //old_lock starts inside new_lock, so remove it
+          //if it extended past new_lock_end it's been replaced
+          held_locks.erase(*iter);
+          --client_held_lock_counts[old_lock_client];
+        }
+      }
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+
+  //make sure to coalesce neighboring locks
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = neighbor_locks.begin();
+       iter != neighbor_locks.end();
+       ++iter) {
+    old_lock = &(*iter)->second;
+    old_lock_client = old_lock->client;
+    dout(15) << "lock to coalesce: " << *old_lock << dendl;
+    /* because if it's a neibhoring lock there can't be any self-overlapping
+       locks that covered it */
+    if (old_lock->type == new_lock.type) { //merge them
+      if (0 == new_lock.length) {
+        if (old_lock->start + old_lock->length == new_lock.start) {
+          new_lock.start = old_lock->start;
+        } else assert(0); /* if there's no end to new_lock, the neighbor
+                             HAS TO be to left side */
+      } else if (0 == old_lock->length) {
+        if (new_lock.start + new_lock.length == old_lock->start) {
+          new_lock.length = 0;
+        } else assert(0); //same as before, but reversed
+      } else {
+        if (old_lock->start + old_lock->length == new_lock.start) {
+          new_lock.start = old_lock->start;
+          new_lock.length = old_lock->length + new_lock.length;
+        } else if (new_lock.start + new_lock.length == old_lock->start) {
+          new_lock.length = old_lock->length + new_lock.length;
+        }
+      }
+      held_locks.erase(*iter);
+      --client_held_lock_counts[old_lock_client];
+    }
+    if (!client_held_lock_counts[old_lock_client]) {
+      client_held_lock_counts.erase(old_lock_client);
+    }
+  }
+}
+
+void ceph_lock_state_t::remove_all_from(client_t client,
+                                        multimap<uint64_t,
+                                          ceph_filelock>& locks)
+{
+  multimap<uint64_t, ceph_filelock>::iterator iter = locks.begin();
+  while (iter != locks.end()) {
+    if ((client_t)iter->second.client == client) {
+      locks.erase(iter++);
+    } else ++iter;
+  }
+}
+
+multimap<uint64_t, ceph_filelock>::iterator
+ceph_lock_state_t::get_lower_bound(uint64_t start,
+                                   multimap<uint64_t, ceph_filelock>& lock_map)
+{
+   multimap<uint64_t, ceph_filelock>::iterator lower_bound =
+     lock_map.lower_bound(start);
+   if ((lower_bound->first != start)
+       && (start != 0)
+       && (lower_bound != lock_map.begin())) --lower_bound;
+   if (lock_map.end() == lower_bound)
+     dout(15) << "get_lower_dout(15)eturning end()" << dendl;
+   else dout(15) << "get_lower_bound returning iterator pointing to "
+                << lower_bound->second << dendl;
+   return lower_bound;
+ }
+
+multimap<uint64_t, ceph_filelock>::iterator
+ceph_lock_state_t::get_last_before(uint64_t end,
+                                   multimap<uint64_t, ceph_filelock>& lock_map)
+{
+  multimap<uint64_t, ceph_filelock>::iterator last =
+    lock_map.upper_bound(end);
+  if (last != lock_map.begin()) --last;
+  if (lock_map.end() == last)
+    dout(15) << "get_last_before returning end()" << dendl;
+  else dout(15) << "get_last_before returning iterator pointing to "
+               << last->second << dendl;
+  return last;
+}
+
+bool ceph_lock_state_t::share_space(
+    multimap<uint64_t, ceph_filelock>::iterator& iter,
+    uint64_t start, uint64_t end)
+{
+  bool ret = ((iter->first >= start && iter->first <= end) ||
+              ((iter->first < start) &&
+               (((iter->first + iter->second.length - 1) >= start) ||
+                (0 == iter->second.length))));
+  dout(15) << "share_space got start: " << start << ", end: " << end
+          << ", lock: " << iter->second << ", returning " << ret << dendl;
+  return ret;
+}
+
+bool ceph_lock_state_t::get_overlapping_locks(ceph_filelock& lock,
+                           list<multimap<uint64_t,
+                               ceph_filelock>::iterator> & overlaps,
+                           list<multimap<uint64_t,
+                               ceph_filelock>::iterator> *self_neighbors)
+{
+  dout(15) << "get_overlapping_locks" << dendl;
+  // create a lock starting one earlier and ending one later
+  // to check for neighbors
+  ceph_filelock neighbor_check_lock = lock;
+  if (neighbor_check_lock.start != 0) {
+    neighbor_check_lock.start = neighbor_check_lock.start - 1;
+    if (neighbor_check_lock.length)
+      neighbor_check_lock.length = neighbor_check_lock.length + 2;
+  } else {
+    if (neighbor_check_lock.length)
+      neighbor_check_lock.length = neighbor_check_lock.length + 1;
+  }
+  //find the last held lock starting at the point after lock
+  uint64_t endpoint = lock.start;
+  if (lock.length) {
+    endpoint += lock.length;
+  } else {
+    endpoint = uint64_t(-1); // max offset
+  }
+  multimap<uint64_t, ceph_filelock>::iterator iter =
+    get_last_before(endpoint, held_locks);
+  bool cont = iter != held_locks.end();
+  while(cont) {
+    if (share_space(iter, lock)) {
+      overlaps.push_front(iter);
+    } else if (self_neighbors &&
+               (neighbor_check_lock.client == iter->second.client) &&
+               (neighbor_check_lock.pid == iter->second.pid) &&
+               share_space(iter, neighbor_check_lock)) {
+      self_neighbors->push_front(iter);
+    }
+    if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) {
+      //can't be any more overlapping locks or they'd interfere with this one
+      cont = false;
+    } else if (held_locks.begin() == iter) cont = false;
+    else --iter;
+  }
+  return !overlaps.empty();
+}
+
+bool ceph_lock_state_t::get_waiting_overlaps(ceph_filelock& lock,
+                                             list<multimap<uint64_t,
+                                               ceph_filelock>::iterator>&
+                                               overlaps)
+{
+  dout(15) << "get_waiting_overlaps" << dendl;
+  multimap<uint64_t, ceph_filelock>::iterator iter =
+    get_last_before(lock.start + lock.length - 1, waiting_locks);
+  bool cont = iter != waiting_locks.end();
+  while(cont) {
+    if (share_space(iter, lock)) overlaps.push_front(iter);
+    if (waiting_locks.begin() == iter) cont = false;
+    --iter;
+  }
+  return !overlaps.empty();
+}
+
+void ceph_lock_state_t::split_by_owner(ceph_filelock& owner,
+                                       list<multimap<uint64_t,
+                                           ceph_filelock>::iterator>& locks,
+                                       list<multimap<uint64_t,
+                                           ceph_filelock>::iterator>&
+                                           owned_locks)
+{
+  list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+    iter = locks.begin();
+  dout(15) << "owner lock: " << owner << dendl;
+  while (iter != locks.end()) {
+    dout(15) << "comparing to " << (*iter)->second << dendl;
+    if ((*iter)->second.client == owner.client &&
+        (*iter)->second.pid_namespace == owner.pid_namespace &&
+        (*iter)->second.pid == owner.pid) {
+      dout(15) << "success, pushing to owned_locks" << dendl;
+      owned_locks.push_back(*iter);
+      iter = locks.erase(iter);
+    } else {
+      dout(15) << "failure, something not equal in this group "
+              << (*iter)->second.client << ":" << owner.client << ","
+              << (*iter)->second.pid_namespace << ":" << owner.pid_namespace
+              << "," << (*iter)->second.pid << ":" << owner.pid << dendl;
+      ++iter;
+    }
+  }
+}
+
+ceph_filelock *
+ceph_lock_state_t::contains_exclusive_lock(list<multimap<uint64_t,
+                                               ceph_filelock>::iterator>& locks)
+{
+  for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
+         iter = locks.begin();
+       iter != locks.end();
+       ++iter) {
+    if (CEPH_LOCK_EXCL == (*iter)->second.type) return &(*iter)->second;
+  }
+  return NULL;
+}
diff --git a/src/mds/flock.h b/src/mds/flock.h
index 69228b46c90..133feabee66 100644
--- a/src/mds/flock.h
+++ b/src/mds/flock.h
@@ -26,239 +26,63 @@ inline bool operator==(ceph_filelock& l, ceph_filelock& r) {
     l.type == r.type;
 }
 
-struct ceph_lock_state_t {
+class ceph_lock_state_t {
+public:
   multimap<uint64_t, ceph_filelock> held_locks;    // current locks
   multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks
   // both of the above are keyed by starting offset
   map<client_t, int> client_held_lock_counts;
   map<client_t, int> client_waiting_lock_counts;
 
-  bool is_waiting(ceph_filelock &fl) {
-    multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start);
-    while (p != waiting_locks.end()) {
-      if (p->second.start > fl.start)
-	return false;
-      if (p->second.length == fl.length &&
-	  p->second.client == fl.client &&
-	  p->second.pid == fl.pid &&
-	  p->second.pid_namespace == fl.pid_namespace)
-	return true;
-      ++p;
-    }
-    return false;
-  }
-  void remove_waiting(ceph_filelock& fl) {
-    multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start);
-    while (p != waiting_locks.end()) {
-      if (p->second.start > fl.start)
-	return;
-      if (p->second.length == fl.length &&
-	  p->second.client == fl.client &&
-	  p->second.pid == fl.pid &&
-	  p->second.pid_namespace == fl.pid_namespace) {
-	waiting_locks.erase(p);
-	return;
-      }
-      ++p;
-    }
-  }
+  /**
+   * Check if a lock is on the waiting_locks list.
+   *
+   * @param fl The filelock to check for
+   * @returns True if the lock is waiting, false otherwise
+   */
+  bool is_waiting(ceph_filelock &fl);
+  /**
+   * Remove a lock from the waiting_locks list
+   *
+   * @param fl The filelock to remove
+   */
+  void remove_waiting(ceph_filelock& fl);
 
   /*
    * Try to set a new lock. If it's blocked and wait_on_fail is true,
    * add the lock to waiting_locks.
    * The lock needs to be of type CEPH_LOCK_EXCL or CEPH_LOCK_SHARED.
+   * This may merge previous locks, or convert the type of already-owned
+   * locks.
    *
-   * If we already added ourselves to waiting_locks, did_wait will be
-   * true.  If did_wait==true and we're not on the list, that means we
-   * were canceled and we should return an error.
+   * @param new_lock The lock to set
+   * @param wait_on_fail whether to wait until the lock can be set.
+   * Otherwise it fails immediately when blocked.
    *
-   * Returns true if set, false if not set.
+   * @returns true if set, false if not set.
    */
-  bool add_lock(ceph_filelock& new_lock, bool wait_on_fail) {
-    dout(15) << "add_lock " << new_lock << dendl;
-    bool ret = false;
-    list<multimap<uint64_t, ceph_filelock>::iterator>
-      overlapping_locks, self_overlapping_locks, neighbor_locks;
-
-    // first, get any overlapping locks and split them into owned-by-us and not
-    if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) {
-      dout(15) << "got overlapping lock, splitting by owner" << dendl;
-      split_by_owner(new_lock, overlapping_locks, self_overlapping_locks);
-    }
-    if (!overlapping_locks.empty()) { //overlapping locks owned by others :(
-      if (CEPH_LOCK_EXCL == new_lock.type) {
-	//can't set, we want an exclusive
-	dout(15) << "overlapping lock, and this lock is exclusive, can't set"
-		<< dendl;
-	if (wait_on_fail) {
-	  waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
-	}
-      } else { //shared lock, check for any exclusive locks blocking us
-	if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
-	  dout(15) << " blocked by exclusive lock in overlapping_locks" << dendl;
-	  if (wait_on_fail) {
-	    waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
-	  }
-	} else {
-	  //yay, we can insert a shared lock
-	  dout(15) << "inserting shared lock" << dendl;
-	  adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
-	  held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
-	  ret = true;
-	}
-      }
-    } else { //no overlapping locks except our own
-      adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
-      dout(15) << "no conflicts, inserting " << new_lock << dendl;
-      held_locks.insert(pair<uint64_t, ceph_filelock>
-			(new_lock.start, new_lock));
-      ret = true;
-    }
-    if (ret)
-      ++client_held_lock_counts[(client_t)new_lock.client];
-    else if (wait_on_fail)
-      ++client_waiting_lock_counts[(client_t)new_lock.client];
-    return ret;
-  }
-
-  void look_for_lock(ceph_filelock& testing_lock) {
-    list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
-      self_overlapping_locks;
-    if (get_overlapping_locks(testing_lock, overlapping_locks)) {
-      split_by_owner(testing_lock, overlapping_locks, self_overlapping_locks);
-    }
-    if (!overlapping_locks.empty()) { //somebody else owns overlapping lock
-      if (CEPH_LOCK_EXCL == testing_lock.type) { //any lock blocks it
-	testing_lock = (*overlapping_locks.begin())->second;
-      } else {
-	ceph_filelock *blocking_lock;
-	if ((blocking_lock = contains_exclusive_lock(overlapping_locks))) {
-	  testing_lock = *blocking_lock;
-	} else { //nothing blocking!
-	  testing_lock.type = CEPH_LOCK_UNLOCK;
-	}
-      }
-      return;
-    }
-    //if we get here, only our own locks block
-    testing_lock.type = CEPH_LOCK_UNLOCK;
-  }
+  bool add_lock(ceph_filelock& new_lock, bool wait_on_fail);
+  /**
+   * See if a lock is blocked by existing locks. If the lock is blocked,
+   * it will be set to the value of the first blocking lock. Otherwise,
+   * it will be returned unchanged, except for setting the type field
+   * to CEPH_LOCK_UNLOCK.
+   *
+   * @param testing_lock The lock to check for conflicts on.
+   */
+  void look_for_lock(ceph_filelock& testing_lock);
 
   /*
    * Remove lock(s) described in old_lock. This may involve splitting a
    * previous lock or making a previous lock smaller.
+   *
+   * @param removal_lock The lock to remove
+   * @param activated_locks A return parameter, holding activated wait locks.
    */
   void remove_lock(ceph_filelock removal_lock,
-		   list<ceph_filelock>& activated_locks) {
-    list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks,
-      self_overlapping_locks, crossed_waiting_locks;
-    if (get_overlapping_locks(removal_lock, overlapping_locks)) {
-      dout(15) << "splitting by owner" << dendl;
-      split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks);
-    } else dout(15) << "attempt to remove lock at " << removal_lock.start
-		   << " but no locks there!" << dendl;
-    bool remove_to_end = (0 == removal_lock.length);
-    bool old_lock_to_end;
-    uint64_t removal_start = removal_lock.start;
-    uint64_t removal_end = removal_start + removal_lock.length - 1;
-    uint64_t old_lock_end;
-    __s64 old_lock_client = 0;
-    ceph_filelock *old_lock;
-
-    dout(15) << "examining " << self_overlapping_locks.size()
-	    << " self-overlapping locks for removal" << dendl;
-    for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
-	   iter = self_overlapping_locks.begin();
-	 iter != self_overlapping_locks.end();
-	 ++iter) {
-      dout(15) << "self overlapping lock " << (*iter)->second << dendl;
-      old_lock = &(*iter)->second;
-      old_lock_to_end = (0 == old_lock->length);
-      old_lock_end = old_lock->start + old_lock->length - 1;
-      old_lock_client = old_lock->client;
-      if (remove_to_end) {
-	if (old_lock->start < removal_start) {
-	  old_lock->length = removal_start - old_lock->start;
-	} else {
-	  dout(15) << "erasing " << (*iter)->second << dendl;
-	  held_locks.erase(*iter);
-	  --client_held_lock_counts[old_lock_client];
-	}
-      } else if (old_lock_to_end) {
-	ceph_filelock append_lock = *old_lock;
-	append_lock.start = removal_end+1;
-	held_locks.insert(pair<uint64_t, ceph_filelock>
-			  (append_lock.start, append_lock));
-	++client_held_lock_counts[(client_t)old_lock->client];
-	if (old_lock->start >= removal_start) {
-	  dout(15) << "erasing " << (*iter)->second << dendl;
-	  held_locks.erase(*iter);
-	  --client_held_lock_counts[old_lock_client];
-	} else old_lock->length = removal_start - old_lock->start;
-      } else {
-	if (old_lock_end  > removal_end) {
-	  ceph_filelock append_lock = *old_lock;
-	  append_lock.start = removal_end + 1;
-	  append_lock.length = old_lock_end - append_lock.start + 1;
-	  held_locks.insert(pair<uint64_t, ceph_filelock>
-			    (append_lock.start, append_lock));
-	  ++client_held_lock_counts[(client_t)old_lock->client];
-	}
-	if (old_lock->start < removal_start) {
-	  old_lock->length = removal_start - old_lock->start;
-	} else {
-	  dout(15) << "erasing " << (*iter)->second << dendl;
-	  held_locks.erase(*iter);
-	  --client_held_lock_counts[old_lock_client];
-	}
-      }
-      if (!client_held_lock_counts[old_lock_client]) {
-	client_held_lock_counts.erase(old_lock_client);
-      }
-    }
-
-    /* okay, we've removed the locks, but removing them might allow some
-     * other waiting locks to come through */
-    if (get_waiting_overlaps(removal_lock, crossed_waiting_locks)) {
-      /*let's do this the SUPER lazy way for now. Should work out something
-	that's slightly less slow and wasteful, though.
-	1) Remove lock from waiting_locks.
-	2) attempt to insert lock via add_lock
-	3) Add to success list if we get back "true"
-
-	In the future, should probably set this up to detect some
-	guaranteed blocks and do fewer map lookups.
-       */
-      for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
-	     iter = crossed_waiting_locks.begin();
-	   iter != crossed_waiting_locks.end();
-	   ++iter) {
-	ceph_filelock cur_lock = (*iter)->second;
-	waiting_locks.erase(*iter);
-	--client_waiting_lock_counts[(client_t)cur_lock.client];
-	if (!client_waiting_lock_counts[(client_t)cur_lock.client]) {
-	  client_waiting_lock_counts.erase((client_t)cur_lock.client);
-	}
-	if (add_lock(cur_lock, true))
-	  activated_locks.push_back(cur_lock);
-      }
-    }
-  }
-
-  bool remove_all_from (client_t client) {
-    bool cleared_any = false;
-    if (client_held_lock_counts.count(client)) {
-      remove_all_from(client, held_locks);
-      client_held_lock_counts.erase(client);
-      cleared_any = true;
-    }
-    if (client_waiting_lock_counts.count(client)) {
-      remove_all_from(client, waiting_locks);
-      client_waiting_lock_counts.erase(client);
-    }
-    return cleared_any;
-  }
+                   list<ceph_filelock>& activated_locks);
 
+  bool remove_all_from(client_t client);
 private:
   /**
    * Adjust old locks owned by a single process so that process can set
@@ -270,182 +94,32 @@ private:
    * This function should only be called once you know the lock will be
    * inserted, as it DOES adjust new_lock. You can call this function
    * on an empty list, in which case it does nothing.
-   * This function does not remove elements from the list, so regard the list
+   * This function does not remove elements from old_locks, so regard the list
    * as bad information following function invocation.
    *
-   * new_lock: The new lock the process has requested.
-   * old_locks: list of all locks currently held by same
+   * @param new_lock The new lock the process has requested.
+   * @param old_locks list of all locks currently held by same
    *    client/process that overlap new_lock.
-   * neighbor_locks: locks owned by same process that neighbor new_lock on
+   * @param neighbor_locks locks owned by same process that neighbor new_lock on
    *    left or right side.
    */
   void adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks,
-		    ceph_filelock& new_lock,
-		    list<multimap<uint64_t, ceph_filelock>::iterator>
-		    neighbor_locks) {
-    dout(15) << "adjust_locks" << dendl;
-    bool new_lock_to_end = (0 == new_lock.length);
-    bool old_lock_to_end;
-    uint64_t new_lock_start = new_lock.start;
-    uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
-    uint64_t old_lock_start, old_lock_end;
-    __s64 old_lock_client = 0;
-    ceph_filelock *old_lock;
-    for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
-	   iter = old_locks.begin();
-	 iter != old_locks.end();
-	 ++iter) {
-      old_lock = &(*iter)->second;
-      dout(15) << "adjusting lock: " << *old_lock << dendl;
-      old_lock_to_end = (0 == old_lock->length);
-      old_lock_start = old_lock->start;
-      old_lock_end = old_lock->start + old_lock->length - 1;
-      new_lock_start = new_lock.start;
-      new_lock_end = new_lock.start + new_lock.length - 1;
-      old_lock_client = old_lock->client;
-      if (new_lock_to_end || old_lock_to_end) {
-	//special code path to deal with a length set at 0
-	dout(15) << "one lock extends forever" << dendl;
-	if (old_lock->type == new_lock.type) {
-	  //just unify them in new lock, remove old lock
-	  dout(15) << "same lock type, unifying" << dendl;
-	  new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start :
-	    old_lock_start;
-	  new_lock.length = 0;
-	  held_locks.erase(*iter);
-	  --client_held_lock_counts[old_lock_client];
-	} else { //not same type, have to keep any remains of old lock around
-	  dout(15) << "shrinking old lock" << dendl;
-	  if (new_lock_to_end) {
-	    if (old_lock_start < new_lock_start) {
-	      old_lock->length = new_lock_start - old_lock_start;
-	    } else {
-	      held_locks.erase(*iter);
-	      --client_held_lock_counts[old_lock_client];
-	    }
-	  } else { //old lock extends past end of new lock
-	    ceph_filelock appended_lock = *old_lock;
-	    appended_lock.start = new_lock_end + 1;
-	    held_locks.insert(pair<uint64_t, ceph_filelock>
-			      (appended_lock.start, appended_lock));
-	    ++client_held_lock_counts[(client_t)old_lock->client];
-	    if (old_lock_start < new_lock_start) {
-	      old_lock->length = new_lock_start - old_lock_start;
-	    } else {
-	      held_locks.erase(*iter);
-	      --client_held_lock_counts[old_lock_client];
-	    }
-	  }
-	}
-      } else {
-	if (old_lock->type == new_lock.type) { //just merge them!
-	  dout(15) << "merging locks, they're the same type" << dendl;
-	  new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start :
-	    new_lock_start;
-	  int new_end = (new_lock_end > old_lock_end) ? new_lock_end :
-	    old_lock_end;
-	  new_lock.length = new_end - new_lock.start + 1;
-	  dout(15) << "erasing lock " << (*iter)->second << dendl;
-	  held_locks.erase(*iter);
-	  --client_held_lock_counts[old_lock_client];
-	} else { //we'll have to update sizes and maybe make new locks
-	  dout(15) << "locks aren't same type, changing sizes" << dendl;
-	  if (old_lock_end > new_lock_end) { //add extra lock after new_lock
-	    ceph_filelock appended_lock = *old_lock;
-	    appended_lock.start = new_lock_end + 1;
-	    appended_lock.length = old_lock_end - appended_lock.start + 1;
-	    held_locks.insert(pair<uint64_t, ceph_filelock>
-			      (appended_lock.start, appended_lock));
-	    ++client_held_lock_counts[(client_t)old_lock->client];
-	  }
-	  if (old_lock_start < new_lock_start) {
-	    old_lock->length = new_lock_start - old_lock_start;
-	  } else { //old_lock starts inside new_lock, so remove it
-	    //if it extended past new_lock_end it's been replaced
-	    held_locks.erase(*iter);
-	    --client_held_lock_counts[old_lock_client];
-	  }
-	}
-      }
-      if (!client_held_lock_counts[old_lock_client]) {
-	client_held_lock_counts.erase(old_lock_client);
-      }
-    }
-
-    //make sure to coalesce neighboring locks
-    for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
-	   iter = neighbor_locks.begin();
-	 iter != neighbor_locks.end();
-	 ++iter) {
-      old_lock = &(*iter)->second;
-      old_lock_client = old_lock->client;
-      dout(15) << "lock to coalesce: " << *old_lock << dendl;
-      /* because if it's a neibhoring lock there can't be any self-overlapping
-	 locks that covered it */
-      if (old_lock->type == new_lock.type) { //merge them
-	if (0 == new_lock.length) {
-	  if (old_lock->start + old_lock->length == new_lock.start) {
-	    new_lock.start = old_lock->start;
-	  } else assert(0); /* if there's no end to new_lock, the neighbor
-			       HAS TO be to left side */
-	} else if (0 == old_lock->length) {
-	  if (new_lock.start + new_lock.length == old_lock->start) {
-	    new_lock.length = 0;
-	  } else assert(0); //same as before, but reversed
-	} else {
-	  if (old_lock->start + old_lock->length == new_lock.start) {
-	    new_lock.start = old_lock->start;
-	    new_lock.length = old_lock->length + new_lock.length;
-	  } else if (new_lock.start + new_lock.length == old_lock->start) {
-	    new_lock.length = old_lock->length + new_lock.length;
-	  }
-	}
-	held_locks.erase(*iter);
-	--client_held_lock_counts[old_lock_client];
-      }
-      if (!client_held_lock_counts[old_lock_client]) {
-	client_held_lock_counts.erase(old_lock_client);
-      }
-    }
-  }
+                    ceph_filelock& new_lock,
+                    list<multimap<uint64_t, ceph_filelock>::iterator>
+                      neighbor_locks);
 
   //this won't reset the counter map value, do that yourself
-  void remove_all_from(client_t client, multimap<uint64_t, ceph_filelock>& locks) {
-    multimap<uint64_t, ceph_filelock>::iterator iter = locks.begin();
-    while (iter != locks.end()) {
-      if ((client_t)iter->second.client == client) {
-	locks.erase(iter++);
-      } else ++iter;
-    }
-  }
+  void remove_all_from(client_t client,
+                       multimap<uint64_t, ceph_filelock>& locks);
 
   //get last lock prior to start position
   multimap<uint64_t, ceph_filelock>::iterator
-  get_lower_bound(uint64_t start, multimap<uint64_t, ceph_filelock>& lock_map) {
-    multimap<uint64_t, ceph_filelock>::iterator lower_bound =
-      lock_map.lower_bound(start);
-    if ((lower_bound->first != start)
-	&& (start != 0)
-	&& (lower_bound != lock_map.begin())) --lower_bound;
-    if (lock_map.end() == lower_bound)
-      dout(15) << "get_lower_dout(15)eturning end()" << dendl;
-    else dout(15) << "get_lower_bound returning iterator pointing to "
-		 << lower_bound->second << dendl;
-    return lower_bound;
-  }
-
+  get_lower_bound(uint64_t start,
+                  multimap<uint64_t, ceph_filelock>& lock_map);
   //get latest-starting lock that goes over the byte "end"
   multimap<uint64_t, ceph_filelock>::iterator
-  get_last_before(uint64_t end, multimap<uint64_t, ceph_filelock>& lock_map) {
-    multimap<uint64_t, ceph_filelock>::iterator last =
-      lock_map.upper_bound(end);
-    if (last != lock_map.begin()) --last;
-    if (lock_map.end() == last)
-      dout(15) << "get_last_before returning end()" << dendl;
-    else dout(15) << "get_last_before returning iterator pointing to "
-		 << last->second << dendl;
-    return last;
-  }
+  get_last_before(uint64_t end,
+                  multimap<uint64_t, ceph_filelock>& lock_map);
 
   /*
    * See if an iterator's lock covers any of the same bounds as a given range
@@ -454,25 +128,18 @@ private:
    * If the length is 0, the lock covers from "start" to the end of the file.
    */
   bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter,
-		   uint64_t start, uint64_t end) {
-    bool ret = ((iter->first >= start && iter->first <= end) ||
-		((iter->first < start) &&
-		 (((iter->first + iter->second.length - 1) >= start) ||
-		  (0 == iter->second.length))));
-    dout(15) << "share_space got start: " << start << ", end: " << end
-	    << ", lock: " << iter->second << ", returning " << ret << dendl;
-    return ret;
-  }
+		   uint64_t start, uint64_t end);
+  
   bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter,
-		   ceph_filelock& lock) {
+                   ceph_filelock &lock) {
     uint64_t end = lock.start;
-    if (lock.length)
+    if (lock.length) {
       end += lock.length - 1;
-    else // zero length means to end-of-file
+    } else { // zero length means end of file
       end = uint64_t(-1);
+    }
     return share_space(iter, lock.start, end);
   }
-  
   /*
    *get a list of all locks overlapping with the given lock's range
    * lock: the lock to compare with.
@@ -480,47 +147,11 @@ private:
    * Returns: true if at least one lock overlaps.
    */
   bool get_overlapping_locks(ceph_filelock& lock,
-			     list<multimap<uint64_t, ceph_filelock>::iterator> & overlaps,
-			     list<multimap<uint64_t, ceph_filelock>::iterator> *self_neighbors) {
-    dout(15) << "get_overlapping_locks" << dendl;
-    // create a lock starting one earlier and ending one later
-    // to check for neighbors
-    ceph_filelock neighbor_check_lock = lock;
-    if (neighbor_check_lock.start != 0) {
-      neighbor_check_lock.start = neighbor_check_lock.start - 1;
-      if (neighbor_check_lock.length)
-        neighbor_check_lock.length = neighbor_check_lock.length + 2;
-    } else {
-      if (neighbor_check_lock.length)
-        neighbor_check_lock.length = neighbor_check_lock.length + 1;
-    }
-    //find the last held lock starting at the point after lock
-    uint64_t endpoint = lock.start;
-    if (lock.length) {
-      endpoint += lock.length;
-    } else {
-      endpoint = uint64_t(-1); // max offset
-    }
-    multimap<uint64_t, ceph_filelock>::iterator iter =
-      get_last_before(endpoint, held_locks);
-    bool cont = iter != held_locks.end();
-    while(cont) {
-      if (share_space(iter, lock)) {
-	overlaps.push_front(iter);
-      } else if (self_neighbors &&
-		 (neighbor_check_lock.client == iter->second.client) &&
-		 (neighbor_check_lock.pid == iter->second.pid) &&
-		 share_space(iter, neighbor_check_lock)) {
-	self_neighbors->push_front(iter);
-      }
-      if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) {
-	//can't be any more overlapping locks or they'd interfere with this one
-	cont = false;
-      } else if (held_locks.begin() == iter) cont = false;
-      else --iter;
-    }
-    return !overlaps.empty();
-  }
+                             list<multimap<uint64_t,
+                                 ceph_filelock>::iterator> & overlaps,
+                             list<multimap<uint64_t,
+                                 ceph_filelock>::iterator> *self_neighbors);
+
   
   bool get_overlapping_locks(ceph_filelock& lock,
 			     list<multimap<uint64_t, ceph_filelock>::iterator>& overlaps) {
@@ -534,20 +165,8 @@ private:
    * Returns: true if at least one waiting_lock overlaps
    */
   bool get_waiting_overlaps(ceph_filelock& lock,
-			    list<multimap<uint64_t, ceph_filelock>::iterator>&
-			    overlaps) {
-    dout(15) << "get_waiting_overlaps" << dendl;
-    multimap<uint64_t, ceph_filelock>::iterator iter =
-      get_last_before(lock.start + lock.length - 1, waiting_locks);
-    bool cont = iter != waiting_locks.end();
-    while(cont) {
-      if (share_space(iter, lock)) overlaps.push_front(iter);
-      if (waiting_locks.begin() == iter) cont = false;
-      --iter;
-    }
-    return !overlaps.empty();
-  }
-
+                            list<multimap<uint64_t,
+                                ceph_filelock>::iterator>& overlaps);
   /*
    * split a list of locks up by whether they're owned by same
    * process as given lock
@@ -557,38 +176,13 @@ private:
    * owned_locks: an empty list, to be filled with the locks owned by owner
    */
   void split_by_owner(ceph_filelock& owner,
-		      list<multimap<uint64_t, ceph_filelock>::iterator> & locks,
-		      list<multimap<uint64_t, ceph_filelock>::iterator> & owned_locks) {
-    list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
-      iter = locks.begin();
-    dout(15) << "owner lock: " << owner << dendl;
-    while (iter != locks.end()) {
-      dout(15) << "comparing to " << (*iter)->second << dendl;
-      if ((*iter)->second.client == owner.client &&
-	  (*iter)->second.pid_namespace == owner.pid_namespace &&
-	  (*iter)->second.pid == owner.pid) {
-	dout(15) << "success, pushing to owned_locks" << dendl;
-	owned_locks.push_back(*iter);
-	iter = locks.erase(iter);
-      } else {
-	dout(15) << "failure, something not equal in this group "
-		<< (*iter)->second.client << ":" << owner.client << ","
-		<< (*iter)->second.pid_namespace << ":" << owner.pid_namespace
-		<< "," << (*iter)->second.pid << ":" << owner.pid << dendl;
-	++iter;
-      }
-    }
-  }
+		      list<multimap<uint64_t,
+		          ceph_filelock>::iterator> & locks,
+		      list<multimap<uint64_t,
+		          ceph_filelock>::iterator> & owned_locks);
 
-  ceph_filelock *contains_exclusive_lock(list<multimap<uint64_t, ceph_filelock>::iterator>& locks) {
-    for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator
-	   iter = locks.begin();
-	 iter != locks.end();
-	 ++iter) {
-      if (CEPH_LOCK_EXCL == (*iter)->second.type) return &(*iter)->second;
-    }
-    return NULL;
-  }
+  ceph_filelock *contains_exclusive_lock(list<multimap<uint64_t,
+                                         ceph_filelock>::iterator>& locks);
 
 public:
   void encode(bufferlist& bl) const {
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 70f029740d0..32f597277a1 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -430,6 +430,7 @@ struct inode_t {
 
   bool is_truncating() const { return (truncate_pending > 0); }
   void truncate(uint64_t old_size, uint64_t new_size) {
+    assert(new_size < old_size);
     truncate_from = old_size;
     size = new_size;
     rstat.rbytes = new_size;
diff --git a/src/mkcephfs.in b/src/mkcephfs.in
index d4d5092bf93..fa43155c173 100644
--- a/src/mkcephfs.in
+++ b/src/mkcephfs.in
@@ -437,7 +437,7 @@ if [ $allhosts -eq 1 ]; then
 	    do_root_cmd "$0 -d $rdir --prepare-osdfs $name"
 	fi
 
-	do_cmd "$0 -d $rdir --init-daemon $name"
+	do_root_cmd "$0 -d $rdir --init-daemon $name"
 
 	# collect the key
 	if [ -n "$ssh" ]; then
@@ -468,7 +468,7 @@ if [ $allhosts -eq 1 ]; then
 	    rdir=$dir
 	fi
 	
-	do_cmd "$0 -d $rdir --init-daemon $name"
+	do_root_cmd "$0 -d $rdir --init-daemon $name"
     done
 
     # admin keyring
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 6db2324eebd..abb878dea9d 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1664,16 +1664,21 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	i = atoi(m->cmd[2].c_str());
 	if (i < 0 || i >= osdmap.get_max_osd()) {
 	  ss << i << " is not a valid osd id";
-	  getline(ss, rs);
-	  return -ERANGE;
+	  err = -ERANGE;
+	  goto out;
+	}
+	if (osdmap.exists(i)) {
+	  ss << i << " already exists";
+	  err = -EEXIST;
+	  goto out;
 	}
-	if (osdmap.exists(i) ||
-	    pending_inc.new_up_client.count(i) ||
+	if (pending_inc.new_up_client.count(i) ||
 	    (pending_inc.new_state.count(i) &&
 	     (pending_inc.new_state[i] & CEPH_OSD_EXISTS))) {
 	  ss << i << " already exists";
 	  getline(ss, rs);
-	  return -EEXIST;
+	  paxos->wait_for_commit(new Monitor::C_Command(mon, m, -EEXIST, rs, paxos->get_version()));
+	  return true;
 	}
       } else {
 	// allocate a new id
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h
index f8846a78cd6..2fca26fb0a6 100644
--- a/src/os/CollectionIndex.h
+++ b/src/os/CollectionIndex.h
@@ -58,6 +58,9 @@ protected:
   /// Type of returned paths
   typedef std::tr1::shared_ptr<Path> IndexedPath;
 
+  static const uint32_t FLAT_INDEX_TAG = 0;
+  static const uint32_t HASH_INDEX_TAG = 1;
+  static const uint32_t HASH_INDEX_TAG_2 = 2;
   /**
    * For tracking Filestore collection versions.
    *
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 9d3ebef3c75..86281ab9982 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -1268,7 +1268,7 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
   }
   bufferptr bp(PATH_MAX);
   int ret = safe_read(fd, bp.c_str(), bp.length());
-  TEMP_FAILURE_RETRY(::close(op_fd));
+  TEMP_FAILURE_RETRY(::close(fd));
   if (ret < 0)
     return -errno;
   bufferlist bl;
@@ -1292,7 +1292,7 @@ int FileStore::write_version_stamp()
   ::encode(on_disk_version, bl);
   
   int ret = safe_write(fd, bl.c_str(), bl.length());
-  TEMP_FAILURE_RETRY(::close(op_fd));
+  TEMP_FAILURE_RETRY(::close(fd));
   if (ret < 0)
     return -errno;
   return 0;
@@ -2985,7 +2985,11 @@ void FileStore::sync_entry()
       sync_epoch++;
 
       dout(15) << "sync_entry committing " << cp << " sync_epoch " << sync_epoch << dendl;
-      write_op_seq(op_fd, cp);
+      if (write_op_seq(op_fd, cp) < 0) {
+	derr << "Error: " << cpp_strerror(errno) 
+	     << " during write_op_seq" << dendl;
+	assert(0);
+      }
 
       bool do_snap = btrfs && g_conf->filestore_btrfs_snap;
 
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index 7f01c897407..d17d5e6b82f 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -39,7 +39,7 @@ using namespace __gnu_cxx;
 // fake attributes in memory, if we need to.
 
 class FileStore : public JournalingObjectStore {
-  static const uint32_t on_disk_version = 1;
+  static const uint32_t on_disk_version = 2;
   string basedir, journalpath;
   std::string current_fn;
   std::string current_op_seq_fn;
diff --git a/src/os/FlatIndex.h b/src/os/FlatIndex.h
index d94edf3f69e..53e27f5ec08 100644
--- a/src/os/FlatIndex.h
+++ b/src/os/FlatIndex.h
@@ -35,7 +35,7 @@ public:
   FlatIndex(string base_path) : base_path(base_path) {}
 
   /// @see CollectionIndex
-  uint32_t collection_version() { return 0; }
+  uint32_t collection_version() { return FLAT_INDEX_TAG; }
 
   /// @see CollectionIndex
   void set_ref(std::tr1::shared_ptr<CollectionIndex> ref);
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index fb8fde599ab..bbcbc9904ad 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -128,12 +128,13 @@ public:
   HashIndex(
     const char *base_path, ///< [in] Path to the index root.
     int merge_at,          ///< [in] Merge threshhold.
-    int split_at)	   ///< [in] Split threshhold.
-    : LFNIndex(base_path), merge_threshold(merge_at),
+    int split_at,	   ///< [in] Split threshhold.
+    uint32_t index_version)///< [in] Index version
+    : LFNIndex(base_path, index_version), merge_threshold(merge_at),
       split_threshold(split_at) {}
 
   /// @see CollectionIndex
-  uint32_t collection_version() { return 1; }
+  uint32_t collection_version() { return index_version; }
 
   /// @see CollectionIndex
   int cleanup();
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 251599b1175..504cd8f5217 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -70,7 +70,8 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
   if (r < 0)
     return r;
   HashIndex index(path, g_conf->filestore_merge_threshold, 
-		  g_conf->filestore_split_multiple);
+		  g_conf->filestore_split_multiple,
+		  CollectionIndex::HASH_INDEX_TAG_2);
   return index.init();
 }
 
@@ -84,15 +85,16 @@ int IndexManager::build_index(coll_t c, const char *path, Index *index) {
       return r;
 
     switch (version) {
-    case 0: {
+    case CollectionIndex::FLAT_INDEX_TAG: {
       *index = Index(new FlatIndex(path), 
 		     RemoveOnDelete(c, this));
       return 0;
     }
-    case 1: {
+    case CollectionIndex::HASH_INDEX_TAG: // fall through
+    case CollectionIndex::HASH_INDEX_TAG_2: {
       // Must be a HashIndex
       *index = Index(new HashIndex(path, g_conf->filestore_merge_threshold,
-				   g_conf->filestore_split_multiple), 
+				   g_conf->filestore_split_multiple, version), 
 		     RemoveOnDelete(c, this));
       return 0;
     }
@@ -102,7 +104,8 @@ int IndexManager::build_index(coll_t c, const char *path, Index *index) {
   } else {
     // No need to check
     *index = Index(new HashIndex(path, g_conf->filestore_merge_threshold,
-				 g_conf->filestore_split_multiple), 
+				 g_conf->filestore_split_multiple,
+				 CollectionIndex::HASH_INDEX_TAG_2), 
 		   RemoveOnDelete(c, this));
     return 0;
   }
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index d6903032e3c..557b69a564d 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -256,6 +256,23 @@ int LFNIndex::get_mangled_name(const vector<string> &from,
   return lfn_get_name(from, hoid, mangled_name, 0, exists);
 }
 
+static int get_hobject_from_oinfo(const char *dir, const char *file, 
+				  hobject_t *o) {
+  char path[PATH_MAX];
+  bufferptr bp(PATH_MAX);
+  snprintf(path, sizeof(path), "%s/%s", dir, file);
+  // Hack, user.ceph._ is the attribute used to store the object info
+  int r = do_getxattr(path, "user.ceph._", bp.c_str(), bp.length());
+  if (r < 0)
+    return r;
+  bufferlist bl;
+  bl.push_back(bp);
+  object_info_t oi(bl);
+  *o = oi.soid;
+  return 0;
+}
+
+
 int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
 			   long *handle, map<string, hobject_t> *out) {
   string to_list_path = get_full_path_subdir(to_list);
@@ -295,6 +312,9 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
 	if (!lfn_must_hash(long_name)) {
 	  assert(long_name == short_name);
 	}
+	if (index_version == HASH_INDEX_TAG)
+	  get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
+	  
 	out->insert(pair<string, hobject_t>(short_name, obj));
 	++listed;
       } else {
@@ -413,7 +433,7 @@ int LFNIndex::remove_attr_path(const vector<string> &path,
   return do_removexattr(full_path.c_str(), mangled_attr_name.c_str());
 }
   
-string LFNIndex::lfn_generate_object_name(const hobject_t &hoid) {
+string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid) {
   char s[FILENAME_MAX_LEN];
   char *end = s + sizeof(s);
   char *t = s;
@@ -449,7 +469,55 @@ string LFNIndex::lfn_generate_object_name(const hobject_t &hoid) {
   snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
 
   return string(s);
- }
+}
+
+static void append_escaped(string::const_iterator begin,
+			   string::const_iterator end, 
+			   string *out) {
+  for (string::const_iterator i = begin; i != end; ++i) {
+    if (*i == '\\') {
+      out->append("\\\\");
+    } else if (*i == '/') {
+      out->append("\\s");
+    } else if (*i == '_') {
+      out->append("\\u");
+    } else {
+      out->append(i, i+1);
+    }
+  }
+}
+
+string LFNIndex::lfn_generate_object_name(const hobject_t &hoid) {
+  if (index_version == HASH_INDEX_TAG)
+    return lfn_generate_object_name_keyless(hoid);
+
+  string full_name;
+  string::const_iterator i = hoid.oid.name.begin();
+  if (hoid.oid.name.substr(0, 4) == "DIR_") {
+    full_name.append("\\d");
+    i += 4;
+  } else if (hoid.oid.name[0] == '.') {
+    full_name.append("\\.");
+    ++i;
+  }
+  append_escaped(i, hoid.oid.name.end(), &full_name);
+  full_name.append("_");
+  append_escaped(hoid.key.begin(), hoid.key.end(), &full_name);
+  full_name.append("_");
+
+  char snap_with_hash[PATH_MAX];
+  char *t = snap_with_hash;
+  char *end = t + sizeof(snap_with_hash);
+  if (hoid.snap == CEPH_NOSNAP)
+    t += snprintf(t, end - t, "head");
+  else if (hoid.snap == CEPH_SNAPDIR)
+    t += snprintf(t, end - t, "snapdir");
+  else
+    t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
+  snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+  full_name += string(snap_with_hash);
+  return full_name;
+}
 
 int LFNIndex::lfn_get_name(const vector<string> &path, 
 			   const hobject_t &hoid,
@@ -487,7 +555,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
   for ( ; ; ++i) {
     candidate = lfn_get_short_name(hoid, i);
     candidate_path = get_full_path(path, candidate);
-    r = do_getxattr(candidate_path.c_str(), LFN_ATTR.c_str(), buf, sizeof(buf));
+    r = do_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
     if (r < 0) {
       if (errno != ENODATA && errno != ENOENT)
 	return -errno;
@@ -528,7 +596,7 @@ int LFNIndex::lfn_created(const vector<string> &path,
     return 0;
   string full_path = get_full_path(path, mangled_name);
   string full_name = lfn_generate_object_name(hoid);
-  return do_setxattr(full_path.c_str(), LFN_ATTR.c_str(), 
+  return do_setxattr(full_path.c_str(), get_lfn_attr().c_str(), 
 		     full_name.c_str(), full_name.size());
 }
 
@@ -585,15 +653,15 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
 }
 
 int LFNIndex::lfn_translate(const vector<string> &path,
-				   const string &short_name,
-				   hobject_t *out) {
+			    const string &short_name,
+			    hobject_t *out) {
   if (!lfn_is_hashed_filename(short_name)) {
     return lfn_parse_object_name(short_name, out);
   }
   // Get lfn_attr
   string full_path = get_full_path(path, short_name);
   char attr[PATH_MAX];
-  int r = do_getxattr(full_path.c_str(), LFN_ATTR.c_str(), attr, sizeof(attr) - 1);
+  int r = do_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1);
   if (r < 0)
     return -errno;
   if (r < (int)sizeof(attr))
@@ -666,13 +734,90 @@ static int parse_object(const char *s, hobject_t& o)
   return 0;
 }
 
-bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out) {
+bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t *out) {
   bool r = parse_object(long_name.c_str(), *out);
   if (!r) return r;
   string temp = lfn_generate_object_name(*out);
   return r;
 }
 
+static bool append_unescaped(string::const_iterator begin,
+			     string::const_iterator end, 
+			     string *out) {
+  for (string::const_iterator i = begin; i != end; ++i) {
+    if (*i == '\\') {
+      ++i;
+      if (*i == '\\')
+	out->append("\\");
+      else if (*i == 's')
+	out->append("/");
+      else if (*i == 'u')
+	out->append("_");
+      else
+	return false;
+    } else {
+      out->append(i, i+1);
+    }
+  }
+  return true;
+}
+
+bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out) {
+  if (index_version == HASH_INDEX_TAG)
+    return lfn_parse_object_name_keyless(long_name, out);
+
+  string::const_iterator current = long_name.begin();
+  if (*current == '\\') {
+    ++current;
+    if (current == long_name.end()) {
+      return false;
+    } else if (*current == 'd') {
+      out->oid.name.append("DIR_");
+      ++current;
+    } else if (*current == '.') {
+      out->oid.name.append(".");
+      ++current;
+    } else {
+      --current;
+    }
+  }
+
+  string::const_iterator end = current;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return false;
+  if (!append_unescaped(current, end, &(out->oid.name)))
+    return false;
+
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return false;
+  if (!append_unescaped(current, end, &(out->key)))
+    return false;
+
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return false;
+  string snap(current, end);
+
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end != long_name.end())
+    return false;
+  string hash(current, end);
+
+  if (snap == "head")
+    out->snap = CEPH_NOSNAP;
+  else if (snap == "snapdir")
+    out->snap = CEPH_SNAPDIR;
+  else
+    out->snap = strtoull(snap.c_str(), NULL, 16);
+  sscanf(hash.c_str(), "%X", &out->hash);
+  return true;
+}
+
 bool LFNIndex::lfn_is_hashed_filename(const string &name) {
   if (name.size() < (unsigned)FILENAME_SHORT_LEN) {
     return 0;
@@ -776,13 +921,13 @@ string LFNIndex::demangle_path_component(const string &component) {
 }
 
 int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
-			hobject_t *hoid, string *shortname) {
+				  hobject_t *hoid, string *shortname) {
   const char *beginning = in + get_base_path().size();
   const char *end = beginning;
   while (1) {
     end++;
     beginning = end++;
-    for (; *end != '\0' && *end != '/'; ++end);
+    for ( ; *end != '\0' && *end != '/'; ++end) ;
     if (*end != '\0') {
       out->push_back(demangle_path_component(string(beginning, end - beginning)));
       continue;
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index dedd8465a28..e0321b45dbf 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -76,10 +76,26 @@ class LFNIndex : public CollectionIndex {
   /// For reference counting the collection @see Path
   std::tr1::weak_ptr<CollectionIndex> self_ref;
 
+protected:
+  const uint32_t index_version;
+
+private:
+  string lfn_attribute;
+
 public:
   /// Constructor
-  LFNIndex(const char *base_path) ///< [in] path to Index root
-    : base_path(base_path) {}
+  LFNIndex(
+    const char *base_path, ///< [in] path to Index root
+    uint32_t index_version)
+    : base_path(base_path), index_version(index_version) {
+    if (index_version == HASH_INDEX_TAG) {
+      lfn_attribute = LFN_ATTR;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d", index_version);
+      lfn_attribute = LFN_ATTR + string(buf);
+    }
+  }
 
   /// Virtual destructor
   virtual ~LFNIndex() {}
@@ -306,6 +322,14 @@ protected:
 
 private:
   /* lfn translation functions */
+
+  /**
+   * Gets the version specific lfn attribute tag
+   */
+  const string &get_lfn_attr() const {
+    return lfn_attribute;
+  }
+
   /**
    * Gets the filename corresponsing to hoid in path.
    *
@@ -359,11 +383,22 @@ private:
     ); ///< @return True if short_name is a subdir, false otherwise
 
   /// Generate object name
+  string lfn_generate_object_name_keyless(
+    const hobject_t &hoid ///< [in] Object for which to generate.
+    ); ///< @return Generated object name.
+
+  /// Generate object name
   string lfn_generate_object_name(
     const hobject_t &hoid ///< [in] Object for which to generate.
     ); ///< @return Generated object name.
 
   /// Parse object name
+  bool lfn_parse_object_name_keyless(
+    const string &long_name, ///< [in] Name to parse
+    hobject_t *out	     ///< [out] Resulting Object
+    ); ///< @return True if successfull, False otherwise.
+
+  /// Parse object name
   bool lfn_parse_object_name(
     const string &long_name, ///< [in] Name to parse
     hobject_t *out	     ///< [out] Resulting Object
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 95adc3ff8ac..85d922cab11 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -193,16 +193,15 @@ static int do_convertfs(ObjectStore *store)
   g_ceph_context->_conf->filestore_update_collections = true;
   int r = store->mount();
   if (r < 0)
-    return -r;
+    return r;
 
   uint32_t version;
   r = store->version_stamp_is_valid(&version);
   if (r < 0)
-    return -r;
+    return r;
   if (r == 1) {
     derr << "FileStore is up to date." << dendl;
-    store->umount();
-    return 0;
+    return store->umount();
   } else {
     derr << "FileStore is old at version " << version << ".  Updating..." 
 	 << dendl;
@@ -212,7 +211,7 @@ static int do_convertfs(ObjectStore *store)
   vector<coll_t> collections;
   r = store->list_collections(collections);
   if (r < 0)
-    return -r;
+    return r;
 
   derr << collections.size() << " to process." << dendl;
   int processed = 0;
@@ -242,8 +241,7 @@ static int do_convertfs(ObjectStore *store)
   store->sync_and_flush();
   store->sync();
   cerr << "Version stamp updated, done!" << std::endl;
-  store->umount();
-  return 0;
+  return store->umount();
 }
 
 int OSD::convertfs(const std::string &dev, const std::string &jdev)
@@ -5069,7 +5067,8 @@ void OSD::handle_op(MOSDOp *op)
 
   if ((op->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) {
     // missing object?
-    hobject_t head(op->get_oid(), CEPH_NOSNAP, op->get_pg().ps());
+    hobject_t head(op->get_oid(), op->get_object_locator().key,
+		   CEPH_NOSNAP, op->get_pg().ps());
     if (pg->is_missing_object(head)) {
       pg->wait_for_missing_object(head, op);
       pg->unlock();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index c0f383b6e0f..d77639c2a3b 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2250,6 +2250,7 @@ void PG::read_log(ObjectStore *store)
 	     ++i) {
 	  if (i->oid == e.soid.oid && i->snap == e.soid.snap) {
 	    e.soid.hash = i->hash;
+	    e.soid.key = i->key;
 	    found = true;
 	    break;
 	  }
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 61e8778fd14..d5f810d5130 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -393,7 +393,7 @@ public:
       }
 
       void encode(bufferlist &bl) const {
-	__u8 struct_v = 2;
+	__u8 struct_v = 3;
 	::encode(struct_v, bl);
 	::encode(op, bl);
 	::encode(soid, bl);
@@ -417,6 +417,8 @@ public:
 	} else {
 	  ::decode(soid, bl);
 	}
+	if (struct_v < 3)
+	  invalid_hash = true;
 	::decode(version, bl);
 	::decode(prior_version, bl);
 	::decode(reqid, bl);
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index b9a50f95936..700b02c01c3 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -404,7 +404,9 @@ void ReplicatedPG::do_op(MOSDOp *op)
   ObjectContext *obc;
   bool can_create = op->may_write();
   snapid_t snapid;
-  int r = find_object_context(hobject_t(op->get_oid(), op->get_snapid(), op->get_pg().ps()),
+  int r = find_object_context(hobject_t(op->get_oid(), 
+					op->get_object_locator().key,
+					op->get_snapid(), op->get_pg().ps()),
 			      op->get_object_locator(),
 			      &obc, can_create, &snapid);
 
@@ -416,7 +418,8 @@ void ReplicatedPG::do_op(MOSDOp *op)
       if (is_primary() || (!(op->get_rmw_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) {
 	// missing the specific snap we need; requeue and wait.
 	assert(!can_create); // only happens on a read
-	hobject_t soid(op->get_oid(), snapid, op->get_pg().ps());
+	hobject_t soid(op->get_oid(), op->get_object_locator().key,
+		       snapid, op->get_pg().ps());
 	wait_for_missing_object(soid, op);
 	return;
       }
@@ -476,17 +479,19 @@ void ReplicatedPG::do_op(MOSDOp *op)
   map<hobject_t,ObjectContext*> src_obc;
   for (vector<OSDOp>::iterator p = op->ops.begin(); p != op->ops.end(); p++) {
     OSDOp& osd_op = *p;
-    hobject_t toid(osd_op.soid, op->get_pg().ps());
+    hobject_t toid(osd_op.soid, op->get_object_locator().key, op->get_pg().ps());
     if (osd_op.soid.oid.name.length()) {
       if (!src_obc.count(toid)) {
 	ObjectContext *sobc;
 	snapid_t ssnapid;
-	int r = find_object_context(hobject_t(toid.oid, toid.snap, op->get_pg().ps()),
+	int r = find_object_context(hobject_t(toid.oid, op->get_object_locator().key,
+					      toid.snap, op->get_pg().ps()),
 				    op->get_object_locator(),
 				    &sobc, false, &ssnapid);
 	if (r == -EAGAIN) {
 	  // missing the specific snap we need; requeue and wait.
-	  hobject_t wait_oid(osd_op.soid.oid, ssnapid, op->get_pg().ps());
+	  hobject_t wait_oid(osd_op.soid.oid, op->get_object_locator().key, 
+			     ssnapid, op->get_pg().ps());
 	  wait_for_missing_object(wait_oid, op);
 	} else if (r) {
 	  osd->reply_op_error(op, r);
@@ -816,7 +821,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid,
   // load clone info
   bufferlist bl;
   ObjectContext *obc = 0;
-  int r = find_object_context(hobject_t(coid.oid, sn, coid.hash),
+  int r = find_object_context(hobject_t(coid.oid, coid.key, sn, coid.hash),
 			      OLOC_BLANK, &obc, false, NULL);
   if (r == -ENOENT || coid.snap != obc->obs.oi.soid.snap) {
     if (obc) put_object_context(obc);
@@ -829,7 +834,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid,
 
   // get snap set context
   if (!obc->ssc)
-    obc->ssc = get_snapset_context(coid.oid, coid.hash, false);
+    obc->ssc = get_snapset_context(coid.oid, coid.key, coid.hash, false);
   SnapSetContext *ssc = obc->ssc;
   assert(ssc);
   SnapSet& snapset = ssc->snapset;
@@ -926,7 +931,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid,
   // save head snapset
   dout(10) << coid << " new snapset " << snapset << dendl;
 
-  hobject_t snapoid(coid.oid, snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash);
+  hobject_t snapoid(coid.oid, coid.key, snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash);
   ctx->snapset_obc = get_object_context(snapoid, coi.oloc, false);
   assert(ctx->snapset_obc->registered);
   if (snapset.clones.empty() && !snapset.head_exists) {
@@ -1157,7 +1162,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops,
 
     ObjectContext *src_obc = 0;
     if (ceph_osd_op_type_multi(op.op)) {
-      src_obc = ctx->src_obc[hobject_t(osd_op.soid, soid.hash)];
+      src_obc = ctx->src_obc[hobject_t(osd_op.soid, soid.key, soid.hash)];
       assert(src_obc);
     }
 
@@ -1678,8 +1683,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops,
 	  result = -EINVAL;
 	  break;
 	}
-	t.clone_range(coll, hobject_t(osd_op.soid, obs.oi.soid.hash), obs.oi.soid,
-		      op.clonerange.src_offset, op.clonerange.length, op.clonerange.offset);
+	t.clone_range(coll, hobject_t(osd_op.soid, obs.oi.soid.key, 
+				      obs.oi.soid.hash),
+		      obs.oi.soid, op.clonerange.src_offset,
+		      op.clonerange.length, op.clonerange.offset);
+		      
 
 	write_update_size_and_usage(ctx->delta_stats, oi, ssc->snapset, ctx->modified_ranges,
 				    op.clonerange.offset, op.clonerange.length, false);
@@ -2096,7 +2104,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
   dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
 
   ObjectContext *rollback_to;
-  int ret = find_object_context(hobject_t(soid.oid, snapid, soid.hash), 
+  int ret = find_object_context(hobject_t(soid.oid, oi.oloc.key, snapid, soid.hash), 
 				oi.oloc, &rollback_to, false, &cloneid);
   if (ret) {
     if (-ENOENT == ret) {
@@ -2109,7 +2117,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
       /* a different problem, like degraded pool
        * with not-yet-restored object. We shouldn't have been able
        * to get here; recovery should have completed first! */
-      hobject_t rollback_target(soid.oid, cloneid, soid.hash);
+      hobject_t rollback_target(soid.oid, soid.key, cloneid, soid.hash);
       assert(is_missing_object(rollback_target));
       dout(20) << "_rollback_to attempted to roll back to a missing object " 
 	       << rollback_target << " (requested snapid: ) " << snapid << dendl;
@@ -2546,7 +2554,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
     ctx->op_t.setattr(coll, soid, SS_ATTR, bss);   
     if (!head_existed) {
       // if we logically recreated the head, remove old _snapdir object
-      hobject_t snapoid(soid.oid, CEPH_SNAPDIR, soid.hash);
+      hobject_t snapoid(soid.oid, soid.key, CEPH_SNAPDIR, soid.hash);
 
       ctx->snapset_obc = get_object_context(snapoid, ctx->new_obs.oi.oloc, false);
       if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
@@ -2563,7 +2571,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
     }
   } else if (ctx->new_snapset.clones.size()) {
     // save snapset on _snap
-    hobject_t snapoid(soid.oid, CEPH_SNAPDIR, soid.hash);
+    hobject_t snapoid(soid.oid, soid.key, CEPH_SNAPDIR, soid.hash);
     dout(10) << " final snapset " << ctx->new_snapset
 	     << " in " << snapoid << dendl;
     ctx->at_version.version++;
@@ -2809,7 +2817,8 @@ void ReplicatedPG::eval_repop(RepGather *repop)
 	  reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), 0);
 	reply->add_flags(CEPH_OSD_FLAG_ACK);
 	dout(10) << " sending ack on " << *repop << " " << reply << dendl;
-	osd->cluster_messenger->send_message(reply, op->get_connection());
+        assert(entity_name_t::TYPE_OSD != op->get_connection()->peer_type);
+	osd->client_messenger->send_message(reply, op->get_connection());
 	repop->sent_ack = true;
       }
 
@@ -3017,13 +3026,13 @@ ReplicatedPG::ObjectContext *ReplicatedPG::get_object_context(const hobject_t& s
 
       SnapSetContext *ssc = NULL;
       if (can_create)
-	ssc = get_snapset_context(soid.oid, soid.hash, true);
+	ssc = get_snapset_context(soid.oid, soid.key, soid.hash, true);
       obc = new ObjectContext(oi, true, ssc);
     }
     register_object_context(obc);
 
     if (can_create && !obc->ssc)
-      obc->ssc = get_snapset_context(soid.oid, soid.hash, true);
+      obc->ssc = get_snapset_context(soid.oid, soid.key, soid.hash, true);
 
     if (r >= 0) {
       obc->obs.oi.decode(bv);
@@ -3058,7 +3067,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
 				      snapid_t *psnapid)
 {
   // want the head?
-  hobject_t head(oid.oid, CEPH_NOSNAP, oid.hash);
+  hobject_t head(oid.oid, oid.key, CEPH_NOSNAP, oid.hash);
   if (oid.snap == CEPH_NOSNAP) {
     ObjectContext *obc = get_object_context(head, oloc, can_create);
     if (!obc)
@@ -3067,13 +3076,13 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
     *pobc = obc;
 
     if (can_create && !obc->ssc)
-      obc->ssc = get_snapset_context(oid.oid, oid.hash, true);
+      obc->ssc = get_snapset_context(oid.oid, oid.key, oid.hash, true);
 
     return 0;
   }
 
   // we want a snap
-  SnapSetContext *ssc = get_snapset_context(oid.oid, oid.hash, can_create);
+  SnapSetContext *ssc = get_snapset_context(oid.oid, oid.key, oid.hash, can_create);
   if (!ssc)
     return -ENOENT;
 
@@ -3115,7 +3124,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
     put_snapset_context(ssc);
     return -ENOENT;
   }
-  hobject_t soid(oid.oid, ssc->snapset.clones[k], oid.hash);
+  hobject_t soid(oid.oid, oid.key, ssc->snapset.clones[k], oid.hash);
 
   put_snapset_context(ssc); // we're done with ssc
   ssc = 0;
@@ -3179,7 +3188,9 @@ void ReplicatedPG::put_object_contexts(map<hobject_t,ObjectContext*>& obcv)
   obcv.clear();
 }
 
-ReplicatedPG::SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid, ps_t seed,
+ReplicatedPG::SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
+								const string& key,
+								ps_t seed,
 								bool can_create)
 {
   SnapSetContext *ssc;
@@ -3188,11 +3199,11 @@ ReplicatedPG::SnapSetContext *ReplicatedPG::get_snapset_context(const object_t&
     ssc = p->second;
   } else {
     bufferlist bv;
-    hobject_t head(oid, CEPH_NOSNAP, seed);
+    hobject_t head(oid, key, CEPH_NOSNAP, seed);
     int r = osd->store->getattr(coll, head, SS_ATTR, bv);
     if (r < 0) {
       // try _snapset
-      hobject_t snapdir(oid, CEPH_SNAPDIR, seed);
+      hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed);
       r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
       if (r < 0 && !can_create)
 	return NULL;
@@ -3602,7 +3613,7 @@ int ReplicatedPG::pull(const hobject_t& soid)
     }
 
     // check snapset
-    SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false);
+    SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false);
     dout(10) << " snapset " << ssc->snapset << dendl;
     calc_clone_subsets(ssc->snapset, soid, missing,
 		       data_subset, clone_subsets);
@@ -3710,7 +3721,7 @@ void ReplicatedPG::push_to_replica(ObjectContext *obc, const hobject_t& soid, in
       return push_start(snapdir, peer);
     }
     
-    SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false);
+    SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false);
     dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
     calc_clone_subsets(ssc->snapset, soid, peer_missing[peer],
 		       data_subset, clone_subsets);
@@ -3718,7 +3729,7 @@ void ReplicatedPG::push_to_replica(ObjectContext *obc, const hobject_t& soid, in
   } else if (soid.snap == CEPH_NOSNAP) {
     // pushing head or unversioned object.
     // base this on partially on replica's clones?
-    SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false);
+    SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false);
     dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
     calc_head_subsets(ssc->snapset, soid, peer_missing[peer], data_subset, clone_subsets);
     put_snapset_context(ssc);
@@ -4062,7 +4073,7 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op)
 
     if (soid.snap && soid.snap < CEPH_NOSNAP) {
       // clone.  make sure we have enough data.
-      SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false);
+      SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false);
       assert(ssc);
 
       clone_subsets.clear();   // forget what pusher said; recalculate cloning.
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index faadd3f03bf..f9295e4e13c 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -510,7 +510,8 @@ protected:
 			  ObjectContext **pobc,
 			  bool can_create, snapid_t *psnapid=NULL);
 
-  SnapSetContext *get_snapset_context(const object_t& oid, ps_t seed, bool can_create);
+  SnapSetContext *get_snapset_context(const object_t& oid, const string &key,
+				      ps_t seed, bool can_create);
   void register_snapset_context(SnapSetContext *ssc) {
     if (!ssc->registered) {
       ssc->registered = true;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 75a93ca73fd..2bdf099d0cf 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -574,7 +574,7 @@ ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
 
 void object_info_t::encode(bufferlist& bl) const
 {
-  const __u8 v = 6;
+  const __u8 v = 7;
   ::encode(v, bl);
   ::encode(soid, bl);
   ::encode(oloc, bl);
@@ -603,12 +603,15 @@ void object_info_t::decode(bufferlist::iterator& bl)
     sobject_t obj;
     ::decode(obj, bl);
     ::decode(oloc, bl);
-    soid = hobject_t(obj.oid, obj.snap, 0);
+    soid = hobject_t(obj.oid, oloc.key, obj.snap, 0);
     soid.hash = legacy_object_locator_to_ps(soid.oid, oloc);
   } else if (v >= 6) {
     ::decode(soid, bl);
     ::decode(oloc, bl);
+    if (v == 6)
+      soid.key = oloc.key;
   }
+    
   if (v >= 5)
     ::decode(category, bl);
   ::decode(version, bl);
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 19ef1224b57..a4dad0ddf1a 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -632,7 +632,6 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
           << dendl;
   if (objects[poolid].count(oid) == 0) {
     ldout(cct, 7) << "bh_write_commit no object cache" << dendl;
-    assert(0);
   } else {
     Object *ob = objects[poolid][oid];
     
@@ -642,7 +641,8 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
          p++) {
       BufferHead *bh = p->second;
       
-      if (bh->start() > start+(loff_t)length) break;
+      if (bh->start() > start+(loff_t)length)
+	break;
 
       if (bh->start() < start &&
           bh->end() > start+(loff_t)length) {
diff --git a/src/rados.cc b/src/rados.cc
index 39cc1e2f1a2..8b43bb08375 100644
--- a/src/rados.cc
+++ b/src/rados.cc
@@ -753,6 +753,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 	     "KB", "objects", "clones", "degraded",
 	     "unfound", "rd", "rd KB", "wr", "wr KB");
     } else {
+      formatter->open_object_section("stats");
       formatter->open_array_section("pools");
     }
     for (map<string, librados::stats_map>::iterator c = stats.begin(); c != stats.end(); ++c) {
@@ -825,6 +826,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       printf("  total avail   %12lld\n", (long long unsigned)tstats.kb_avail);
       printf("  total space   %12lld\n", (long long unsigned)tstats.kb);
     } else {
+      formatter->close_section();
       formatter->dump_format("total_objects", "%lld", (long long unsigned)tstats.num_objects);
       formatter->dump_format("total_used", "%lld", (long long unsigned)tstats.kb_used);
       formatter->dump_format("total_avail", "%lld", (long long unsigned)tstats.kb_avail);
diff --git a/src/rgw/rgw_access.h b/src/rgw/rgw_access.h
index 63e199df168..9a7a19623a7 100644
--- a/src/rgw/rgw_access.h
+++ b/src/rgw/rgw_access.h
@@ -232,7 +232,7 @@ public:
 
   virtual bool supports_tmap() { return false; }
 
-  virtual int tmap_get(rgw_obj& obj, bufferlist& bl) { return -ENOTSUP; }
+  virtual int tmap_get(rgw_obj& obj, bufferlist& header, std::map<string, bufferlist>& m) { return -ENOTSUP; }
   virtual int tmap_set(rgw_obj& obj, std::string& key, bufferlist& bl) { return -ENOTSUP; }
   virtual int tmap_set(rgw_obj& obj, map<std::string, bufferlist>& m) { return -ENOTSUP; }
   virtual int tmap_create(rgw_obj& obj, std::string& key, bufferlist& bl) { return -ENOTSUP; }
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 7ed07881956..dd6d31f9f85 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -57,11 +57,11 @@ void _usage()
   cerr << "   --uid=<id>                user id\n";
   cerr << "   --subuser=<name>          subuser name\n";
   cerr << "   --access-key=<key>        S3 access key\n";
-  cerr << "   --os-user=<group:name>    OpenStack user\n";
+  cerr << "   --swift-user=<group:name> Swift user\n";
   cerr << "   --email=<email>\n";
   cerr << "   --auth_uid=<auid>         librados uid\n";
   cerr << "   --secret=<key>            S3 key\n";
-  cerr << "   --os-secret=<key>         OpenStack key\n";
+  cerr << "   --swift-secret=<key>      Swift key\n";
   cerr << "   --gen-access-key          generate random access key\n";
   cerr << "   --gen-secret              generate random secret key\n";
   cerr << "   --access=<access>         Set access permissions for sub-user, should be one\n";
@@ -266,32 +266,76 @@ string escape_str(string& src, char c)
   return dest;
 }
 
-static void show_user_info( RGWUserInfo& info)
+static void show_user_info(RGWUserInfo& info, const char *format, Formatter *formatter)
 {
   map<string, RGWAccessKey>::iterator kiter;
   map<string, RGWSubUser>::iterator uiter;
 
-  cout << "User ID: " << info.user_id << std::endl;
-  cout << "RADOS UID: " << info.auid << std::endl;
-  cout << "Keys:" << std::endl;
-  for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
-    RGWAccessKey& k = kiter->second;
-    cout << " User: " << info.user_id << (k.subuser.empty() ? "" : ":") << k.subuser << std::endl;
-    cout << "  Access Key: " << k.id << std::endl;
-    cout << "  Secret Key: " << k.key << std::endl;
-  }
-  cout << "Users: " << std::endl;
-  for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
-    RGWSubUser& u = uiter->second;
-    cout << " Name: " << info.user_id << ":" << u.name << std::endl;
-    char buf[256];
-    perm_to_str(u.perm_mask, buf, sizeof(buf));
-    cout << " Permissions: " << buf << std::endl;
+
+  if (!format) {
+    cout << "User ID: " << info.user_id << std::endl;
+    cout << "RADOS UID: " << info.auid << std::endl;
+    cout << "Keys:" << std::endl;
+    for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
+      RGWAccessKey& k = kiter->second;
+      cout << " User: " << info.user_id << (k.subuser.empty() ? "" : ":") << k.subuser << std::endl;
+      cout << "  Access Key: " << k.id << std::endl;
+      cout << "  Secret Key: " << k.key << std::endl;
+    }
+    cout << "Users: " << std::endl;
+    for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
+      RGWSubUser& u = uiter->second;
+      cout << " Name: " << info.user_id << ":" << u.name << std::endl;
+      char buf[256];
+      perm_to_str(u.perm_mask, buf, sizeof(buf));
+      cout << " Permissions: " << buf << std::endl;
+    }
+    cout << "Display Name: " << info.display_name << std::endl;
+    cout << "Email: " << info.user_email << std::endl;
+    cout << "Swift User: " << (info.swift_name.size() ? info.swift_name : "<undefined>")<< std::endl;
+    cout << "Swift Key: " << (info.swift_key.size() ? info.swift_key : "<undefined>")<< std::endl;
+  } else {
+    formatter->open_object_section("user_info");
+
+    formatter->dump_string("user_id", info.user_id.c_str());
+    formatter->dump_format("rados_uid", "%lld", info.auid);
+    formatter->dump_string("display_name", info.display_name.c_str());
+    formatter->dump_string("email", info.user_email.c_str());
+    formatter->dump_string("swift_user", info.swift_name.c_str());
+    formatter->dump_string("swift_key", info.swift_key.c_str());
+
+    // keys
+    formatter->open_array_section("keys");
+    for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
+      RGWAccessKey& k = kiter->second;
+      const char *sep = (k.subuser.empty() ? "" : ":");
+      const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+      formatter->open_object_section("key");
+      formatter->dump_format("user", "%s%s%s", info.user_id.c_str(), sep, subuser);
+      formatter->dump_string("access_key", k.id);
+      formatter->dump_string("secret_key", k.key);
+      formatter->close_section();
+      formatter->flush(cout);
+    }
+    formatter->close_section();
+
+    // subusers
+    formatter->open_array_section("subusers");
+    for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
+      RGWSubUser& u = uiter->second;
+      formatter->open_object_section("user");
+      formatter->dump_format("id", "%s:%s", info.user_id.c_str(), u.name.c_str());
+      char buf[256];
+      perm_to_str(u.perm_mask, buf, sizeof(buf));
+      formatter->dump_string("permissions", buf);
+      formatter->close_section();
+      formatter->flush(cout);
+    }
+    formatter->close_section();
+
+    formatter->close_section();
+    formatter->flush(cout);
   }
-  cout << "Display Name: " << info.display_name << std::endl;
-  cout << "Email: " << info.user_email << std::endl;
-  cout << "OpenStack User: " << (info.openstack_name.size() ? info.openstack_name : "<undefined>")<< std::endl;
-  cout << "OpenStack Key: " << (info.openstack_key.size() ? info.openstack_key : "<undefined>")<< std::endl;
 }
 
 static int create_bucket(string bucket_str, string& user_id, string& display_name, uint64_t auid)
@@ -358,11 +402,11 @@ static void remove_old_indexes(RGWUserInfo& old_info, RGWUserInfo new_info)
     }
   }
 
-  if (!old_info.openstack_name.empty() &&
-      old_info.openstack_name.compare(new_info.openstack_name) != 0) {
-    ret = rgw_remove_openstack_name_index(new_info.user_id, old_info.openstack_name);
+  if (!old_info.swift_name.empty() &&
+      old_info.swift_name.compare(new_info.swift_name) != 0) {
+    ret = rgw_remove_swift_name_index(new_info.user_id, old_info.swift_name);
     if (ret < 0 && ret != -ENOENT) {
-      cerr << "ERROR: could not remove index for openstack_name " << old_info.openstack_name << " return code: " << ret << std::endl;
+      cerr << "ERROR: could not remove index for swift_name " << old_info.swift_name << " return code: " << ret << std::endl;
       success = false;
     }
   }
@@ -421,7 +465,12 @@ int process_intent_log(rgw_bucket& bucket, string& oid, time_t epoch, int flags,
   try {
     while (!iter.end()) {
       struct rgw_intent_log_entry entry;
-      ::decode(entry, iter);
+      try {
+        ::decode(entry, iter);
+      } catch (buffer::error& err) {
+        RGW_LOG(0) << "ERROR: " << __func__ << "(): caught buffer::error" << dendl;
+        return -EIO;
+      }
       if (entry.op_time.sec() > epoch) {
         cerr << "skipping entry for obj=" << obj << " entry.op_time=" << entry.op_time.sec() << " requested epoch=" << epoch << std::endl;
         cerr << "skipping intent log" << std::endl; // no use to continue
@@ -484,7 +533,7 @@ int main(int argc, char **argv)
   common_init_finish(g_ceph_context);
 
   std::string user_id, access_key, secret_key, user_email, display_name;
-  std::string bucket_name, object, openstack_user, openstack_key;
+  std::string bucket_name, object, swift_user, swift_key;
   std::string date, time, subuser, access, format;
   rgw_bucket bucket;
   uint32_t perm_mask = 0;
@@ -540,9 +589,9 @@ int main(int argc, char **argv)
       }
       auid = tmp;
     } else if (ceph_argparse_witharg(args, i, &val, "--os-user", (char*)NULL)) {
-      openstack_user = val;
+      swift_user = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--os-secret", (char*)NULL)) {
-      openstack_key = val;
+      swift_key = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--date", (char*)NULL)) {
       date = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--time", (char*)NULL)) {
@@ -654,12 +703,12 @@ int main(int argc, char **argv)
 	cerr << "could not find user by specified access key" << std::endl;
       }
     }
-    if (!found && (!openstack_user.empty())) {
-      s = openstack_user;
-      if (rgw_get_user_info_by_openstack(s, info) >= 0) {
+    if (!found && (!swift_user.empty())) {
+      s = swift_user;
+      if (rgw_get_user_info_by_swift(s, info) >= 0) {
 	found = true;
       } else
-        cerr << "could not find user by specified openstack username" << std::endl;
+        cerr << "could not find user by specified swift username" << std::endl;
     }
     if (found)
       user_id = info.user_id.c_str();
@@ -799,10 +848,10 @@ int main(int argc, char **argv)
       info.user_email = user_email;
     if (auid != (uint64_t)-1)
       info.auid = auid;
-    if (!openstack_user.empty())
-      info.openstack_name = openstack_user;
-    if (!openstack_key.empty())
-      info.openstack_key = openstack_key;
+    if (!swift_user.empty())
+      info.swift_name = swift_user;
+    if (!swift_key.empty())
+      info.swift_key = swift_key;
     if (!subuser.empty()) {
       RGWSubUser u;
       u.name = subuser;
@@ -817,7 +866,7 @@ int main(int argc, char **argv)
 
     remove_old_indexes(old_info, info);
 
-    show_user_info(info);
+    show_user_info(info, format.c_str(), formatter);
     break;
 
   case OPT_SUBUSER_RM:
@@ -828,7 +877,7 @@ int main(int argc, char **argv)
       cerr << "error storing user info: " << cpp_strerror(-err) << std::endl;
       break;
     }
-    show_user_info(info);
+    show_user_info(info, format.c_str(), formatter);
     break;
 
   case OPT_KEY_RM:
@@ -843,11 +892,11 @@ int main(int argc, char **argv)
         break;
       }
     }
-    show_user_info(info);
+    show_user_info(info, format.c_str(), formatter);
     break;
 
   case OPT_USER_INFO:
-    show_user_info(info);
+    show_user_info(info, format.c_str(), formatter);
     break;
   }
 
@@ -859,7 +908,12 @@ int main(int argc, char **argv)
     RGWAccessControlPolicy policy;
     if (ret >= 0) {
       bufferlist::iterator iter = bl.begin();
-      policy.decode(iter);
+      try {
+        policy.decode(iter);
+      } catch (buffer::error& err) {
+        RGW_LOG(0) << "ERROR: caught buffer::error, could not decode policy" << dendl;
+        return -EIO;
+      }
       policy.to_xml(cout);
       cout << std::endl;
     }
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index a1c440fee3b..94eebd8f045 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -15,8 +15,6 @@ static rgw_bucket pi_buckets(BUCKETS_POOL_NAME);
 static string avail_pools = ".pools.avail";
 static string pool_name_prefix = "p";
 
-#define POOLS_PREALLOCATE_NUM 100
-
 
 int rgw_store_bucket_info(string& bucket_name, RGWBucketInfo& info)
 {
@@ -49,7 +47,12 @@ int rgw_get_bucket_info(string& bucket_name, RGWBucketInfo& info)
   }
 
   bufferlist::iterator iter = bl.begin();
-  ::decode(info, iter);
+  try {
+    ::decode(info, iter);
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
+    return -EIO;
+  }
 
   RGW_LOG(0) << "rgw_get_bucket_info: bucket=" << info.bucket << dendl;
 
@@ -64,11 +67,11 @@ int rgw_remove_bucket_info(string& bucket_name)
   return ret;
 }
 
-static int generate_preallocated_pools(vector<string>& pools)
+static int generate_preallocated_pools(vector<string>& pools, int num)
 {
   vector<string> names;
 
-  for (int i = 0; i < POOLS_PREALLOCATE_NUM; i++) {
+  for (int i = 0; i < num; i++) {
     string name = pool_name_prefix;
     append_rand_alpha(pool_name_prefix, name, 8);
     names.push_back(name);
@@ -98,18 +101,8 @@ static int generate_preallocated_pools(vector<string>& pools)
   return 0;
 }
 
-static int generate_pool(string& bucket_name, rgw_bucket& bucket)
+static int register_available_pools(vector<string>& pools)
 {
-  vector<string> pools;
-  int ret = generate_preallocated_pools(pools);
-  if (ret < 0) {
-    RGW_LOG(0) << "generate_preallocad_pools returned " << ret << dendl;
-    return ret;
-  }
-  bucket.pool = pools.back();
-  pools.pop_back();
-  bucket.name = bucket_name;
-
   map<string, bufferlist> m;
   vector<string>::iterator iter;
 
@@ -119,7 +112,7 @@ static int generate_pool(string& bucket_name, rgw_bucket& bucket)
     m[name] = bl;
   }
   rgw_obj obj(pi_buckets, avail_pools);
-  ret = rgwstore->tmap_set(obj, m);
+  int ret = rgwstore->tmap_set(obj, m);
   if (ret == -ENOENT) {
     rgw_bucket new_bucket;
     map<string,bufferlist> attrs;
@@ -136,6 +129,26 @@ static int generate_pool(string& bucket_name, rgw_bucket& bucket)
   return 0;
 }
 
+static int generate_pool(string& bucket_name, rgw_bucket& bucket)
+{
+  vector<string> pools;
+  int ret = generate_preallocated_pools(pools, g_conf->rgw_pools_preallocate_max);
+  if (ret < 0) {
+    RGW_LOG(0) << "generate_preallocad_pools returned " << ret << dendl;
+    return ret;
+  }
+  bucket.pool = pools.back();
+  pools.pop_back();
+  bucket.name = bucket_name;
+
+  ret = register_available_pools(pools);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
 static int withdraw_pool(string& pool_name)
 {
   rgw_obj obj(pi_buckets, avail_pools);
@@ -143,15 +156,45 @@ static int withdraw_pool(string& pool_name)
   return rgwstore->tmap_set(obj, pool_name, bl);
 }
 
+int rgw_bucket_maintain_pools()
+{
+  bufferlist header;
+  map<string, bufferlist> m;
+  string pool_name;
+
+  rgw_obj obj(pi_buckets, avail_pools);
+  int ret = rgwstore->tmap_get(obj, header, m);
+  if (ret < 0 && ret != -ENOENT) {
+      return ret;
+  }
+
+  if ((int)m.size() < g_conf->rgw_pools_preallocate_threshold) {
+    RGW_LOG(0) << "rgw_bucket_maintain_pools allocating pools (m.size()=" << m.size() << " threshold="
+               << g_conf->rgw_pools_preallocate_threshold << ")" << dendl;
+    vector<string> pools;
+    ret = generate_preallocated_pools(pools, g_conf->rgw_pools_preallocate_max - m.size());
+    if (ret < 0) {
+      RGW_LOG(0) << "failed to preallocate pools" << dendl;
+      return ret;
+    }
+    ret = register_available_pools(pools);
+    if (ret < 0) {
+      RGW_LOG(0) << "failed to register available pools" << dendl;
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
 int rgw_bucket_allocate_pool(string& bucket_name, rgw_bucket& bucket)
 {
-  bufferlist bl;
   bufferlist header;
   map<string, bufferlist> m;
   string pool_name;
 
   rgw_obj obj(pi_buckets, avail_pools);
-  int ret = rgwstore->tmap_get(obj, bl);
+  int ret = rgwstore->tmap_get(obj, header, m);
   if (ret < 0) {
     if (ret == -ENOENT) {
       return generate_pool(bucket_name, bucket);
@@ -159,10 +202,6 @@ int rgw_bucket_allocate_pool(string& bucket_name, rgw_bucket& bucket)
     return ret;
   }
 
-  bufferlist::iterator iter = bl.begin();
-  ::decode(header, iter);
-  ::decode(m, iter);
-
   if (!m.size()) {
     return generate_pool(bucket_name, bucket);
   }
diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h
index 63c339c1402..91706f56455 100644
--- a/src/rgw/rgw_bucket.h
+++ b/src/rgw/rgw_bucket.h
@@ -17,6 +17,7 @@ extern int rgw_bucket_allocate_pool(string& bucket_name, rgw_bucket& bucket);
 extern int rgw_create_bucket(std::string& id, string& bucket_name, rgw_bucket& bucket,
                       map<std::string, bufferlist>& attrs, bool exclusive = true, uint64_t auid = 0);
 
+extern int rgw_bucket_maintain_pools(void);
 
 #endif
 
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
index d5c8c11ed13..8a2ab0dbaa4 100644
--- a/src/rgw/rgw_cache.h
+++ b/src/rgw/rgw_cache.h
@@ -320,9 +320,12 @@ int RGWCache<T>::watch_cb(int opcode, uint64_t ver, bufferlist& bl)
   try {
     bufferlist::iterator iter = bl.begin();
     ::decode(info, iter);
-  } catch (buffer::end_of_buffer *err) {
+  } catch (buffer::end_of_buffer& err) {
     RGW_LOG(0) << "ERROR: got bad notification" << dendl;
     return -EIO;
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: buffer::error" << dendl;
+    return -EIO;
   }
 
   string name = normal_name(info.obj);
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index abb644efeac..c1db8fa59c6 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -263,7 +263,10 @@ bool verify_permission(RGWAccessControlPolicy *policy, string& uid, int user_per
    if (!policy)
      return false;
 
-   int acl_perm = policy->get_perm(g_ceph_context, uid, perm) & user_perm_mask;
+   int policy_perm = policy->get_perm(g_ceph_context, uid, perm);
+   int acl_perm = policy_perm & user_perm_mask;
+
+   RGW_LOG(10) << " uid=" << uid << " requested perm (type)=" << perm << ", policy perm=" << policy_perm << ", user_perm_mask=" << user_perm_mask << ", acl perm=" << acl_perm << dendl;
 
    return (perm == acl_perm);
 }
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 6f4b77e2094..4f309bc3e1d 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -67,8 +67,8 @@ extern string rgw_obj_category_none;
 #define RGW_FORMAT_XML          1
 #define RGW_FORMAT_JSON         2
 
-#define RGW_REST_OPENSTACK      0x1
-#define RGW_REST_OPENSTACK_AUTH 0x2
+#define RGW_REST_SWIFT          0x1
+#define RGW_REST_SWIFT_AUTH     0x2
 
 #define RGW_SUSPENDED_USER_AUID (uint64_t)-2
 
@@ -112,6 +112,7 @@ extern string rgw_obj_category_none;
 #define ERR_LENGTH_REQUIRED     2011
 #define ERR_REQUEST_TIME_SKEWED 2012
 #define ERR_USER_SUSPENDED      2100
+#define ERR_INTERNAL_ERROR      2200
 
 typedef void *RGWAccessHandle;
 
@@ -271,8 +272,8 @@ struct RGWUserInfo
   string user_id;
   string display_name;
   string user_email;
-  string openstack_name;
-  string openstack_key;
+  string swift_name;
+  string swift_key;
   map<string, RGWAccessKey> access_keys;
   map<string, RGWSubUser> subusers;
   __u8 suspended;
@@ -295,8 +296,8 @@ struct RGWUserInfo
      ::encode(secret_key, bl);
      ::encode(display_name, bl);
      ::encode(user_email, bl);
-     ::encode(openstack_name, bl);
-     ::encode(openstack_key, bl);
+     ::encode(swift_name, bl);
+     ::encode(swift_key, bl);
      ::encode(user_id, bl);
      ::encode(access_keys, bl);
      ::encode(subusers, bl);
@@ -319,8 +320,8 @@ struct RGWUserInfo
     }
     ::decode(display_name, bl);
     ::decode(user_email, bl);
-    if (ver >= 3) ::decode(openstack_name, bl);
-    if (ver >= 4) ::decode(openstack_key, bl);
+    if (ver >= 3) ::decode(swift_name, bl);
+    if (ver >= 4) ::decode(swift_key, bl);
     if (ver >= 5)
       ::decode(user_id, bl);
     else
@@ -481,7 +482,7 @@ struct req_state {
    string bucket_name_str;
    string object_str;
 
-   map<string, string> x_amz_map;
+   map<string, string> x_meta_map;
 
    RGWUserInfo user; 
    RGWAccessControlPolicy *acl;
diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h
index 5d7ee0139f4..c43b24b050d 100644
--- a/src/rgw/rgw_formats.h
+++ b/src/rgw/rgw_formats.h
@@ -13,7 +13,7 @@ struct plain_stack_entry {
 };
 
 /* FIXME: this class is mis-named.
- * FIXME: This was a hack to send certain openstack messages.
+ * FIXME: This was a hack to send certain swift messages.
  * There is a much better way to do this.
  */
 class RGWFormatter_Plain : public Formatter {
diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc
index 8836c52a2cc..a610e82d8da 100644
--- a/src/rgw/rgw_log.cc
+++ b/src/rgw/rgw_log.cc
@@ -17,6 +17,7 @@ static void set_param_str(struct req_state *s, const char *name, string& str)
 int rgw_log_op(struct req_state *s)
 {
   struct rgw_log_entry entry;
+  uint64_t pool_id;
 
   if (!s->should_log)
     return 0;
@@ -26,8 +27,13 @@ int rgw_log_op(struct req_state *s)
     return -EINVAL;
   }
   if (s->err.ret == -ERR_NO_SUCH_BUCKET) {
-    RGW_LOG(0) << "bucket " << s->bucket << " doesn't exist, not logging" << dendl;
-    return 0;
+    if (!g_conf->rgw_log_nonexistent_bucket) {
+      RGW_LOG(0) << "bucket " << s->bucket << " doesn't exist, not logging" << dendl;
+      return 0;
+    }
+    pool_id = 0;
+  } else {
+    pool_id = s->pool_id;
   }
   entry.bucket = s->bucket_name;
 
@@ -67,7 +73,7 @@ int rgw_log_op(struct req_state *s)
     entry.http_status = "200"; // default
 
   entry.error_code = s->err.s3_code;
-  entry.pool_id = s->pool_id;
+  entry.pool_id = pool_id;
 
   bufferlist bl;
   ::encode(entry, bl);
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 3a6b833a524..0a2e108fabc 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -17,14 +17,16 @@
 #include "common/config.h"
 #include "common/errno.h"
 #include "common/WorkQueue.h"
+#include "common/Timer.h"
 #include "rgw_common.h"
 #include "rgw_access.h"
 #include "rgw_acl.h"
 #include "rgw_user.h"
 #include "rgw_op.h"
 #include "rgw_rest.h"
-#include "rgw_os.h"
+#include "rgw_swift.h"
 #include "rgw_log.h"
+#include "rgw_bucket.h"
 
 #include <map>
 #include <string>
@@ -218,6 +220,16 @@ done:
   RGW_LOG(0) << "====== req done fcgx=" << hex << fcgx << dec << " http_status=" << http_ret << " ======" << dendl;
 }
 
+class C_RGWMaintenanceTick : public Context {
+  SafeTimer *timer;
+public:
+  C_RGWMaintenanceTick(SafeTimer *t) : timer(t) {}
+  void finish(int r) {
+    rgw_bucket_maintain_pools();
+    RGW_LOG(20) << "C_RGWMaintenanceTick::finish()" << dendl;
+    timer->add_event_after(g_conf->rgw_maintenance_tick_interval, new C_RGWMaintenanceTick(timer));
+  }
+};
 /*
  * start up the RADOS connection and then handle HTTP messages as they come in
  */
@@ -253,10 +265,22 @@ int main(int argc, const char **argv)
     return EIO;
   }
 
-  RGWProcess process(g_ceph_context, 20);
+  RGWProcess process(g_ceph_context, g_conf->rgw_thread_pool_size);
+
+  Mutex lock("rgw_timer_lock");
+  SafeTimer timer(g_ceph_context, lock);
+
+  lock.Lock();
+  timer.init();
+  timer.add_event_after(g_conf->rgw_maintenance_tick_interval, new C_RGWMaintenanceTick(&timer));
+  lock.Unlock();
 
   process.run();
 
+  lock.Lock();
+  timer.shutdown();
+  lock.Unlock();
+
   return 0;
 }
 
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index b357fb2683a..2fe571d56ed 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -114,7 +114,7 @@ static void format_xattr(std::string &xattr)
 void get_request_metadata(struct req_state *s, map<string, bufferlist>& attrs)
 {
   map<string, string>::iterator iter;
-  for (iter = s->x_amz_map.begin(); iter != s->x_amz_map.end(); ++iter) {
+  for (iter = s->x_meta_map.begin(); iter != s->x_meta_map.end(); ++iter) {
     const string &name(iter->first);
     string &xattr(iter->second);
 #define X_AMZ_META "x-amz-meta"
@@ -148,7 +148,12 @@ static int get_policy_from_attr(void *ctx, RGWAccessControlPolicy *policy, rgw_o
 
     if (ret >= 0) {
       bufferlist::iterator iter = bl.begin();
-      policy->decode(iter);
+      try {
+        policy->decode(iter);
+      } catch (buffer::error& err) {
+        RGW_LOG(0) << "error: could not decode policy, caught buffer::error" << dendl;
+        return -EIO;
+      }
       if (g_conf->rgw_log >= 15) {
         RGW_LOG(15) << "Read AccessControlPolicy" << dendl;
         policy->to_xml(cerr);
@@ -327,7 +332,7 @@ int RGWListBuckets::verify_permission()
 
 void RGWListBuckets::execute()
 {
-  ret = rgw_read_user_buckets(s->user.user_id, buckets, !!(s->prot_flags & RGW_REST_OPENSTACK));
+  ret = rgw_read_user_buckets(s->user.user_id, buckets, !!(s->prot_flags & RGW_REST_SWIFT));
   if (ret < 0) {
     /* hmm.. something wrong here.. the user was authenticated, so it
        should exist, just try to recreate */
@@ -407,7 +412,7 @@ void RGWListBucket::execute()
   }
   url_decode(s->args.get("delimiter"), delimiter);
 
-  if (s->prot_flags & RGW_REST_OPENSTACK) {
+  if (s->prot_flags & RGW_REST_SWIFT) {
     string path_args;
     url_decode(s->args.get("path"), path_args);
     if (!path_args.empty()) {
@@ -421,7 +426,7 @@ void RGWListBucket::execute()
   }
 
   ret = rgwstore->list_objects(s->user.user_id, s->bucket, max, prefix, delimiter, marker, objs, common_prefixes,
-                               !!(s->prot_flags & RGW_REST_OPENSTACK), no_ns, &is_truncated, NULL);
+                               !!(s->prot_flags & RGW_REST_SWIFT), no_ns, &is_truncated, NULL);
 
 done:
   send_response();
@@ -749,7 +754,6 @@ void RGWPutObj::execute()
         goto done_err;
 
       bl.clear();
-      map<string, bufferlist> meta_attrs;
       RGWUploadPartInfo info;
       string p = "part.";
       p.append(part_num);
@@ -758,13 +762,10 @@ void RGWPutObj::execute()
       info.size = s->obj_size;
       info.modified = ceph_clock_now(g_ceph_context);
       ::encode(info, bl);
-      meta_attrs[p] = bl;
 
       rgw_obj meta_obj(s->bucket, multipart_meta_obj, s->object_str, mp_ns);
 
-      // we don't set a category, since by now a category should have already been assigned
-      string nocategory;
-      ret = rgwstore->put_obj_meta(s->obj_ctx, s->user.user_id, meta_obj, NULL, meta_attrs, nocategory, false);
+      ret = rgwstore->tmap_set(meta_obj, p, bl);
     }
   }
 done:
@@ -1191,11 +1192,12 @@ done:
 }
 
 static int get_multiparts_info(struct req_state *s, string& meta_oid, map<uint32_t, RGWUploadPartInfo>& parts,
-                               RGWAccessControlPolicy& policy, map<string, bufferlist>& new_attrs)
+                               RGWAccessControlPolicy& policy, map<string, bufferlist>& attrs)
 {
   void *handle;
-  map<string, bufferlist> attrs;
+  map<string, bufferlist> parts_map;
   map<string, bufferlist>::iterator iter;
+  bufferlist header;
 
   rgw_obj obj(s->bucket, meta_oid, s->object_str, mp_ns);
 
@@ -1206,24 +1208,35 @@ static int get_multiparts_info(struct req_state *s, string& meta_oid, map<uint32
   if (ret < 0)
     return ret;
 
+  ret = rgwstore->tmap_get(obj, header, parts_map);
+  if (ret < 0)
+    return ret;
+
   for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
     string name = iter->first;
     if (name.compare(RGW_ATTR_ACL) == 0) {
       bufferlist& bl = iter->second;
       bufferlist::iterator bli = bl.begin();
-      ::decode(policy, bli);
-      new_attrs[RGW_ATTR_ACL] = bl;
-      continue;
-    }
-    if (name.compare(0, 5, "part.") != 0) {
-      new_attrs[iter->first] = iter->second;
-      continue;
+      try {
+        ::decode(policy, bli);
+      } catch (buffer::error& err) {
+        RGW_LOG(0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+        return -EIO;
+      }
+      break;
     }
+  }
 
+
+  for (iter = parts_map.begin(); iter != parts_map.end(); ++iter) {
     bufferlist& bl = iter->second;
     bufferlist::iterator bli = bl.begin();
     RGWUploadPartInfo info;
-    ::decode(info, bli);
+    try {
+      ::decode(info, bli);
+    } catch (buffer::error& err) {
+      RGW_LOG(0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+    }
     parts[info.num] = info;
   }
   return 0;
@@ -1454,7 +1467,7 @@ void RGWListBucketMultiparts::execute()
   if (ret < 0)
     goto done;
 
-  if (s->prot_flags & RGW_REST_OPENSTACK) {
+  if (s->prot_flags & RGW_REST_SWIFT) {
     string path_args;
     url_decode(s->args.get("path"), path_args);
     if (!path_args.empty()) {
@@ -1468,7 +1481,7 @@ void RGWListBucketMultiparts::execute()
   }
   marker_meta = marker.get_meta();
   ret = rgwstore->list_objects(s->user.user_id, s->bucket, max_uploads, prefix, delimiter, marker_meta, objs, common_prefixes,
-                               !!(s->prot_flags & RGW_REST_OPENSTACK), mp_ns, &is_truncated, &mp_filter);
+                               !!(s->prot_flags & RGW_REST_SWIFT), mp_ns, &is_truncated, &mp_filter);
   if (objs.size()) {
     vector<RGWObjEnt>::iterator iter;
     RGWMultipartUploadEntry entry;
diff --git a/src/rgw/rgw_os_auth.h b/src/rgw/rgw_os_auth.h
deleted file mode 100644
index 4a31b3dca5c..00000000000
--- a/src/rgw/rgw_os_auth.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef CEPH_RGW_OS_AUTH_H
-#define CEPH_RGW_OS_AUTH_H
-
-#include "rgw_op.h"
-
-#define RGW_OS_TOKEN_EXPIRATION (15 * 60)
-
-extern int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info);
-
-class RGW_OS_Auth_Get : public RGWOp {
-public:
-  RGW_OS_Auth_Get() {}
-  ~RGW_OS_Auth_Get() {}
-
-  int verify_permission() { return 0; }
-  void execute();
-};
-
-class RGWHandler_OS_Auth : public RGWHandler {
-public:
-  RGWHandler_OS_Auth() {}
-  ~RGWHandler_OS_Auth() {}
-  RGWOp *get_op();
-  void put_op(RGWOp *op);
-
-  int authorize();
-  int read_permissions() { return 0; }
-};
-
-#endif
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index f9bc8e6c5a2..404bddb33fb 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -280,7 +280,12 @@ int RGWRados::list_objects(string& id, rgw_bucket& bucket, int max, string& pref
         bufferlist& bl = iter->second;
         bufferlist::iterator i = bl.begin();
         RGWAccessControlPolicy policy;
-        policy.decode_owner(i);
+        try {
+          policy.decode_owner(i);
+        } catch (buffer::error& err) {
+          RGW_LOG(0) << "ERROR: could not decode policy for oid=" << oid << ", caught buffer::error" << dendl;
+          continue;
+        }
         ACLOwner& owner = policy.get_owner();
         obj.owner = owner.get_id();
         obj.owner_display_name = owner.get_display_name();
@@ -793,16 +798,26 @@ int RGWRados::get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, librados::IoCtx& io
   s->exists = true;
 
   bufferlist::iterator oiter = outbl.begin();
-  ::decode(s->attrset, oiter);
+  try {
+    ::decode(s->attrset, oiter);
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: failed decoding s->attrset (obj=" << obj << "), aborting" << dendl;
+    return -EIO;
+  }
 
   map<string, bufferlist>::iterator aiter;
   for (aiter = s->attrset.begin(); aiter != s->attrset.end(); ++aiter) {
     RGW_LOG(0) << "iter->first=" << aiter->first << dendl;
   }
-  ::decode(s->size, oiter);
-  utime_t ut;
-  ::decode(ut, oiter);
-  s->mtime = ut.sec();
+
+  try {
+    ::decode(s->size, oiter);
+    utime_t ut;
+    ::decode(ut, oiter);
+    s->mtime = ut.sec();
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: failed decoding object (obj=" << obj << ") info (either size or mtime), aborting" << dendl;
+  }
 
   s->has_attrs = true;
   map<string, bufferlist>::iterator iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
@@ -1473,8 +1488,9 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, map<string, RGWBucketStats>&
   return 0;
 }
 
-int RGWRados::tmap_get(rgw_obj& obj, bufferlist& bl)
+int RGWRados::tmap_get(rgw_obj& obj, bufferlist& header, std::map<string, bufferlist>& m)
 {
+  bufferlist bl;
   librados::IoCtx io_ctx;
   rgw_bucket& bucket = obj.bucket;
   std::string& oid = obj.object;
@@ -1485,8 +1501,19 @@ int RGWRados::tmap_get(rgw_obj& obj, bufferlist& bl)
   io_ctx.locator_set_key(obj.key);
 
   r = io_ctx.tmap_get(oid, bl);
+  if (r < 0)
+    return r;
 
-  return r;
+  try {
+    bufferlist::iterator iter = bl.begin();
+    ::decode(header, iter);
+    ::decode(m, iter);
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: tmap_get failed, caught buffer::error" << dendl;
+    return -EIO;
+  }
+
+  return 0;
  
 }
 
@@ -1590,13 +1617,13 @@ int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
   int count = 0;
 
   map<string, RGWBucketEnt>::iterator iter;
-  list<string> buckets_list;
+  list<string> pools_list;
   for (iter = m.begin(); iter != m.end(); ++iter) {
-    string bucket_name = iter->first;
-    buckets_list.push_back(bucket_name);
+    string pool_name = iter->second.bucket.pool;
+    pools_list.push_back(pool_name);
   }
   map<std::string,librados::stats_map> sm;
-  int r = rados->get_pool_stats(buckets_list, rgw_obj_category_main, sm);
+  int r = rados->get_pool_stats(pools_list, rgw_obj_category_main, sm);
   if (r < 0)
     return r;
 
@@ -1605,6 +1632,8 @@ int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
   for (miter = sm.begin(), iter = m.begin(); miter != sm.end(), iter != m.end(); ++iter, ++miter) {
     stats_map stats = miter->second;
     stats_map::iterator stats_iter = stats.begin();
+    if (stats_iter == stats.end())
+      continue;
 
     string bucket_name = miter->first;
     RGWBucketEnt& ent = iter->second;
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index a78ff5a5d9a..bcc96835e29 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -2,10 +2,12 @@
 #define CEPH_RGWRADOS_H
 
 #include "include/rados/librados.hpp"
+#include "include/Context.h"
 #include "rgw_access.h"
 #include "rgw_common.h"
 
 class RGWWatcher;
+class SafeTimer;
 
 struct RGWObjState {
   bool is_atomic;
@@ -77,6 +79,19 @@ class RGWRados  : public RGWAccess
 
   int set_buckets_auid(vector<rgw_bucket>& buckets, uint64_t auid);
 
+  Mutex lock;
+  SafeTimer *timer;
+
+  class C_Tick : public Context {
+    RGWRados *rados;
+  public:
+    C_Tick(RGWRados *_r) : rados(_r) {}
+    void finish(int r) {
+      rados->tick();
+    }
+  };
+
+
   RGWWatcher *watcher;
   uint64_t watch_handle;
   librados::IoCtx root_pool_ctx;
@@ -106,7 +121,9 @@ class RGWRados  : public RGWAccess
                  pair<string, bufferlist> *cmp_xattr);
   int delete_obj_impl(void *ctx, std::string& id, rgw_obj& src_obj, bool sync);
 public:
-  RGWRados() : watcher(NULL), watch_handle(0) {}
+  RGWRados() : lock("rados_timer_lock"), timer(NULL), watcher(NULL), watch_handle(0) {}
+
+  void tick();
 
   /** Initialize the RADOS instance and prepare to do other ops */
   virtual int initialize(CephContext *cct);
@@ -228,7 +245,7 @@ public:
   virtual int get_bucket_id(rgw_bucket& bucket, uint64_t *bucket_id);
 
   virtual bool supports_tmap() { return true; }
-  virtual int tmap_get(rgw_obj& obj, bufferlist& bl);
+  virtual int tmap_get(rgw_obj& obj, bufferlist& header, std::map<string, bufferlist>& m);
   virtual int tmap_set(rgw_obj& obj, std::string& key, bufferlist& bl);
   virtual int tmap_set(rgw_obj& obj, map<std::string, bufferlist>& m);
   virtual int tmap_create(rgw_obj& obj, std::string& key, bufferlist& bl);
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 13be0dcaa83..9cf35c555af 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -7,9 +7,9 @@
 #include "rgw_formats.h"
 #include "rgw_op.h"
 #include "rgw_rest.h"
-#include "rgw_rest_os.h"
+#include "rgw_rest_swift.h"
 #include "rgw_rest_s3.h"
-#include "rgw_os_auth.h"
+#include "rgw_swift_auth.h"
 
 #include "rgw_formats.h"
 
@@ -52,6 +52,7 @@ const static struct rgw_html_errors RGW_HTML_ERRORS[] = {
     { EEXIST, 409, "BucketAlreadyExists" },
     { ENOTEMPTY, 409, "BucketNotEmpty" },
     { ERANGE, 416, "InvalidRange" },
+    { ERR_INTERNAL_ERROR, 500, "InternalError" },
 };
 
 void set_req_state_err(struct req_state *s, int err_no)
@@ -97,7 +98,7 @@ void dump_content_length(struct req_state *s, size_t len)
 
 void dump_etag(struct req_state *s, const char *etag)
 {
-  if (s->prot_flags & RGW_REST_OPENSTACK)
+  if (s->prot_flags & RGW_REST_SWIFT)
     CGI_PRINTF(s,"etag: %s\n", etag);
   else
     CGI_PRINTF(s,"ETag: \"%s\"\n", etag);
@@ -428,13 +429,13 @@ void init_entities_from_header(struct req_state *s)
 
   pos = req.find('/');
   if (pos >= 0) {
-    const char *openstack_url_prefix = s->env->get("RGW_OPENSTACK_URL_PREFIX");
-    bool cut_url = (openstack_url_prefix != NULL);
-    if (!openstack_url_prefix)
-      openstack_url_prefix = "v1";
+    const char *swift_url_prefix = s->env->get("RGW_SWIFT_URL_PREFIX");
+    bool cut_url = (swift_url_prefix != NULL);
+    if (!swift_url_prefix)
+      swift_url_prefix = "v1";
     first = req.substr(0, pos);
-    if (first.compare(openstack_url_prefix) == 0) {
-      s->prot_flags |= RGW_REST_OPENSTACK;
+    if (first.compare(swift_url_prefix) == 0) {
+      s->prot_flags |= RGW_REST_SWIFT;
       if (cut_url) {
         next_tok(req, first, '/');
       }
@@ -443,7 +444,7 @@ void init_entities_from_header(struct req_state *s)
     first = req;
   }
 
-  if (s->prot_flags & RGW_REST_OPENSTACK) {
+  if (s->prot_flags & RGW_REST_SWIFT) {
     s->format = 0;
     delete s->formatter;
     s->formatter = new RGWFormatter_Plain;
@@ -459,7 +460,7 @@ void init_entities_from_header(struct req_state *s)
     }
   }
 
-  if (s->prot_flags & RGW_REST_OPENSTACK) {
+  if (s->prot_flags & RGW_REST_SWIFT) {
     string ver;
     string auth_key;
 
@@ -495,7 +496,7 @@ void init_entities_from_header(struct req_state *s)
   }
 
   if (strcmp(s->bucket_name, "auth") == 0)
-    s->prot_flags |= RGW_REST_OPENSTACK_AUTH;
+    s->prot_flags |= RGW_REST_SWIFT_AUTH;
 
   if (pos >= 0) {
     string encoded_obj_str = req.substr(pos+1);
@@ -545,44 +546,44 @@ static void init_auth_info(struct req_state *s)
 {
   const char *p;
 
-  s->x_amz_map.clear();
+  s->x_meta_map.clear();
 
   for (int i=0; (p = s->fcgx->envp[i]); ++i) {
 #define HTTP_X_AMZ "HTTP_X_AMZ"
     if (strncmp(p, HTTP_X_AMZ, sizeof(HTTP_X_AMZ) - 1) == 0) {
-      RGW_LOG(10) << "amz>> " << p << dendl;
-      const char *amz = p+5; /* skip the HTTP_ part */
-      const char *eq = strchr(amz, '=');
+      RGW_LOG(10) << "meta>> " << p << dendl;
+      const char *name = p+5; /* skip the HTTP_ part */
+      const char *eq = strchr(name, '=');
       if (!eq) /* shouldn't happen! */
         continue;
-      int len = eq - amz;
-      char amz_low[len + 1];
+      int len = eq - name;
+      char name_low[len + 1];
       int j;
       for (j=0; j<len; j++) {
-        amz_low[j] = tolower(amz[j]);
-        if (amz_low[j] == '_')
-          amz_low[j] = '-';
+        name_low[j] = tolower(name[j]);
+        if (name_low[j] == '_')
+          name_low[j] = '-';
       }
-      amz_low[j] = 0;
+      name_low[j] = 0;
       string val;
       line_unfold(eq + 1, val);
 
       map<string, string>::iterator iter;
-      iter = s->x_amz_map.find(amz_low);
-      if (iter != s->x_amz_map.end()) {
+      iter = s->x_meta_map.find(name_low);
+      if (iter != s->x_meta_map.end()) {
         string old = iter->second;
         int pos = old.find_last_not_of(" \t"); /* get rid of any whitespaces after the value */
         old = old.substr(0, pos + 1);
         old.append(",");
         old.append(val);
-        s->x_amz_map[amz_low] = old;
+        s->x_meta_map[name_low] = old;
       } else {
-        s->x_amz_map[amz_low] = val;
+        s->x_meta_map[name_low] = val;
       }
     }
   }
   map<string, string>::iterator iter;
-  for (iter = s->x_amz_map.begin(); iter != s->x_amz_map.end(); ++iter) {
+  for (iter = s->x_meta_map.begin(); iter != s->x_meta_map.end(); ++iter) {
     RGW_LOG(10) << "x>> " << iter->first << ":" << iter->second << dendl;
   }
 
@@ -808,8 +809,8 @@ RGWOp *RGWHandler_REST::get_op()
 
 RGWRESTMgr::RGWRESTMgr()
 {
-  m_os_handler = new RGWHandler_REST_OS;
-  m_os_auth_handler = new RGWHandler_OS_Auth;
+  m_os_handler = new RGWHandler_REST_SWIFT;
+  m_os_auth_handler = new RGWHandler_SWIFT_Auth;
   m_s3_handler = new RGWHandler_REST_S3;
 }
 
@@ -827,9 +828,9 @@ RGWHandler *RGWRESTMgr::get_handler(struct req_state *s, FCGX_Request *fcgx,
 
   *init_error = RGWHandler_REST::preprocess(s, fcgx);
 
-  if (s->prot_flags & RGW_REST_OPENSTACK)
+  if (s->prot_flags & RGW_REST_SWIFT)
     handler = m_os_handler;
-  else if (s->prot_flags & RGW_REST_OPENSTACK_AUTH)
+  else if (s->prot_flags & RGW_REST_SWIFT_AUTH)
     handler = m_os_auth_handler;
   else
     handler = m_s3_handler;
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index 5bac7e74470..d94d29952d8 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -147,13 +147,13 @@ public:
   virtual int authorize() = 0;
 };
 
-class RGWHandler_REST_OS;
-class RGWHandler_OS_Auth;
+class RGWHandler_REST_SWIFT;
+class RGWHandler_SWIFT_Auth;
 class RGWHandler_REST_S3;
 
 class RGWRESTMgr {
-  RGWHandler_REST_OS *m_os_handler;
-  RGWHandler_OS_Auth *m_os_auth_handler;
+  RGWHandler_REST_SWIFT *m_os_handler;
+  RGWHandler_SWIFT_Auth *m_os_auth_handler;
   RGWHandler_REST_S3 *m_s3_handler;
 
 public:
diff --git a/src/rgw/rgw_rest_os.h b/src/rgw/rgw_rest_os.h
deleted file mode 100644
index 3c5c346c345..00000000000
--- a/src/rgw/rgw_rest_os.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifndef CEPH_RGW_REST_OS_H
-#define CEPH_RGW_REST_OS_H
-#define TIME_BUF_SIZE 128
-
-#include "rgw_op.h"
-#include "rgw_rest.h"
-
-class RGWGetObj_REST_OS : public RGWGetObj_REST {
-public:
-  RGWGetObj_REST_OS() {}
-  ~RGWGetObj_REST_OS() {}
-
-  int send_response(void *handle);
-};
-
-class RGWListBuckets_REST_OS : public RGWListBuckets_REST {
-public:
-  RGWListBuckets_REST_OS() {}
-  ~RGWListBuckets_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWListBucket_REST_OS : public RGWListBucket_REST {
-public:
-  RGWListBucket_REST_OS() {
-    limit_opt_name = "limit";
-    default_max = 10000;
-  }
-  ~RGWListBucket_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWStatBucket_REST_OS : public RGWStatBucket_REST {
-public:
-  RGWStatBucket_REST_OS() {}
-  ~RGWStatBucket_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWCreateBucket_REST_OS : public RGWCreateBucket_REST {
-public:
-  RGWCreateBucket_REST_OS() {}
-  ~RGWCreateBucket_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWDeleteBucket_REST_OS : public RGWDeleteBucket_REST {
-public:
-  RGWDeleteBucket_REST_OS() {}
-  ~RGWDeleteBucket_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWPutObj_REST_OS : public RGWPutObj_REST {
-public:
-  RGWPutObj_REST_OS() {}
-  ~RGWPutObj_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWDeleteObj_REST_OS : public RGWDeleteObj_REST {
-public:
-  RGWDeleteObj_REST_OS() {}
-  ~RGWDeleteObj_REST_OS() {}
-
-  void send_response();
-};
-
-class RGWCopyObj_REST_OS : public RGWCopyObj_REST {
-public:
-  RGWCopyObj_REST_OS() {}
-  ~RGWCopyObj_REST_OS() {}
-
-  void send_response() {}
-};
-
-class RGWGetACLs_REST_OS : public RGWGetACLs_REST {
-public:
-  RGWGetACLs_REST_OS() {}
-  ~RGWGetACLs_REST_OS() {}
-
-  void send_response() {}
-};
-
-class RGWPutACLs_REST_OS : public RGWPutACLs_REST {
-public:
-  RGWPutACLs_REST_OS() : RGWPutACLs_REST() {}
-  virtual ~RGWPutACLs_REST_OS() {}
-
-  void send_response() {}
-};
-
-
-class RGWHandler_REST_OS : public RGWHandler_REST {
-protected:
-
-  RGWOp *get_retrieve_obj_op(bool get_data);
-  RGWOp *get_retrieve_op(bool get_data);
-  RGWOp *get_create_op();
-  RGWOp *get_delete_op();
-  RGWOp *get_post_op() { return NULL; }
-
-public:
-  RGWHandler_REST_OS() : RGWHandler_REST() {}
-  virtual ~RGWHandler_REST_OS() {}
-
-  int authorize();
-};
-
-#endif
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index e74e7b3ccee..4aa379ce4e9 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -489,7 +489,7 @@ static void get_canon_amz_hdr(struct req_state *s, string& dest)
 {
   dest = "";
   map<string, string>::iterator iter;
-  for (iter = s->x_amz_map.begin(); iter != s->x_amz_map.end(); ++iter) {
+  for (iter = s->x_meta_map.begin(); iter != s->x_meta_map.end(); ++iter) {
     dest.append(iter->first);
     dest.append(":");
     dest.append(iter->second);
diff --git a/src/rgw/rgw_rest_os.cc b/src/rgw/rgw_rest_swift.cc
index 3c346bb3ff9..694e47fff95 100644
--- a/src/rgw/rgw_rest_os.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -1,11 +1,11 @@
 
 #include "common/Formatter.h"
-#include "rgw_os.h"
-#include "rgw_rest_os.h"
+#include "rgw_swift.h"
+#include "rgw_rest_swift.h"
 
 #include <sstream>
 
-void RGWListBuckets_REST_OS::send_response()
+void RGWListBuckets_REST_SWIFT::send_response()
 {
   set_req_state_err(s, ret);
   dump_errno(s);
@@ -57,7 +57,7 @@ void RGWListBuckets_REST_OS::send_response()
   s->formatter->reset();
 }
 
-void RGWListBucket_REST_OS::send_response()
+void RGWListBucket_REST_SWIFT::send_response()
 {
   set_req_state_err(s, (ret < 0 ? ret : 0));
   dump_errno(s);
@@ -125,7 +125,7 @@ static void dump_container_metadata(struct req_state *s, RGWBucketEnt& bucket)
   CGI_PRINTF(s,"X-Container-Bytes-Used: %s\n", buf);
 }
 
-void RGWStatBucket_REST_OS::send_response()
+void RGWStatBucket_REST_SWIFT::send_response()
 {
   if (ret >= 0)
     dump_container_metadata(s, bucket);
@@ -139,7 +139,7 @@ void RGWStatBucket_REST_OS::send_response()
   flush_formatter_to_req_state(s, s->formatter);
 }
 
-void RGWCreateBucket_REST_OS::send_response()
+void RGWCreateBucket_REST_SWIFT::send_response()
 {
   if (ret)
     set_req_state_err(s, ret);
@@ -148,7 +148,7 @@ void RGWCreateBucket_REST_OS::send_response()
   flush_formatter_to_req_state(s, s->formatter);
 }
 
-void RGWDeleteBucket_REST_OS::send_response()
+void RGWDeleteBucket_REST_SWIFT::send_response()
 {
   int r = ret;
   if (!r)
@@ -160,7 +160,7 @@ void RGWDeleteBucket_REST_OS::send_response()
   flush_formatter_to_req_state(s, s->formatter);
 }
 
-void RGWPutObj_REST_OS::send_response()
+void RGWPutObj_REST_SWIFT::send_response()
 {
   if (!ret)
     ret = 201; // "created"
@@ -171,7 +171,7 @@ void RGWPutObj_REST_OS::send_response()
   flush_formatter_to_req_state(s, s->formatter);
 }
 
-void RGWDeleteObj_REST_OS::send_response()
+void RGWDeleteObj_REST_SWIFT::send_response()
 {
   int r = ret;
   if (!r)
@@ -183,7 +183,7 @@ void RGWDeleteObj_REST_OS::send_response()
   flush_formatter_to_req_state(s, s->formatter);
 }
 
-int RGWGetObj_REST_OS::send_response(void *handle)
+int RGWGetObj_REST_SWIFT::send_response(void *handle)
 {
   const char *content_type = NULL;
   int orig_ret = ret;
@@ -239,14 +239,14 @@ send_data:
   return 0;
 }
 
-RGWOp *RGWHandler_REST_OS::get_retrieve_obj_op(bool get_data)
+RGWOp *RGWHandler_REST_SWIFT::get_retrieve_obj_op(bool get_data)
 {
   if (is_acl_op()) {
-    return new RGWGetACLs_REST_OS;
+    return new RGWGetACLs_REST_SWIFT;
   }
 
   if (s->object) {
-    RGWGetObj_REST_OS *get_obj_op = new RGWGetObj_REST_OS;
+    RGWGetObj_REST_SWIFT *get_obj_op = new RGWGetObj_REST_SWIFT;
     get_obj_op->set_get_data(get_data);
     return get_obj_op;
   } else if (!s->bucket_name) {
@@ -254,54 +254,56 @@ RGWOp *RGWHandler_REST_OS::get_retrieve_obj_op(bool get_data)
   }
 
   if (get_data)
-    return new RGWListBucket_REST_OS;
+    return new RGWListBucket_REST_SWIFT;
   else
-    return new RGWStatBucket_REST_OS;
+    return new RGWStatBucket_REST_SWIFT;
 }
 
-RGWOp *RGWHandler_REST_OS::get_retrieve_op(bool get_data)
+RGWOp *RGWHandler_REST_SWIFT::get_retrieve_op(bool get_data)
 {
   if (s->bucket_name) {
     if (is_acl_op()) {
-      return new RGWGetACLs_REST_OS;
+      return new RGWGetACLs_REST_SWIFT;
     }
     return get_retrieve_obj_op(get_data);
   }
 
-  return new RGWListBuckets_REST_OS;
+  return new RGWListBuckets_REST_SWIFT;
 }
 
-RGWOp *RGWHandler_REST_OS::get_create_op()
+RGWOp *RGWHandler_REST_SWIFT::get_create_op()
 {
   if (is_acl_op()) {
-    return new RGWPutACLs_REST_OS;
+    return new RGWPutACLs_REST_SWIFT;
   } else if (s->object) {
     if (!s->copy_source)
-      return new RGWPutObj_REST_OS;
+      return new RGWPutObj_REST_SWIFT;
     else
-      return new RGWCopyObj_REST_OS;
+      return new RGWCopyObj_REST_SWIFT;
   } else if (s->bucket_name) {
-    return new RGWCreateBucket_REST_OS;
+    return new RGWCreateBucket_REST_SWIFT;
   }
 
   return NULL;
 }
 
-RGWOp *RGWHandler_REST_OS::get_delete_op()
+RGWOp *RGWHandler_REST_SWIFT::get_delete_op()
 {
   if (s->object)
-    return new RGWDeleteObj_REST_OS;
+    return new RGWDeleteObj_REST_SWIFT;
   else if (s->bucket_name)
-    return new RGWDeleteBucket_REST_OS;
+    return new RGWDeleteBucket_REST_SWIFT;
 
   return NULL;
 }
 
-int RGWHandler_REST_OS::authorize()
+int RGWHandler_REST_SWIFT::authorize()
 {
   bool authorized = rgw_verify_os_token(s);
   if (!authorized)
     return -EPERM;
 
+  s->perm_mask = RGW_PERM_FULL_CONTROL;
+
   return 0;
 }
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
new file mode 100644
index 00000000000..c9bb6ff51aa
--- /dev/null
+++ b/src/rgw/rgw_rest_swift.h
@@ -0,0 +1,116 @@
+#ifndef CEPH_RGW_REST_SWIFT_H
+#define CEPH_RGW_REST_SWIFT_H
+#define TIME_BUF_SIZE 128
+
+#include "rgw_op.h"
+#include "rgw_rest.h"
+
+class RGWGetObj_REST_SWIFT : public RGWGetObj_REST {
+public:
+  RGWGetObj_REST_SWIFT() {}
+  ~RGWGetObj_REST_SWIFT() {}
+
+  int send_response(void *handle);
+};
+
+class RGWListBuckets_REST_SWIFT : public RGWListBuckets_REST {
+public:
+  RGWListBuckets_REST_SWIFT() {}
+  ~RGWListBuckets_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWListBucket_REST_SWIFT : public RGWListBucket_REST {
+public:
+  RGWListBucket_REST_SWIFT() {
+    limit_opt_name = "limit";
+    default_max = 10000;
+  }
+  ~RGWListBucket_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWStatBucket_REST_SWIFT : public RGWStatBucket_REST {
+public:
+  RGWStatBucket_REST_SWIFT() {}
+  ~RGWStatBucket_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWCreateBucket_REST_SWIFT : public RGWCreateBucket_REST {
+public:
+  RGWCreateBucket_REST_SWIFT() {}
+  ~RGWCreateBucket_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWDeleteBucket_REST_SWIFT : public RGWDeleteBucket_REST {
+public:
+  RGWDeleteBucket_REST_SWIFT() {}
+  ~RGWDeleteBucket_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWPutObj_REST_SWIFT : public RGWPutObj_REST {
+public:
+  RGWPutObj_REST_SWIFT() {}
+  ~RGWPutObj_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWDeleteObj_REST_SWIFT : public RGWDeleteObj_REST {
+public:
+  RGWDeleteObj_REST_SWIFT() {}
+  ~RGWDeleteObj_REST_SWIFT() {}
+
+  void send_response();
+};
+
+class RGWCopyObj_REST_SWIFT : public RGWCopyObj_REST {
+public:
+  RGWCopyObj_REST_SWIFT() {}
+  ~RGWCopyObj_REST_SWIFT() {}
+
+  void send_response() {}
+};
+
+class RGWGetACLs_REST_SWIFT : public RGWGetACLs_REST {
+public:
+  RGWGetACLs_REST_SWIFT() {}
+  ~RGWGetACLs_REST_SWIFT() {}
+
+  void send_response() {}
+};
+
+class RGWPutACLs_REST_SWIFT : public RGWPutACLs_REST {
+public:
+  RGWPutACLs_REST_SWIFT() : RGWPutACLs_REST() {}
+  virtual ~RGWPutACLs_REST_SWIFT() {}
+
+  void send_response() {}
+};
+
+
+class RGWHandler_REST_SWIFT : public RGWHandler_REST {
+protected:
+
+  RGWOp *get_retrieve_obj_op(bool get_data);
+  RGWOp *get_retrieve_op(bool get_data);
+  RGWOp *get_create_op();
+  RGWOp *get_delete_op();
+  RGWOp *get_post_op() { return NULL; }
+
+public:
+  RGWHandler_REST_SWIFT() : RGWHandler_REST() {}
+  virtual ~RGWHandler_REST_SWIFT() {}
+
+  int authorize();
+};
+
+#endif
diff --git a/src/rgw/rgw_os.cc b/src/rgw/rgw_swift.cc
index edecf55f70b..29894c00c4a 100644
--- a/src/rgw/rgw_os.cc
+++ b/src/rgw/rgw_swift.cc
@@ -6,8 +6,8 @@
 #include <curl/easy.h>
 
 #include "rgw_common.h"
-#include "rgw_os.h"
-#include "rgw_os_auth.h"
+#include "rgw_swift.h"
+#include "rgw_swift_auth.h"
 #include "rgw_user.h"
 
 
@@ -15,7 +15,7 @@ static size_t read_http_header(void *ptr, size_t size, size_t nmemb, void *_info
 {
   size_t len = size * nmemb;
   char line[len + 1];
-  struct rgw_os_auth_info *info = (struct rgw_os_auth_info *)_info;
+  struct rgw_swift_auth_info *info = (struct rgw_swift_auth_info *)_info;
 
   char *s = (char *)ptr, *end = (char *)ptr + len;
   char *p = line;
@@ -56,14 +56,14 @@ static size_t read_http_header(void *ptr, size_t size, size_t nmemb, void *_info
   return len;
 }
 
-static int rgw_os_validate_token(const char *token, struct rgw_os_auth_info *info)
+static int rgw_swift_validate_token(const char *token, struct rgw_swift_auth_info *info)
 {
   CURL *curl_handle;
   string auth_url = "http://127.0.0.1:11000/token";
   char url_buf[auth_url.size() + 1 + strlen(token) + 1];
   sprintf(url_buf, "%s/%s", auth_url.c_str(), token);
 
-  RGW_LOG(10) << "rgw_os_validate_token url=" << url_buf << dendl;
+  RGW_LOG(10) << "rgw_swift_validate_token url=" << url_buf << dendl;
 
   curl_handle = curl_easy_init();
 
@@ -83,37 +83,37 @@ static int rgw_os_validate_token(const char *token, struct rgw_os_auth_info *inf
 bool rgw_verify_os_token(req_state *s)
 {
   if (strncmp(s->os_auth_token, "AUTH_rgwtk", 10) == 0) {
-    int ret = rgw_os_verify_signed_token(s->os_auth_token, s->user);
+    int ret = rgw_swift_verify_signed_token(s->os_auth_token, s->user);
     if (ret < 0)
       return false;
 
     return  true;
   }
 
-  struct rgw_os_auth_info info;
+  struct rgw_swift_auth_info info;
 
   memset(&info, 0, sizeof(info));
 
   info.status = 401; // start with access denied, validate_token might change that
 
-  int ret = rgw_os_validate_token(s->os_auth_token, &info);
+  int ret = rgw_swift_validate_token(s->os_auth_token, &info);
   if (ret < 0)
     return ret;
 
   if (!info.user) {
-    RGW_LOG(0) << "openstack auth didn't authorize a user" << dendl;
+    RGW_LOG(0) << "swift auth didn't authorize a user" << dendl;
     return false;
   }
 
   s->os_user = info.user;
   s->os_groups = info.auth_groups;
 
-  string openstack_user = s->os_user;
+  string swift_user = s->os_user;
 
-  RGW_LOG(10) << "openstack user=" << s->os_user << dendl;
+  RGW_LOG(10) << "swift user=" << s->os_user << dendl;
 
-  if (rgw_get_user_info_by_openstack(openstack_user, s->user) < 0) {
-    RGW_LOG(0) << "couldn't map openstack user" << dendl;
+  if (rgw_get_user_info_by_swift(swift_user, s->user) < 0) {
+    RGW_LOG(0) << "couldn't map swift user" << dendl;
     return false;
   }
 
diff --git a/src/rgw/rgw_os.h b/src/rgw/rgw_swift.h
index 2114ecaa294..16204bcb2d9 100644
--- a/src/rgw/rgw_os.h
+++ b/src/rgw/rgw_swift.h
@@ -1,11 +1,11 @@
 
-#ifndef CEPH_RGW_OS_H
-#define CEPH_RGW_OS_H
+#ifndef CEPH_RGW_SWIFT_H
+#define CEPH_RGW_SWIFT_H
 
 #include "rgw_common.h"
 
 
-struct rgw_os_auth_info {
+struct rgw_swift_auth_info {
   int status;
   char *auth_groups;
   char *user;
diff --git a/src/rgw/rgw_os_auth.cc b/src/rgw/rgw_swift_auth.cc
index a382ea6f102..afcd03d68bc 100644
--- a/src/rgw/rgw_os_auth.cc
+++ b/src/rgw/rgw_swift_auth.cc
@@ -1,4 +1,4 @@
-#include "rgw_os_auth.h"
+#include "rgw_swift_auth.h"
 #include "rgw_rest.h"
 
 #include "common/ceph_crypto.h"
@@ -8,7 +8,7 @@
 
 using namespace ceph::crypto;
 
-static RGW_OS_Auth_Get rgw_os_auth_get;
+static RGW_SWIFT_Auth_Get rgw_swift_auth_get;
 
 static int build_token(string& os_user, string& key, uint64_t nonce, utime_t& expiration, bufferlist& bl)
 {
@@ -45,14 +45,14 @@ static int encode_token(string& os_user, string& key, bufferlist& bl)
     return ret;
 
   utime_t expiration = ceph_clock_now(g_ceph_context);
-  expiration += RGW_OS_TOKEN_EXPIRATION; // 15 minutes
+  expiration += RGW_SWIFT_TOKEN_EXPIRATION; // 15 minutes
 
   ret = build_token(os_user, key, nonce, expiration, bl);
 
   return ret;
 }
 
-int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info)
+int rgw_swift_verify_signed_token(const char *token, RGWUserInfo& info)
 {
   if (strncmp(token, "AUTH_rgwtk", 10) != 0)
     return -EINVAL;
@@ -83,7 +83,7 @@ int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info)
     ::decode(os_user, iter);
     ::decode(nonce, iter);
     ::decode(expiration, iter);
-  } catch (buffer::error *err) {
+  } catch (buffer::error& err) {
     RGW_LOG(0) << "failed to decode token: caught exception" << dendl;
     return -EINVAL;
   }
@@ -92,13 +92,13 @@ int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info)
     return -EPERM;
   }
 
-  if ((ret = rgw_get_user_info_by_openstack(os_user, info)) < 0)
+  if ((ret = rgw_get_user_info_by_swift(os_user, info)) < 0)
     return ret;
 
   RGW_LOG(10) << "os_user=" << os_user << dendl;
 
   bufferlist tok;
-  ret = build_token(os_user, info.openstack_key, nonce, expiration, tok);
+  ret = build_token(os_user, info.swift_key, nonce, expiration, tok);
   if (ret < 0)
     return ret;
 
@@ -117,23 +117,23 @@ int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info)
   return 0;
 }
 
-void RGW_OS_Auth_Get::execute()
+void RGW_SWIFT_Auth_Get::execute()
 {
   int ret = -EPERM;
 
-  RGW_LOG(20) << "RGW_OS_Auth_Get::execute()" << dendl;
+  RGW_LOG(20) << "RGW_SWIFT_Auth_Get::execute()" << dendl;
 
   const char *key = s->env->get("HTTP_X_AUTH_KEY");
   const char *user = s->env->get("HTTP_X_AUTH_USER");
-  const char *url_prefix = s->env->get("RGW_OPENSTACK_URL_PREFIX");
-  const char *os_url = s->env->get("RGW_OPENSTACK_URL");
+  const char *url_prefix = s->env->get("RGW_SWIFT_URL_PREFIX");
+  const char *os_url = s->env->get("RGW_SWIFT_URL");
 
   string user_str = user;
   RGWUserInfo info;
   bufferlist bl;
 
   if (!os_url || !url_prefix) {
-    RGW_LOG(0) << "server is misconfigured, missing RGW_OPENSTACK_URL_PREFIX or RGW_OPENSTACK_URL" << dendl;
+    RGW_LOG(0) << "server is misconfigured, missing RGW_SWIFT_URL_PREFIX or RGW_SWIFT_URL" << dendl;
     ret = -EINVAL;
     goto done;
   }
@@ -141,18 +141,18 @@ void RGW_OS_Auth_Get::execute()
   if (!key || !user)
     goto done;
 
-  if ((ret = rgw_get_user_info_by_openstack(user_str, info)) < 0)
+  if ((ret = rgw_get_user_info_by_swift(user_str, info)) < 0)
     goto done;
 
-  if (info.openstack_key.compare(key) != 0) {
-    RGW_LOG(0) << "RGW_OS_Auth_Get::execute(): bad openstack key" << dendl;
+  if (info.swift_key.compare(key) != 0) {
+    RGW_LOG(0) << "RGW_SWIFT_Auth_Get::execute(): bad swift key" << dendl;
     ret = -EPERM;
     goto done;
   }
 
   CGI_PRINTF(s, "X-Storage-Url: %s/%s/v1/AUTH_rgw\n", os_url, url_prefix);
 
-  if ((ret = encode_token(info.openstack_name, info.openstack_key, bl)) < 0)
+  if ((ret = encode_token(info.swift_name, info.swift_key, bl)) < 0)
     goto done;
 
   {
@@ -170,17 +170,17 @@ done:
   end_header(s);
 }
 
-int RGWHandler_OS_Auth::authorize()
+int RGWHandler_SWIFT_Auth::authorize()
 {
   return 0;
 }
 
-RGWOp *RGWHandler_OS_Auth::get_op()
+RGWOp *RGWHandler_SWIFT_Auth::get_op()
 {
   RGWOp *op;
   switch (s->op) {
    case OP_GET:
-     op = &rgw_os_auth_get;
+     op = &rgw_swift_auth_get;
      break;
    default:
      return NULL;
@@ -192,7 +192,7 @@ RGWOp *RGWHandler_OS_Auth::get_op()
   return op;
 }
 
-void RGWHandler_OS_Auth::put_op(RGWOp *op)
+void RGWHandler_SWIFT_Auth::put_op(RGWOp *op)
 {
 }
 
diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h
new file mode 100644
index 00000000000..95909a96015
--- /dev/null
+++ b/src/rgw/rgw_swift_auth.h
@@ -0,0 +1,30 @@
+#ifndef CEPH_RGW_SWIFT_AUTH_H
+#define CEPH_RGW_SWIFT_AUTH_H
+
+#include "rgw_op.h"
+
+#define RGW_SWIFT_TOKEN_EXPIRATION (15 * 60)
+
+extern int rgw_swift_verify_signed_token(const char *token, RGWUserInfo& info);
+
+class RGW_SWIFT_Auth_Get : public RGWOp {
+public:
+  RGW_SWIFT_Auth_Get() {}
+  ~RGW_SWIFT_Auth_Get() {}
+
+  int verify_permission() { return 0; }
+  void execute();
+};
+
+class RGWHandler_SWIFT_Auth : public RGWHandler {
+public:
+  RGWHandler_SWIFT_Auth() {}
+  ~RGWHandler_SWIFT_Auth() {}
+  RGWOp *get_op();
+  void put_op(RGWOp *op);
+
+  int authorize();
+  int read_permissions() { return 0; }
+};
+
+#endif
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 98317ca2015..f47123144fe 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -15,7 +15,7 @@ using namespace std;
 
 static rgw_bucket ui_key_bucket(USER_INFO_POOL_NAME);
 static rgw_bucket ui_email_bucket(USER_INFO_EMAIL_POOL_NAME);
-static rgw_bucket ui_openstack_bucket(USER_INFO_OPENSTACK_POOL_NAME);
+static rgw_bucket ui_swift_bucket(USER_INFO_SWIFT_POOL_NAME);
 static rgw_bucket ui_uid_bucket(USER_INFO_UID_POOL_NAME);
 
 static rgw_bucket pi_pool_bucket(POOL_INFO_POOL_NAME);
@@ -49,12 +49,12 @@ int rgw_store_user_info(RGWUserInfo& info)
   int ret;
   map<string,bufferlist> attrs;
 
-  if (info.openstack_name.size()) {
-    /* check if openstack mapping exists */
+  if (info.swift_name.size()) {
+    /* check if swift mapping exists */
     RGWUserInfo inf;
-    int r = rgw_get_user_info_by_openstack(info.openstack_name, inf);
+    int r = rgw_get_user_info_by_swift(info.swift_name, inf);
     if (r >= 0 && inf.user_id.compare(info.user_id) != 0) {
-      RGW_LOG(0) << "can't store user info, openstack id already mapped to another user" << dendl;
+      RGW_LOG(0) << "can't store user info, swift id already mapped to another user" << dendl;
       return -EEXIST;
     }
   }
@@ -99,8 +99,8 @@ int rgw_store_user_info(RGWUserInfo& info)
     }
   }
 
-  if (info.openstack_name.size())
-    ret = rgw_put_obj(info.user_id, ui_openstack_bucket, info.openstack_name, uid_bl.c_str(), uid_bl.length());
+  if (info.swift_name.size())
+    ret = rgw_put_obj(info.user_id, ui_swift_bucket, info.swift_name, uid_bl.c_str(), uid_bl.length());
 
   return ret;
 }
@@ -115,9 +115,14 @@ int rgw_get_user_info_from_index(string& key, rgw_bucket& bucket, RGWUserInfo& i
     return ret;
 
   bufferlist::iterator iter = bl.begin();
-  ::decode(uid, iter);
-  if (!iter.end())
-    info.decode(iter);
+  try {
+    ::decode(uid, iter);
+    if (!iter.end())
+      info.decode(iter);
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: failed to decode user info, caught buffer::error" << dendl;
+    return -EIO;
+  }
 
   return 0;
 }
@@ -141,12 +146,12 @@ int rgw_get_user_info_by_email(string& email, RGWUserInfo& info)
 }
 
 /**
- * Given an openstack username, finds the user_info associated with it.
+ * Given an swift username, finds the user_info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
-extern int rgw_get_user_info_by_openstack(string& openstack_name, RGWUserInfo& info)
+extern int rgw_get_user_info_by_swift(string& swift_name, RGWUserInfo& info)
 {
-  return rgw_get_user_info_from_index(openstack_name, ui_openstack_bucket, info);
+  return rgw_get_user_info_from_index(swift_name, ui_swift_bucket, info);
 }
 
 /**
@@ -173,7 +178,12 @@ static int rgw_read_buckets_from_attr(string& user_id, RGWUserBuckets& buckets)
     return ret;
 
   bufferlist::iterator iter = bl.begin();
-  buckets.decode(iter);
+  try {
+    buckets.decode(iter);
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: failed to decode buckets info, caught buffer::error" << dendl;
+    return -EIO;
+  }
   return 0;
 }
 
@@ -230,13 +240,18 @@ int rgw_read_user_buckets(string user_id, RGWUserBuckets& buckets, bool need_sta
     bufferlist::iterator p = bl.begin();
     bufferlist header;
     map<string,bufferlist> m;
-    ::decode(header, p);
-    ::decode(m, p);
-    for (map<string,bufferlist>::iterator q = m.begin(); q != m.end(); q++) {
-      bufferlist::iterator iter = q->second.begin();
-      RGWBucketEnt bucket;
-      ::decode(bucket, iter);
-      buckets.add(bucket);
+    try {
+      ::decode(header, p);
+      ::decode(m, p);
+      for (map<string,bufferlist>::iterator q = m.begin(); q != m.end(); q++) {
+        bufferlist::iterator iter = q->second.begin();
+        RGWBucketEnt bucket;
+        ::decode(bucket, iter);
+        buckets.add(bucket);
+      }
+    } catch (buffer::error& err) {
+      RGW_LOG(0) << "ERROR: failed to decode bucket information, caught buffer::error" << dendl;
+      return -EIO;
     }
   } else {
     ret = rgw_read_buckets_from_attr(user_id, buckets);
@@ -392,9 +407,9 @@ int rgw_remove_email_index(string& uid, string& email)
   return ret;
 }
 
-int rgw_remove_openstack_name_index(string& uid, string& openstack_name)
+int rgw_remove_swift_name_index(string& uid, string& swift_name)
 {
-  rgw_obj obj(ui_openstack_bucket, openstack_name);
+  rgw_obj obj(ui_swift_bucket, swift_name);
   int ret = rgwstore->delete_obj(NULL, uid, obj);
   return ret;
 }
@@ -421,34 +436,49 @@ int rgw_delete_user(RGWUserInfo& info, bool purge_data) {
   }
   map<string, RGWAccessKey>::iterator kiter = info.access_keys.begin();
   for (; kiter != info.access_keys.end(); ++kiter) {
+    RGW_LOG(0) << "removing key index: " << kiter->first << dendl;
     ret = rgw_remove_key_index(kiter->second);
-    if (ret < 0 && ret != -ENOENT)
-      RGW_LOG(0) << "ERROR: could not remove " << kiter->first << " (access key object), should be fixed manually (err=" << ret << ")" << dendl;
+    if (ret < 0 && ret != -ENOENT) {
+      RGW_LOG(0) << "ERROR: could not remove " << kiter->first << " (access key object), should be fixed (err=" << ret << ")" << dendl;
+      return ret;
+    }
   }
 
-  rgw_obj uid_obj(ui_uid_bucket, info.user_id);
-  ret = rgwstore->delete_obj(NULL, info.user_id, uid_obj);
-  if (ret < 0 && ret != -ENOENT)
-    RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed manually (err=" << ret << ")" << dendl;
-
-  string buckets_obj_id;
-  get_buckets_obj(info.user_id, buckets_obj_id);
-  rgw_obj uid_bucks(ui_uid_bucket, buckets_obj_id);
-  ret = rgwstore->delete_obj(NULL, info.user_id, uid_bucks);
-  if (ret < 0 && ret != -ENOENT)
-    RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed manually (err=" << ret << ")" << dendl;
-  
-
   rgw_obj email_obj(ui_email_bucket, info.user_email);
+  RGW_LOG(0) << "removing email index: " << info.user_email << dendl;
   ret = rgwstore->delete_obj(NULL, info.user_id, email_obj);
-  if (ret < 0 && ret != -ENOENT)
-    RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << email_obj << ", should be fixed manually (err=" << ret << ")" << dendl;
+  if (ret < 0 && ret != -ENOENT) {
+    RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << email_obj << ", should be fixed (err=" << ret << ")" << dendl;
+    return ret;
+  }
 
   if (purge_data) {
+    RGW_LOG(0) << "purging user buckets" << dendl;
     ret = rgwstore->purge_buckets(info.user_id, buckets_vec);
-    if (ret < 0)
+    if (ret < 0 && ret != -ENOENT) {
       RGW_LOG(0) << "ERROR: delete_buckets returned " << ret << dendl;
+      return ret;
+    }
+  }
+
+  string buckets_obj_id;
+  get_buckets_obj(info.user_id, buckets_obj_id);
+  rgw_obj uid_bucks(ui_uid_bucket, buckets_obj_id);
+  RGW_LOG(0) << "removing user buckets index" << dendl;
+  ret = rgwstore->delete_obj(NULL, info.user_id, uid_bucks);
+  if (ret < 0 && ret != -ENOENT) {
+    RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed (err=" << ret << ")" << dendl;
+    return ret;
+  }
+  
+  rgw_obj uid_obj(ui_uid_bucket, info.user_id);
+  RGW_LOG(0) << "removing user index: " << info.user_id << dendl;
+  ret = rgwstore->delete_obj(NULL, info.user_id, uid_obj);
+  if (ret < 0 && ret != -ENOENT) {
+    RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed (err=" << ret << ")" << dendl;
+    return ret;
   }
+
   return 0;
 }
 
@@ -485,7 +515,12 @@ int rgw_retrieve_pool_info(int64_t pool_id, RGWPoolInfo& pool_info)
     return ret;
   }
   bufferlist::iterator iter = bl.begin();
-  ::decode(pool_info, iter);
+  try {
+    ::decode(pool_info, iter);
+  } catch (buffer::error& err) {
+    RGW_LOG(0) << "ERROR: failed to decode pool information, caught buffer::error" << dendl;
+    return -EIO;
+  }
 
   return 0;
 }
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index c87ec8cf277..92b964f6941 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -11,7 +11,7 @@ using namespace std;
 
 #define USER_INFO_POOL_NAME ".users"
 #define USER_INFO_EMAIL_POOL_NAME ".users.email"
-#define USER_INFO_OPENSTACK_POOL_NAME ".users.openstack"
+#define USER_INFO_SWIFT_POOL_NAME ".users.swift"
 #define USER_INFO_UID_POOL_NAME ".users.uid"
 #define RGW_USER_ANON_ID "anonymous"
 
@@ -53,15 +53,15 @@ extern int rgw_store_user_info(RGWUserInfo& info);
  */
 extern int rgw_get_user_info_by_uid(string& user_id, RGWUserInfo& info);
 /**
- * Given an openstack username, finds the user info associated with it.
+ * Given an swift username, finds the user info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
 extern int rgw_get_user_info_by_email(string& email, RGWUserInfo& info);
 /**
- * Given an openstack username, finds the user info associated with it.
+ * Given an swift username, finds the user info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
  */
-extern int rgw_get_user_info_by_openstack(string& openstack_name, RGWUserInfo& info);
+extern int rgw_get_user_info_by_swift(string& swift_name, RGWUserInfo& info);
 /**
  * Given an access key, finds the user info associated with it.
  * returns: 0 on success, -ERR# on failure (including nonexistence)
@@ -147,7 +147,7 @@ extern int rgw_remove_bucket(string user_id, rgw_bucket& bucket, bool purge_data
 extern int rgw_remove_key_index(RGWAccessKey& access_key);
 extern int rgw_remove_uid_index(string& uid);
 extern int rgw_remove_email_index(string& uid, string& email);
-extern int rgw_remove_openstack_name_index(string& uid, string& openstack_name);
+extern int rgw_remove_swift_name_index(string& uid, string& swift_name);
 
 extern int rgw_store_pool_info(int64_t pool_id, RGWPoolInfo& pool_info);
 extern int rgw_retrieve_pool_info(int64_t pool_id, RGWPoolInfo& pool_info);
diff --git a/src/test/cli/radosgw_admin/help.t b/src/test/cli/radosgw_admin/help.t
index adc6259534c..8860c3cdd02 100644
--- a/src/test/cli/radosgw_admin/help.t
+++ b/src/test/cli/radosgw_admin/help.t
@@ -27,11 +27,11 @@
      --uid=<id>                user id
      --subuser=<name>          subuser name
      --access-key=<key>        S3 access key
-     --os-user=<group:name>    OpenStack user
+     --swift-user=<group:name> Swift user
      --email=<email>
      --auth_uid=<auid>         librados uid
      --secret=<key>            S3 key
-     --os-secret=<key>         OpenStack key
+     --swift-secret=<key>      Swift key
      --gen-access-key          generate random access key
      --gen-secret              generate random secret key
      --access=<access>         Set access permissions for sub-user, should be one
diff --git a/src/test/store_test.cc b/src/test/store_test.cc
index a3c9f529e56..10fb6564dc2 100644
--- a/src/test/store_test.cc
+++ b/src/test/store_test.cc
@@ -253,7 +253,7 @@ public:
     // hash
     //boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5);
     ++seq;
-    return hobject_t(name, CEPH_NOSNAP, rand());
+    return hobject_t(name, string(), CEPH_NOSNAP, rand());
   }
 };
 
@@ -469,7 +469,7 @@ TEST_F(StoreTest, HashCollisionTest) {
     if (!(i % 5)) {
       cerr << "Object " << i << std::endl;
     }
-    hobject_t hoid(string(buf) + base, CEPH_NOSNAP, 0);
+    hobject_t hoid(string(buf) + base, string(), CEPH_NOSNAP, 0);
     {
       ObjectStore::Transaction t;
       t.touch(cid, hoid);