diff options
86 files changed, 2543 insertions, 1269 deletions
diff --git a/.gitignore b/.gitignore index 99560459d1e..73a8d86bd14 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ core .project .cproject /build-doc +/doc/object_store.png diff --git a/admin/build-doc b/admin/build-doc index aee1a48345c..622d7c84031 100755 --- a/admin/build-doc +++ b/admin/build-doc @@ -10,6 +10,7 @@ if [ ! -e build-doc/doxygen/xml ]; then fi dia --filter=png-libart --export=doc/overview.png.tmp doc/overview.dia + mv -- doc/overview.png.tmp doc/overview.png cd build-doc diff --git a/doc/appendix/differences-from-posix.rst b/doc/appendix/differences-from-posix.rst new file mode 100644 index 00000000000..f327e786dc9 --- /dev/null +++ b/doc/appendix/differences-from-posix.rst @@ -0,0 +1,17 @@ +======================== + Differences from POSIX +======================== + +.. todo:: delete http://ceph.newdream.net/wiki/Differences_from_POSIX + +Ceph does have a few places where it diverges from strict POSIX semantics for various reasons: + +- Sparse files propagate incorrectly to tools like df. They will only + use up the required space, but in df will increase the "used" space + by the full file size. We do this because actually keeping track of + the space a large, sparse file uses is very expensive. +- In shared simultaneous writer situations, a write that crosses + object boundaries is not necessarily atomic. This means that you + could have writer A write "aa|aa" and writer B write "bb|bb" + simultaneously (where | is the object boundary), and end up with + "aa|bb" rather than the proper "aa|aa" or "bb|bb". diff --git a/doc/appendix/index.rst b/doc/appendix/index.rst new file mode 100644 index 00000000000..a98bf899a2a --- /dev/null +++ b/doc/appendix/index.rst @@ -0,0 +1,10 @@ +============ + Appendices +============ + +.. toctree:: + :glob: + :numbered: + :titlesonly: + + * diff --git a/doc/architecture.rst b/doc/architecture.rst index f66cf3cd9b7..05a3fdfa493 100644 --- a/doc/architecture.rst +++ b/doc/architecture.rst @@ -2,167 +2,180 @@ Architecture of Ceph ====================== -- Introduction to Ceph Project - - - High-level overview of project benefits for users (few paragraphs, mention each subproject) - - Introduction to sub-projects (few paragraphs to a page each) - - - RADOS - - RGW - - RBD - - Ceph - - - Example scenarios Ceph projects are/not suitable for - - (Very) High-Level overview of Ceph - - This would include an introduction to basic project terminology, - the concept of OSDs, MDSes, and Monitors, and things like - that. What they do, some of why they're awesome, but not how they - work. - -- Discussion of MDS terminology, daemon types (active, standby, - standby-replay) - -.. todo:: write me - -================================= - Library architecture -================================= -Ceph is structured into libraries which are built and then combined together to -make executables and other libraries. - -- libcommon: a collection of utilities which are available to nearly every ceph - library and executable. In general, libcommon should not contain global - variables, because it is intended to be linked into libraries such as - libceph.so. - -- libglobal: a collection of utilities focused on the needs of Ceph daemon - programs. In here you will find pidfile management functions, signal - handlers, and so forth. - -.. todo:: document other libraries - -================================= - Configuration Management System -================================= -The configuration management system exists to provide every daemon with the -proper configuration information. The configuration can be viewed as a set of -key-value pairs. - -How can the configuration be set? Well, there are several sources: - - the ceph configuration file, usually named ceph.conf - - command line arguments:: - --debug-ms=1 - --debug-pg=10 - etc. - - arguments injected at runtime by using injectargs - -====================================================== - The Configuration File -====================================================== -Most configuration settings originate in the Ceph configuration file. - -How do we find the configuration file? Well, in order, we check: - - the default locations - - the environment variable CEPH_CONF - - the command line argument -c - -Each stanza of the configuration file describes the key-value pairs that will be in -effect for a particular subset of the daemons. The "global" stanza applies to -everything. The "mon", "osd", and "mds" stanzas specify settings to take effect -for all monitors, all osds, and all mds servers, respectively. A stanza of the -form mon.$name, osd.$name, or mds.$name gives settings for the monitor, OSD, or -MDS of that name, respectively. Configuration values that appear later in the -file win over earlier ones. - -A sample configuration file can be found in src/sample.ceph.conf. - -====================================================== - Metavariables -====================================================== -The configuration system supports certain "metavariables." If these occur -inside a configuration value, they are expanded into something else-- similar to -how bash shell expansion works. - -There are a few different metavariables: - - $host: expands to the current hostname - - $type: expands to one of "mds", "osd", or "mon" - - $id: expands to the daemon identifier. For osd.0, this would be "0"; for mds.a, it would be "a" - - $num: same as $id - - $name: expands to $type.$id - -====================================================== - Interfacing with the Configuration Management System -====================================================== -There are two ways for Ceph code to get configuration values. One way is to -read it directly from a variable named "g_conf," or equivalently, -"g_ceph_ctx->_conf." The other is to register an observer that will called -every time the relevant configuration values changes. This observer will be -called soon after the initial configuration is read, and every time after that -when one of the relevant values changes. Each observer tracks a set of keys -and is invoked only when one of the relevant keys changes. - -The interface to implement is found in common/config_obs.h. - -The observer method should be preferred in new code because - - It is more flexible, allowing the code to do whatever reinitialization needs - to be done to implement the new configuration value. - - It is the only way to create a std::string configuration variable that can - be changed by injectargs. - - Even for int-valued configuration options, changing the values in one thread - while another thread is reading them can lead to subtle and - impossible-to-diagnose bugs. - -For these reasons, reading directly from g_conf should be considered deprecated -and not done in new code. Do not ever alter g_conf. - -================================= - Debug Logs -================================= -The main debugging tool for Ceph is the dout and derr logging functions. -Collectively, these are referred to as "dout logging." - -Dout has several log faculties, which can be set at various log -levels using the configuration management system. So it is possible to enable -debugging just for the messenger, by setting debug_ms to 10, for example. - -Dout is implemented mainly in common/DoutStreambuf.cc - -The dout macro avoids even generating log messages which are not going to be -used, by enclosing them in an "if" statement. What this means is that if you -have the debug level set at 0, and you run this code - -``dout(20) << "myfoo() = " << myfoo() << dendl;`` - - -myfoo() will not be called here. - -Unfortunately, the performance of debug logging is relatively low. This is -because there is a single, process-wide mutex which every debug output -statement takes, and every debug output statement leads to a write() system -call or a call to syslog(). There is also a computational overhead to using C++ -streams to consider. So you will need to be parsimonius in your logging to get -the best performance. - -Sometimes, enabling logging can hide race conditions and other bugs by changing -the timing of events. Keep this in mind when debugging. - -================================= - CephContext -================================= -A CephContext represents a single view of the Ceph cluster. It comes complete -with a configuration, a set of performance counters (PerfCounters), and a -heartbeat map. You can find more information about CephContext in -src/common/ceph_context.h. - -Generally, you will have only one CephContext in your application, called -g_ceph_context. However, in library code, it is possible that the library user -will initialize multiple CephContexts. For example, this would happen if he -called rados_create more than once. - -A ceph context is required to issue log messages. Why is this? Well, without -the CephContext, we would not know which log messages were disabled and which -were enabled. The dout() macro implicitly references g_ceph_context, so it -can't be used in library code. It is fine to use dout and derr in daemons, but -in library code, you must use ldout and lderr, and pass in your own CephContext -object. The compiler will enforce this restriction. +Ceph is a distributed network storage and file system with distributed +metadata management and POSIX semantics. + +RADOS is a reliable object store, used by Ceph, but also directly +accessible. + +``radosgw`` is an S3-compatible RESTful HTTP service for object +storage, using RADOS storage. + +RBD is a Linux kernel feature that exposes RADOS storage as a block +device. Qemu/KVM also has a direct RBD client, that avoids the kernel +overhead. + + +.. _monitor: + +Monitor cluster +=============== + +``cmon`` is a lightweight daemon that provides a consensus for +distributed decisionmaking in a Ceph/RADOS cluster. + +It also is the initial point of contact for new clients, and will hand +out information about the topology of the cluster, such as the +``osdmap``. + +You normally run 3 ``cmon`` daemons, on 3 separate physical machines, +isolated from each other; for example, in different racks or rows. + +You could run just 1 instance, but that means giving up on high +availability. + +You may use the same hosts for ``cmon`` and other purposes. + +``cmon`` processes talk to each other using a Paxos_\-style +protocol. They discover each other via the ``[mon.X] mon addr`` fields +in ``ceph.conf``. + +.. todo:: What about ``monmap``? Fact check. + +Any decision requires the majority of the ``cmon`` processes to be +healthy and communicating with each other. For this reason, you never +want an even number of ``cmon``\s; there is no unambiguous majority +subgroup for an even number. + +.. _Paxos: http://en.wikipedia.org/wiki/Paxos_algorithm + +.. todo:: explain monmap + + +.. _rados: + + +RADOS +===== + +``cosd`` is the storage daemon that provides the RADOS service. It +uses ``cmon`` for cluster membership, services object read/write/etc +request from clients, and peers with other ``cosd``\s for data +replication. + +The data model is fairly simple on this level. There are multiple +named pools, and within each pool there are named objects, in a flat +namespace (no directories). Each object has both data and metadata. + +The data for an object is a single, potentially big, series of +bytes. Additionally, the series may be sparse, it may have holes that +contain binary zeros, and take up no actual storage. + +The metadata is an unordered set of key-value pairs. It's semantics +are completely up to the client; for example, the Ceph filesystem uses +metadata to store file owner etc. + +.. todo:: Verify that metadata is unordered. + +Underneath, ``cosd`` stores the data on a local filesystem. We +recommend using Btrfs_, but any POSIX filesystem that has extended +attributes should work (see :ref:`xattr`). + +.. _Btrfs: http://en.wikipedia.org/wiki/Btrfs + +.. todo:: write about access control + +.. todo:: explain osdmap + +.. todo:: explain plugins ("classes") + + +.. _cephfs: + +Ceph filesystem +=============== + +The Ceph filesystem service is provided by a daemon called +``cmds``. It uses RADOS to store all the filesystem metadata +(directories, file ownership, access modes, etc), and directs clients +to access RADOS directly for the file contents. + +The Ceph filesystem aims for POSIX compatibility, except for a few +chosen differences. See :doc:`/appendix/differences-from-posix`. + +``cmds`` can run as a single process, or it can be distributed out to +multiple physical machines, either for high availability or for +scalability. + +For high availability, the extra ``cmds`` instances can be `standby`, +ready to take over the duties of any failed ``cmds`` that was +`active`. This is easy because all the data, including the journal, is +stored on RADOS. The transition is triggered automatically by +``cmon``. + +For scalability, multiple ``cmds`` instances can be `active`, and they +will split the directory tree into subtrees (and shards of a single +busy directory), effectively balancing the load amongst all `active` +servers. + +Combinations of `standby` and `active` etc are possible, for example +running 3 `active` ``cmds`` instances for scaling, and one `standby`. + +To control the number of `active` ``cmds``\es, see :doc:`/ops/grow/mds`. + +.. topic:: Status as of 2011-09: + + Multiple `active` ``cmds`` operation is stable under normal + circumstances, but some failure scenarios may still cause + operational issues. + +.. todo:: document `standby-replay` + +.. todo:: mds.0 vs mds.alpha etc details + + + +``radosgw`` +=========== + +``radosgw`` is a FastCGI service that provides a RESTful_ HTTP API to +store objects and metadata. It layers on top of RADOS with its own +data formats, and maintains it's own user database, authentication, +access control, and so on. + +.. _RESTful: http://en.wikipedia.org/wiki/RESTful + + +Rados Block Device (RBD) +======================== + +In virtual machine scenarios, RBD is typically used via the ``rbd`` +network storage driver in Qemu/KVM, where the host machine uses +``librbd`` to provide a block device service to the guest. + +Alternatively, as no direct ``librbd`` support is available in Xen, +the Linux kernel can act as the RBD client and provide a real block +device on the host machine, that can then be accessed by the +virtualization. This is done with the command-line tool ``rbd`` (see +:doc:`/ops/rbd`). + +The latter is also useful in non-virtualized scenarios. + +Internally, RBD stripes the device image over multiple RADOS objects, +each typically located on a separate ``cosd``, allowing it to perform +better than a single server could. + + +Client +====== + +.. todo:: cephfs, cfuse, librados, libceph, librbd + + +.. todo:: Summarize how much Ceph trusts the client, for what parts (security vs reliability). + + +TODO +==== + +.. todo:: Example scenarios Ceph projects are/not suitable for diff --git a/doc/conf.py b/doc/conf.py index b74229608a1..afac33f816a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -22,6 +22,7 @@ html_sidebars = { # ugly kludge until breathe is distutils-friendly import sys; sys.path.append('../build-doc/breathe') extensions = [ + 'sphinx.ext.graphviz', 'sphinx.ext.todo', 'breathe', ] diff --git a/doc/dev/config.rst b/doc/dev/config.rst new file mode 100644 index 00000000000..59f1b92989c --- /dev/null +++ b/doc/dev/config.rst @@ -0,0 +1,77 @@ +================================= + Configuration Management System +================================= + +The configuration management system exists to provide every daemon with the +proper configuration information. The configuration can be viewed as a set of +key-value pairs. + +How can the configuration be set? Well, there are several sources: + - the ceph configuration file, usually named ceph.conf + - command line arguments:: + --debug-ms=1 + --debug-pg=10 + etc. + - arguments injected at runtime by using injectargs + + +The Configuration File +====================== + +Most configuration settings originate in the Ceph configuration file. + +How do we find the configuration file? Well, in order, we check: + - the default locations + - the environment variable CEPH_CONF + - the command line argument -c + +Each stanza of the configuration file describes the key-value pairs that will be in +effect for a particular subset of the daemons. The "global" stanza applies to +everything. The "mon", "osd", and "mds" stanzas specify settings to take effect +for all monitors, all osds, and all mds servers, respectively. A stanza of the +form mon.$name, osd.$name, or mds.$name gives settings for the monitor, OSD, or +MDS of that name, respectively. Configuration values that appear later in the +file win over earlier ones. + +A sample configuration file can be found in src/sample.ceph.conf. + + +Metavariables +============= + +The configuration system supports certain "metavariables." If these occur +inside a configuration value, they are expanded into something else-- similar to +how bash shell expansion works. + +There are a few different metavariables: + - $host: expands to the current hostname + - $type: expands to one of "mds", "osd", or "mon" + - $id: expands to the daemon identifier. For osd.0, this would be "0"; for mds.a, it would be "a" + - $num: same as $id + - $name: expands to $type.$id + + +Interfacing with the Configuration Management System +==================================================== + +There are two ways for Ceph code to get configuration values. One way is to +read it directly from a variable named "g_conf," or equivalently, +"g_ceph_ctx->_conf." The other is to register an observer that will called +every time the relevant configuration values changes. This observer will be +called soon after the initial configuration is read, and every time after that +when one of the relevant values changes. Each observer tracks a set of keys +and is invoked only when one of the relevant keys changes. + +The interface to implement is found in common/config_obs.h. + +The observer method should be preferred in new code because + - It is more flexible, allowing the code to do whatever reinitialization needs + to be done to implement the new configuration value. + - It is the only way to create a std::string configuration variable that can + be changed by injectargs. + - Even for int-valued configuration options, changing the values in one thread + while another thread is reading them can lead to subtle and + impossible-to-diagnose bugs. + +For these reasons, reading directly from g_conf should be considered deprecated +and not done in new code. Do not ever alter g_conf. diff --git a/doc/dev/context.rst b/doc/dev/context.rst new file mode 100644 index 00000000000..1a2b2cbfb00 --- /dev/null +++ b/doc/dev/context.rst @@ -0,0 +1,20 @@ +============= + CephContext +============= + +A CephContext represents a single view of the Ceph cluster. It comes complete +with a configuration, a set of performance counters (PerfCounters), and a +heartbeat map. You can find more information about CephContext in +src/common/ceph_context.h. + +Generally, you will have only one CephContext in your application, called +g_ceph_context. However, in library code, it is possible that the library user +will initialize multiple CephContexts. For example, this would happen if he +called rados_create more than once. + +A ceph context is required to issue log messages. Why is this? Well, without +the CephContext, we would not know which log messages were disabled and which +were enabled. The dout() macro implicitly references g_ceph_context, so it +can't be used in library code. It is fine to use dout and derr in daemons, but +in library code, you must use ldout and lderr, and pass in your own CephContext +object. The compiler will enforce this restriction. diff --git a/doc/dev/index.rst b/doc/dev/index.rst new file mode 100644 index 00000000000..69fda91850c --- /dev/null +++ b/doc/dev/index.rst @@ -0,0 +1,22 @@ +================================== + Internal developer documentation +================================== + +.. note:: If you're looking for how to use Ceph as a library from your + own software, please see :doc:`/api/index`. + +You can start a development mode Ceph cluster, after compiling the source, with:: + + cd src + install -d -m0755 out dev/osd0 + ./vstart.sh -n -x -l + # check that it's there + ./ceph health + +.. todo:: vstart is woefully undocumented and full of sharp sticks to poke yourself with. + + +.. toctree:: + :glob: + + * diff --git a/doc/dev/libs.rst b/doc/dev/libs.rst new file mode 100644 index 00000000000..bdebdb3b000 --- /dev/null +++ b/doc/dev/libs.rst @@ -0,0 +1,18 @@ +====================== + Library architecture +====================== + +Ceph is structured into libraries which are built and then combined together to +make executables and other libraries. + +- libcommon: a collection of utilities which are available to nearly every ceph + library and executable. In general, libcommon should not contain global + variables, because it is intended to be linked into libraries such as + libceph.so. + +- libglobal: a collection of utilities focused on the needs of Ceph daemon + programs. In here you will find pidfile management functions, signal + handlers, and so forth. + +.. todo:: document other libraries + diff --git a/doc/dev/logs.rst b/doc/dev/logs.rst new file mode 100644 index 00000000000..f4bef84c2ca --- /dev/null +++ b/doc/dev/logs.rst @@ -0,0 +1,31 @@ +============ + Debug Logs +============ + +The main debugging tool for Ceph is the dout and derr logging functions. +Collectively, these are referred to as "dout logging." + +Dout has several log faculties, which can be set at various log +levels using the configuration management system. So it is possible to enable +debugging just for the messenger, by setting debug_ms to 10, for example. + +Dout is implemented mainly in common/DoutStreambuf.cc + +The dout macro avoids even generating log messages which are not going to be +used, by enclosing them in an "if" statement. What this means is that if you +have the debug level set at 0, and you run this code + +``dout(20) << "myfoo() = " << myfoo() << dendl;`` + + +myfoo() will not be called here. + +Unfortunately, the performance of debug logging is relatively low. This is +because there is a single, process-wide mutex which every debug output +statement takes, and every debug output statement leads to a write() system +call or a call to syslog(). There is also a computational overhead to using C++ +streams to consider. So you will need to be parsimonius in your logging to get +the best performance. + +Sometimes, enabling logging can hide race conditions and other bugs by changing +the timing of events. Keep this in mind when debugging. diff --git a/doc/dev/object-store.rst b/doc/dev/object-store.rst new file mode 100644 index 00000000000..698070c5e7f --- /dev/null +++ b/doc/dev/object-store.rst @@ -0,0 +1,67 @@ +==================================== + Object Store Architecture Overview +==================================== + +.. graphviz:: + + /* + * Rough outline of object store module dependencies + */ + + digraph object_store { + size="7,7"; + node [color=lightblue2, style=filled, fontname="Serif"]; + + "testrados" -> "librados" + "testradospp" -> "librados" + + "rbd" -> "librados" + + "radostool" -> "librados" + + "radosgw_admin" -> "rgw" + + "rgw" -> "librados" + + "radosacl" -> "librados" + + "librados" -> "objecter" + + "ObjectCacher" -> "Filer" + + "dumpjournal" -> "Journaler" + + "Journaler" -> "Filer" + + "SyntheticClient" -> "Filer" + "SyntheticClient" -> "objecter" + + "Filer" -> "objecter" + + "objecter" -> "OSDMap" + + "cosd" -> "PG" + "cosd" -> "ObjectStore" + + "crushtool" -> "CrushWrapper" + + "OSDMap" -> "CrushWrapper" + + "OSDMapTool" -> "OSDMap" + + "PG" -> "ReplicatedPG" + "PG" -> "ObjectStore" + "PG" -> "OSDMap" + + "ReplicatedPG" -> "ObjectStore" + "ReplicatedPG" -> "OSDMap" + + "ObjectStore" -> "FileStore" + + "FileStore" -> "ext3" + "FileStore" -> "ext4" + "FileStore" -> "btrfs" + } + + +.. todo:: write more here diff --git a/doc/index.rst b/doc/index.rst index c108dffeeb4..d3ad0c669ec 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -90,9 +90,11 @@ Table of Contents architecture ops/index api/index + Internals <dev/index> man/index papers glossary + appendix/index Indices and tables diff --git a/doc/ops/autobuilt.rst b/doc/ops/autobuilt.rst new file mode 100644 index 00000000000..add9b292a92 --- /dev/null +++ b/doc/ops/autobuilt.rst @@ -0,0 +1,64 @@ +============================= + Autobuilt unstable packages +============================= + +We automatically build Debian and Ubuntu packages for any branches or +tags that appear in the |ceph.git|_. We build packages for the `amd64` +and `i386` architectures (`arch list`_), for the following +distributions (`distro list`_): + +- ``natty`` (Ubuntu 11.04) +- ``squeeze`` (Debian 6.0) + +.. |ceph.git| replace:: + ``ceph.git`` repository +.. _`ceph.git`: https://github.com/NewDreamNetwork/ceph + +.. _`arch list`: http://ceph.newdream.net/debian-snapshot-amd64/master/dists/natty/main/ +.. _`distro list`: http://ceph.newdream.net/debian-snapshot-amd64/master/dists/ + +The current status of autobuilt packages can be found at +http://ceph.newdream.net/gitbuilder-deb-amd64/ . + +If you wish to use these packages, you need to modify the +:ref:`earlier instructions <install-debs>` as follows: + +.. warning:: The following commands make your computer trust any code + that makes it into ``ceph.git``, including work in progress + branches and versions of code with possible security issues (that + were fixed afterwards). Use at your own risk! + +Whenever we say *DISTRO* below, replace it with the codename of your +operating system. + +Whenever we say *BRANCH* below, replace it with the version of the +code you want to run, e.g. ``master``, ``stable`` or ``v0.34`` (`branch list`_ [#broken-links]_). + +.. _`branch list`: http://ceph.newdream.net/debian-snapshot-amd64/ + +Run these commands on all nodes:: + + wget -q -O- https://raw.github.com/NewDreamNetwork/ceph/master/keys/autobuild.asc \ + | sudo apt-key add - + + sudo tee /etc/apt/sources.list.d/ceph.list <<EOF + deb http://ceph.newdream.net/debian-snapshot-amd64/BRANCH/ DISTRO main + deb-src http://ceph.newdream.net/debian-snapshot-amd64/BRANCH/ DISTRO main + EOF + + sudo apt-get update + sudo apt-get install ceph + +From here on, you can follow the usual set up instructions in +:doc:`/ops/install`. + + + +.. rubric:: Footnotes + +.. [#broken-links] Technical issues with how that part of the URL + space is HTTP reverse proxied means that the links in the generated + directory listings are broken. Please don't click on the links, + instead edit the URL bar manually, for now. + + .. todo:: Fix the gitbuilder reverse proxy to not break relative URLs. diff --git a/doc/ops/filesystem.rst b/doc/ops/filesystem.rst index 6a9ddeca7a6..ad302dff69c 100644 --- a/doc/ops/filesystem.rst +++ b/doc/ops/filesystem.rst @@ -3,3 +3,22 @@ ======================================= .. todo:: Benefits of each, limits on non-btrfs ones, performance data when we have them, etc + + +.. _btrfs: + +Btrfs +----- + +.. todo:: what does btrfs give you (the journaling thing) + + +ext4/ext3 +--------- + +.. _xattr: + +Enabling extended attributes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. todo:: how to enable xattr on ext4/3 diff --git a/doc/ops/index.rst b/doc/ops/index.rst index 3221394d374..915a02dcfc5 100644 --- a/doc/ops/index.rst +++ b/doc/ops/index.rst @@ -14,3 +14,5 @@ monitor grow/index data-placement + autobuilt + misc diff --git a/doc/ops/install.rst b/doc/ops/install.rst index 692ac926bec..d0b589d56fc 100644 --- a/doc/ops/install.rst +++ b/doc/ops/install.rst @@ -2,13 +2,195 @@ Installing a Ceph cluster =========================== +For development and really early stage testing, see :doc:`/dev/index`. + +For installing the latest development builds, see +:doc:`/ops/autobuilt`. + +Installing any complex distributed software can be a lot of work. We +support two automated ways of installing Ceph: using Chef_, or with +the ``mkcephfs`` shell script. + +.. _Chef: http://wiki.opscode.com/display/chef + +.. topic:: Status as of 2011-09 + + This section hides a lot of the tedious underlying details. If you + need to, or wish to, roll your own deployment automation, or are + doing it manually, you'll have to dig into a lot more intricate + details. We are working on simplifying the installation, as that + also simplifies our Chef cookbooks. + + +Installing Ceph using Chef +========================== + +(Try saying that fast 10 times.) + +.. topic:: Status as of 2011-09 + + While we have Chef cookbooks in use internally, they are not yet + ready to handle unsupervised installation of a full cluster. Stay + tuned for updates. + .. todo:: write me -Authentication is optional but very much recommended. -Basically, everything somebody needs to go through to build a new -cluster when not cheating via vstart or teuthology, but without -mentioning all the design tradeoffs and options like journaling -locations or filesystems +Installing Ceph using ``mkcephfs`` +================================== + +Pick a host that has the Ceph software installed -- it does not have +to be a part of your cluster, but it does need to have *matching +versions* of the ``mkcephfs`` command and other Ceph tools +installed. This will be your `admin host`. + + +Installing the packages +----------------------- + + +.. _install-debs: + +Debian/Ubuntu +~~~~~~~~~~~~~ + +We regularly build Debian and Ubuntu packages for the `amd64` and +`i386` architectures, for the following distributions: + +- ``sid`` (Debian unstable) +- ``squeeze`` (Debian 6.0) +- ``lenny`` (Debian 5.0) +- ``oneiric`` (Ubuntu 11.11) +- ``natty`` (Ubuntu 11.04) +- ``maverick`` (Ubuntu 10.10) + +.. todo:: http://ceph.newdream.net/debian/dists/ also has ``lucid`` + (Ubuntu 10.04), should that be removed? + +Whenever we say *DISTRO* below, replace that with the codename of your +operating system. + +Run these commands on all nodes:: + + wget -q -O- https://raw.github.com/NewDreamNetwork/ceph/master/keys/release.asc \ + | sudo apt-key add - + + sudo tee /etc/apt/sources.list.d/ceph.list <<EOF + deb http://ceph.newdream.net/debian/ DISTRO main + deb-src http://ceph.newdream.net/debian/ DISTRO main + EOF + + sudo apt-get update + sudo apt-get install ceph + + +.. todo:: For older distributions, you may need to make sure your apt-get may read .bz2 compressed files. This works for Debian Lenny 5.0.3: ``apt-get install bzip2`` + +.. todo:: Ponder packages; ceph.deb currently pulls in gceph (ceph.deb + Recommends: ceph-client-tools ceph-fuse libceph1 librados2 librbd1 + btrfs-tools gceph) (other interesting: ceph-client-tools ceph-fuse + libceph-dev librados-dev librbd-dev obsync python-ceph radosgw) + + +.. todo:: Other operating system support. + + +Creating a ``ceph.conf`` file +----------------------------- + +On the `admin host`, create a file with a name like +``mycluster.conf``. + +Here's a template for a 3-node cluster, where all three machines run a +:ref:`monitor <monitor>` and an :ref:`object store <rados>`, and the +first one runs the :ref:`Ceph filesystem daemon <cephfs>`. Replace the +hostnames and IP addresses with your own, and add/remove hosts as +appropriate. All hostnames *must* be short form (no domain). + +.. literalinclude:: mycluster.conf + :language: ini + +Note how the ``host`` variables dictate what node runs what +services. See :doc:`/ops/config` for more information. + +.. todo:: More specific link for host= convention. + +.. todo:: Point to cluster design docs, once they are ready. + +.. todo:: At this point, either use 1 or 3 mons, point to :doc:`grow/mon` + + +Running ``mkcephfs`` +-------------------- + +Verify that you can manage the nodes from the host you intend to run +``mkcephfs`` on: + +- Make sure you can SSH_ from the `admin host` into all the nodes + using the short hostnames (``myserver`` not + ``myserver.mydept.example.com``), with no user specified + [#ssh_config]_. +- Make sure you can run ``sudo`` without passphrase prompts on all + nodes [#sudo]_. + +.. _SSH: http://openssh.org/ + +If you are not using :ref:`Btrfs <btrfs>`, enable :ref:`extended +attributes <xattr>`. + +On each node, make sure the directory ``/srv/osd.N`` (with the +appropriate ``N``) exists, and the right filesystem is mounted. If you +are not using a separate filesystem for the file store, just run +``sudo mkdir /srv/osd.N`` (with the right ``N``). + +Then, using the right path to the ``mycluster.conf`` file you prepared +earlier, run:: + + mkcephfs -a -c mycluster.conf -k mycluster.keyring + +This will place an `admin key` into ``mycluster.keyring``. This will +be used to manage the cluster. Treat it like a ``root`` password to +your filesystem. + +.. todo:: Link to explanation of `admin key`. + +That should SSH into all the nodes, and set up Ceph for you. + +It does **not** copy the configuration, or start the services. Let's +do that:: + + ssh myserver01 sudo tee /etc/ceph/ceph.conf <mycluster.conf + ssh myserver02 sudo tee /etc/ceph/ceph.conf <mycluster.conf + ssh myserver03 sudo tee /etc/ceph/ceph.conf <mycluster.conf + ... + + ssh myserver01 sudo /etc/init.d/ceph start + ssh myserver02 sudo /etc/init.d/ceph start + ssh myserver03 sudo /etc/init.d/ceph start + ... + +After a little while, the cluster should come up and reach a healthy +state. We can check that:: + + ceph -k mycluster.keyring -c mycluster.conf health + 2011-09-06 12:33:51.561012 mon <- [health] + 2011-09-06 12:33:51.562164 mon2 -> 'HEALTH_OK' (0) + +.. todo:: Document "healthy" + +.. todo:: Improve output. + + + +.. rubric:: Footnotes + +.. [#ssh_config] Something like this in your ``~/.ssh_config`` may + help -- unfortunately you need an entry per node:: + + Host myserverNN + Hostname myserverNN.dept.example.com + User ubuntu + +.. [#sudo] The relevant ``sudoers`` syntax looks like this:: -At this point, either use 1 or 3 mons, point to :doc:`grow/mon` + %admin ALL=(ALL) NOPASSWD:ALL diff --git a/doc/ops/misc.rst b/doc/ops/misc.rst new file mode 100644 index 00000000000..084debc80e7 --- /dev/null +++ b/doc/ops/misc.rst @@ -0,0 +1,14 @@ +=============== + Miscellaneous +=============== + +.. todo:: This section should not exist. Try to reorganize, when + document is otherwise more ready. + + +Disabling encryption +==================== + +Authentication is optional but very much recommended. + +.. todo:: write me diff --git a/doc/ops/mycluster.conf b/doc/ops/mycluster.conf new file mode 100644 index 00000000000..454eca63bfb --- /dev/null +++ b/doc/ops/mycluster.conf @@ -0,0 +1,37 @@ +[global] + auth supported = cephx + keyring = /etc/ceph/$name.keyring + +[mon] + mon data = /srv/mon.$id + +[mds] + +[osd] + osd data = /srv/osd.$id + osd journal = /srv/osd.$id.journal + osd journal size = 1000 + +[mon.a] + host = myserver01 + mon addr = 10.0.0.101:6789 + +[mon.b] + host = myserver02 + mon addr = 10.0.0.102:6789 + +[mon.c] + host = myserver03 + mon addr = 10.0.0.103:6789 + +[osd.0] + host = myserver01 + +[osd.1] + host = myserver02 + +[osd.2] + host = myserver03 + +[mds.a] + host = myserver01 diff --git a/keys/autobuild.asc b/keys/autobuild.asc new file mode 100644 index 00000000000..2a1d17dc9ef --- /dev/null +++ b/keys/autobuild.asc @@ -0,0 +1,41 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1.4.9 (GNU/Linux) + +mQGiBE1Rr28RBADCxdpLV3ea9ocpS/1+UCvHqD5xjmlw/9dmji4qrUX0+IhPMNuA +GBBt2CRaR7ygMF5S0NFXooegph0/+NT0KisLIuhUI3gde4SWb5jsb8hpGUse9MC5 +DN39P46zZSpepIMlQuQUkge8W/H2qBu10RcwQhs7o2fZ1zK9F3MmRCkBqwCggpap +GsOgE2IlWjcztmE6xcPO0wED/R4BxTaQM+jxIjylnHgn9PYy6795yIc/ZoYjNnIh +QyjqbLWnyzeTmjPBwcXNljKqzEoA/Cjb2gClxHXrYAw7bGu7wKbnqhzdghSx7ab+ +HwIoy/v6IQqv+EXZgYHonqQwqtgfAHp5ON2gWu03cHoGkXfmA4qZIoowqMolZhGo +cF30A/9GotDdnMlqh8bFBOCMuxfRow7H8RpfL0fX7VHA0knAZEDk2rNFeebL5QKH +GNJm9Wa6JSVj1NUIaz4LHyravqXi4MXzlUqauhLHw1iG+qwZlPM04z+1Dj6A+2Hr +b5UxI/I+EzmO5OYa38YWOqybNVBH0wO+sMCpdBq0LABa8X29LbRPQ2VwaCBhdXRv +bWF0ZWQgcGFja2FnZSBidWlsZCAoQ2VwaCBhdXRvbWF0ZWQgcGFja2FnZSBidWls +ZCkgPHNhZ2VAbmV3ZHJlYW0ubmV0PohmBBMRAgAmBQJNUa9vAhsDBQkDwmcABgsJ +CAcDAgQVAggDBBYCAwECHgECF4AACgkQbq6uIgPDlRpR0QCfZnYE8vEDX4JL3sZj +5LvMsXruULIAnjHBAYvdlu5iMowoEMQDJlNNdscxuQQNBE1Rr28QEACKG04kxGY1 +cwGoInHVP6z1+8oqGiaiYWFflYRtSiwoUVtl30T1sMOSzoEvmauc+rmBBfsyaBb8 +DLDUIgGKv1FCOY/tfqnOyQXotPjgaLeCtK5A5Z5D212wbskf5fRHAxiychwKURiE +eesRa7EWrF6ohFxOTy9NOlFi7ctusShw6Q2kUtN7bQCX9hJdYs7PYQXvCXvW8DNt +7IitF7MpgMHNcj0wik6p38I4s7pqK6mqP4AXVVSWbJKr/LSz8bI8KhWRAT7erVAZ +f6FElR2xZVr3c4zsE2HFpnZTsM5y/nj8fUkgKGl8OfBuUoh+MCVfnPmE6sgWfDTK +kwWtUcmL6V9UQ1INUJ3sk+XBY9SMNbOn04su9FjQyNEMI/3VK7yuyKBRAN7IIVgP +2ch499m6+YFV9ZkG3JSTovNiqSpQouW7YPkS+8mxlPo03LQcU5bHeacBl0T8Xjlv +qu6q279EliHul4huKL0+myPN4DtmOTh/kwgSy3BGCBdS+wfAJSZcuKI7pk7pHGCd +UjNMHQZmPFbwzp33bVLd16gnAx0OW5DOn6l0VfgIQNSJ2rn7WZ5jdyg/Flp2VlWV +tAHFLzkCa+LvQ5twSuzrV/VipSr3xz3pTDLY+ZxDztvrgA6AST8+sdq6uQTYjwUQ +V0wzanvp9hkC5eqRY6YlzcgMkWFv8DCIEwADBQ//ZQaeVmG6T5vyfXf2JrCipmI4 +MAdO+ezEtWE82wgixlCvvm26UmUejCYgtD6DmwY/7/bIjvJDhUwP0+hAHHOpR62g +ncoMtbMryHpm3FvYH58JNk5gx8ZA322WEc2GCRCQzrMQoMKBcpZY/703GpQ4l3RZ +7/25gq7ANohV5zeddFQftc05PMBBJLU3U+lrnahJS1WaOXNQzS6oVj9jNda1jkgc +Qni6QssSIMT6rAPsVbGJhe9mxr2VWdQ90QlubpszIeSJuqqJxLwqH8XHXZmQOYxm +yVP9a3pFqWDmsNxDA8ttYnMIc+nUAgCDJ84ScwQ1GvoCUD1b1cFNzvvhEHsNb4D/ +XbdrFcFGwEkeyivUsojdq2YnGjYSgauqyNWbeEgBrWzUe5USYysmziL/KAubcUjI +beRGxyPS6iQ2kbvfEJJPgocWTfLs5j61FObO+MVlj+PEmxWbcsIRv/pnG2V2FPJ8 +evhzgvp7cG9imZPM6dWHzc/ZFdi3Bcs51RtStsvPqXv4icKIi+01h1MLHNBqwuUk +IiiK7ooMlvnp+DiEsVSuYYKBdGTi+4+nduuYL2g8CTNJKZuC46dY7EcE3lRYZlxl +7dwN3jfLPRlnNscs34dwhZa+b70Flia0U1DNF4jrIFFBSHD3TqMg0Z6kxp1Tfxpe +GOLOqnBWrr0GKehu9CGITwQYEQIADwUCTVGvbwIbDAUJA8JnAAAKCRBurq4iA8OV +GqKjAJ9QA7mNQs0Rko5VGYA+xjPokf0yVACfQMEFVHxT/k9+awAbBFLR3D0jjJ4= +=PYuQ +-----END PGP PUBLIC KEY BLOCK----- diff --git a/keys/release.asc b/keys/release.asc new file mode 100644 index 00000000000..a4e8441c01d --- /dev/null +++ b/keys/release.asc @@ -0,0 +1,140 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1.4.9 (GNU/Linux) + +mQGiBEtYu+8RBAC3y30BJKKV67fz6UhB0cQm1mdfYl1gBMPQv5FuX9sai7ZuzAWh +83uGXByCp6foBp33xbusVqtM9YBrswBUfR0K5SJiWrp07e/vUz6MIORakPhQ4ggK +M+pAvg+myQa9FdiAdFN1Pm7QXzziLSJ6hkNtmytkSxNQ1p/srx4PT2WHGwCg6bTk +i7mUeTw+l3MQtudEAXRwb2sD/isdW5KdWtok9UwncQlfsdv8C/coHeGLHBxvIEdh +BGHUD6DnWEIb4XbCaIBXpHu0aztm9PLE160YN8dYRMqnuwNmU7RWp0b97g641xi1 +YW/+ShBVrfUjHlYjO7vZpIJlO6rnTQB6bLuKrZ0ZG5KdMnQjbURnCzMtFPMdp0IV +P9EgA/0YAxL1AM4aTeSYLfyDSRHe5Nvv8Y0wPV8cHKQXacgP5042riyjCxY9vSkQ +yiLqKvC+ZBr93vRnLdS4UnDiOlEW1vqhbT8pRhAM7n8ngGRLWbrG7nhRo27eJVPr +0r+srwCVR+udfBS1RTnzHtXMDrpLY9GS59r9x/UquwHHsfyU1bQdU2FnZSBXZWls +IDxzYWdlQG5ld2RyZWFtLm5ldD6IYQQTEQgAIQIbAwIeAQIXgAUCS1i+bgULCQgH +AwUVCgkICwUWAgMBAAAKCRDaRCDtKImVyHpGAJ9ThO25++a5RK/HloILHOZfC2nc +BgCfVpv7PqHlvJcNLGETMjK9OjdzK12IRgQQEQgABgUCS1i/yQAKCRDOlFtnvv7s +ZKPLAKDL0hF9FEGmRCC4E7K2Og1E7ju76ACfSOFFdsso0wsvmLIzPeOuwixZBEaI +YQQTEQgAIQUCS1i77wIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRDaRCDt +KImVyAJcAKCPXc3aC7PAHuG+tJGc+ozvobkpJgCg22rH3JRr9mtYdX6RzIJCde24 +jliIRgQQEQIABgUCS4NJBAAKCRD0mo+yK5g/WDl+AKCopuwEut14Uz7FksJ/LUVT +rveKWACfbUCPrKIo7hpHHXm0w2eBsTRs/TOJAhwEEAECAAYFAkuDSRAACgkQk0OA +6Dag0KrGAQ/+IHS8fUApyCuomo318MPpHXQ1wJlnysMgmtvTJDTpsWM51iCYB3fG +Op2WHlddkRJtZkrK0Yxf6koK+HaBHj4vlQgDkGwdd9N1xqQz8N0ztrwtDOwlffdx +TVzoqaAymSvb37Eptcet0Ru+aQB6hmyFkj02SNfJ4ncPTFgabDKpXMtsZAcbBw5I +8UHXYw25XruE4M9yF4UGilySZ9wPHm5Na/F3XM/gAJS9O6+3uiB5TbXlPv1jnFbJ ++3bSb+2zz97TrqE2zvfvFd5kiUXbyYkHNvt/HJi8MQFPEeBmq6Le+dQ41jtpa0Zs +dqm6EoJATeVRuzrTOk+7+FVlPZB5ZKQWErOavWln6VDaBogE0IH2KIdxuaC7LWLG +mBcNQZGvq8Qosx60jLGYCtzdftr7pZzYl5QRsqPL7pAiiKds15HJzu4beSewPe9u +QZ1MFQ249rHn170TNDE2LsUQp4JeO5E00MExyxP6EYB3Vo6mFWoSf377GrIHbVkd +5ePPmUN3KAsK7omUwTKQpsbtcoz8oyMOgSFIkYA/SyI18riRKviL2eJQ2jI3a0M7 +G3SB2lVueyF/gf7E/+jNjeWMImd7glQ8s7v+Mbiuq7lszLljwgPsahCiVgvDS1pN +PDx8E2/VDE437RZpCmLae/JTwL61BsY9F4bB0Q4z9sEQ7IFcx1xHdfiIRgQQEQIA +BgUCS4NNKQAKCRANjRNR/daw26C8AKC6+goG/2EJ31oIg7nL/Qf1BwbHiwCfZbXY +AqCRBDWQmky8Upqc48WCgv2JAhwEEwECAAYFAkuDgOUACgkQqQNBJhIwH6LhuhAA +lFgek4UpgvuM5+3wysojdaD207WSadLAUU2uv+lSSGFaOFtEk0u41LbTSUjiFwkJ +22s7qCb4eCWPO8VKpmjlp9SWyxDn3OMRR34idrle8bEPeIJBiwkUBWO3QpRfkrbL +IuK5g9hHMKtUlKLXiFxc63RXNdekaygDlJuuvjLiN6IYvvDJK9S1Iy+PlodPjckR +J3GtxOmD2k0TFqXrfEO6qdzNVM6XQJ01irBrl1r8PYdMdk+sRjJy6GXO0KBjFNl7 +JZw71hEyvVEzquQqy5F6xvfr1Um5Y7UHpjra9rA33qn7T2IHSAAV9RF3j1y4Bmsc +o5BnMi1TmQa9zOC5oAszDPFxVfbxpHeX7v7QAiUdaCf9uw49fyOd8k0PO7Notasn +DnZPMwqINBHaTXlIOtVYiXV3FUlARxpLrqZ7JwdcIXiqRCqRqv7NlDoob1vBE5Ew +mLnAAdH6Lj/QIMkeEJzhR6k2KRPXq4SB/ggUmuzw/jAWdAyUr3kOXkPAwOKLMLh0 +FB3DK5QkbWkYGGm4zYBmVBSpcUt5dLUqX9O0qhKA1F287y8eF/E7ArCaoToVNAZx +6swnNdQNgOqKASRikLANJB0Qof1iSVh6dMbJUR7D8iHuowcSPDP/1JGNn7VFDeo/ +QsQOaAtL+nhJlFpqIcwaTTXO7KTN0sHGAxlsZQtkppKIRgQTEQIABgUCS4S7NwAK +CRAjcDdYmlP8Ku40AJwL1Hh/05el/VYQe/JgDB17UJVawACfX9vGPaJIvj5lpYic +OGn4VnNlQJ2IRgQTEQIABgUCS4OP4AAKCRAR6PE9C66WSJotAJ9cKKaTbaHjcjvM +YOnAhhuXcqD9KwCeMwxZTlfAVMEGyWcq5NOvnqAfzLqIRgQTEQIABgUCS4SxkwAK +CRAhGTfVLP6ynenjAJ0XOHfVA1uAulVfHNtR4F8NGb6+pwCdEuC9RiBjNSV+vdCQ +lFsFganlnNmIRgQTEQIABgUCS4TAdQAKCRD/XLkM/+lAh7PHAJ0caUFxh9JVKxNU +pSXT3usvz/49wgCfYHTNkZYaTyML+zSZ3kpf2/UiPPGJARwEEwECAAYFAkuEy90A +CgkQtlATpJN1Xgh0NAf/drKpNHibzd43ipM1KGA1CJI9QCU+pH8P37hf7KbpDBMz +XKzsYzijuqUtlt+bokKpHj17OUpdGhvk3/gdHTBqjoj+2S4SZOrfhbbczCP9YgxX +fHtPWVpxQDliND3B+N2LnwrOGBVtfL2QlHY5pfTevzOZnvo2ivNQ/5NqVjWc1zxe +2yIhDFNaTfgr+GP8wO3G8vzY+kdPVCZA1SdBRCvMzJakFICpwXex/yi1CR89DtMV +5eqD5pgtbW3z8J75hF+x/FOs87EIwtUUBqq5XKE81VDPF5d1gO/AQ5Aw1JYvFgL/ +dFl7BAc4yMB6n61Hnz7ykTtUuzcYDPNF8tNxjJ5BBIkBHAQQAQIABgUCS4YYvAAK +CRCGiaJaiNv9F+PwCAC/1mCxwgnQ/rqEBCCIZ7YnIGDdoyLB+samhYNGbYRMX9HS +8i6/bX/G6U9pbsRhHXdLPUrU5qpFfjec7RJg/+QJReHmCKbG+Ed4SvZDTlR/R/VB +vOEyitV1+jnPyPJYNjvRC7E6IPL8n8iRPsGf5Pj/mAwPQIugu23TM/zcUGaWpkJe +V7hElpXQsJx9pzvi/MC51L/wE0N+TClZGMoIIuCJrfkWrgZ/P4rtdlnH8fdTkEgj +P0KGMZ1mevk4LLrzUqR9+TAmEzOnSNK2BmV49Dz7cGNyUBJrT7uTdor3KVUNHSjH +C3kLm1gHxiHXrHeSUU/lwG1OQvijAGTqzM7qI3KfiEYEExECAAYFAkuGaakACgkQ +O/Dos2Fb9DSB8wCfWwOU744GNjWzho7/dJbetsZFq74AnjFJcAOm6AhnS56UdJMx +QCJBPK5WiEYEExECAAYFAkuHPcwACgkQ9e/J5tUGicVt2ACfUkYQe1ge6/Auq8nE +KCkig2SCZJoAoJtdSPKPDLb5oyhnyl9VzRnAnSmmiEYEExECAAYFAkuHskcACgkQ +LBV88STLCDnYvQCfRB7Bfd54qXnhoGKE/DCZ5pqNEj0An34Xc/H1gri/J74ajomK +b1ZgI/dgtBtTYWdlIFdlaWwgPHNhZ2VAa2VybmVsLm9yZz6IYQQTEQgAIQUCS1jH +/gIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRDaRCDtKImVyIX/AJwNXg/7 +uBgx6wnd0cfZtYuxFKNwmwCfQBEAL6Ml9lams5KOTNlk6Rp1kKeIRgQQEQIABgUC +S4NJBAAKCRD0mo+yK5g/WCZgAKDCGALmU3MINVnZ7xWhgeX2u447rACeKZ2gkcYf +Ctv/6BhsIs+mQMDVQl6JAhwEEAECAAYFAkuDSRAACgkQk0OA6Dag0KpWGg/6A7f1 +yveGxH1vOdJHWFEW6OoMSpAEEhmV61Xl/ExPyqn+0Z8nHa9mKrkv8HP53Grfk7Vz +QjoZPYWprIJsljuFSig+Dr5UaW8gyiFtzbICZXRmGI4yR1ZI0gGbWfx59Ypavd4U +gWym7SCFOzSvwh9VyymB0DANmEp6MHH7KMnDOnbxu9TbDg2ShWVyR+DKNg9KZUM/ +AsT+vNJubKwo7wd9v6cJxVnZ1RO3Iyy3119Q0N0FP28YoijaWrIpguoBPgAeqkLn +KgIVme+QP7cMQFxBqBwtSu7Jo5Jjo33CvmfXftUCV/JQCkbcbPR5thhh6wt/4VfM +/VvQO3N03joR+brNEshS9YqdrVSl1IYdUJ9HR098Nkgjx7ZZrSYuCF3CeaxXsFgv +xKPmZ34ZalnvvJsp2SE2vYS0NqH56fQ+pxCacO5u/OUXBvzFn/Ih/C7hGdPwUWnz +t4lRo44Hb2uXGInSVIDOgxszvnToOy55yZe/sAUYE/HIjmjvGga3Qx4UH6kT0k/Q +3YncOC5Y2eckPefwNvvtcv8oet2/d1zX81/R1CMCJJnuVDvI570gFGZd2/yHP8uH +vP4aBFuQfoCeKsWx13+WQ4FCinwK1clnVeMxAY9CufO6OdSPy2rg8PECBaM8S3zV +0MLO2kozQU7GCmTTZ3r7GcXCR2xeayFahHC5ZwGIRgQQEQIABgUCS4NNIAAKCRAN +jRNR/daw232FAKCcVRTO27X2OIm+UFiIDfuo32ACRwCgvS0gMza5jH4H0vikeyWp +c7Nn566JAhwEEwECAAYFAkuDgOAACgkQqQNBJhIwH6L9Mg/8DJ2F/CjNWqiI5DAC +aoKEF4xszO/gqd15hnGe/OEgszMU5etm9EAKAW4a+2i2XgzdmxvRKudJS69Nze1D +y1yzOB6688raRTNc3bd33Sd/vtAht1hwjI1UitxXr/71jxU+lEiXizFyU4gF1Omc +LuyOj8Ap2m4eR6GlLu/MCTTLoBYGPGpiMQQ4kA6A1wRYQkemBnYozn2wUw12EuUs +znaLClexhG2h/V6JPXdn/lTfeMQdLG/+qzVdN64ZvuLAGZHcmMjrdsrcSSvVLfYP +5Hn43NO0W+mntQGyCgvllYUeVGhXHGnwRZnPiIm+vtkRtHmyv/T6ru4XUknGr7n8 +6LF4CEN2hCwMrFeb5cdYCPEnEgpON9f0KU+E96wxHVH844iXg5fDa98P2MpDEEQB +Psvh11VzPpUQwdrPxfus+uf26PhOAxsyNAOt4VMtYtT6LxADf4tQmkh3OkmSe/1Q +0oAm2M96I1Pf8iDdSG1xRISxd7zPbQZefNXwqMTJEt7YdqVJPTtAejmIcRFV5ypn +XLvteeBYFZO5YWGOOeLedYOVRB7KVitYmWQRTQk8zZJyvKuF+/nmuseuczT31hlu +JxJAjjbZ0PxSSkrNuwqiLqj8bMMKkXC2423g4SEJF6VzNKwUtKuGlKNLxCyOfNFQ +IX4AlD/HgaWHd1giZFJMhwm94aGIRgQTEQIABgUCS4OP3QAKCRAR6PE9C66WSGDp +AJ0dHSfxYPlUBB3f2b/iIAK4kbxmnwCbB7wXSFR1Rse5AnXOHa3adu3lnZaJARwE +EAECAAYFAkuGGLwACgkQhomiWojb/ReoCAgA4SHL0WIarFR0ssaXh0lNO5DEY9xr +qio/H0RN32BtkLSoYTxfQqwRIWTacfjOEX6NUq5oypE/vEnbFhDOPHqf+PBKp97D +wNjS/07KyhrCA8LDbCQRHpmRNDimPwqFdOahXEctoYKFu+GCqfvism1AIHv26ZKA +hSHjkMTfmDJeQbnvWgSRHU5bM2q+eo3Mt42fRbs+Z+QPYsSs0vDxEmcEkwP26oLT +4UAueDVHlbyKEv3CwqUJu2uXYAmYOXD3AlaCaPfD0833Wk/wcTdwK9sj9P0O8/lv +NQ8TSFkmbk9jkkKerxm2IbO5iz9STWDjauO2UvNWXeS5OV+fvAfargsyPrkEDQRL +WLvvEBAAp215RD7U20GFb0VqZ1EmBJJlCFJALyxmPpxAjtabxZ5jjIwSSbcVSPt5 +mi3IvLpT56X5jMkUQXj+g5NcPFcquNJn4x8p3rTreiS6x3M/vuWYbZPMeCtACfQI +bRDiix7eFX5aY1C2AwQAludAYPMmuiWS6HC8OttSJSrusJeHbSlVPsK46xX3c706 +HHJLXQuDP/in496KOZMVmWH8xhqAohdHrJyMNO4rj71R7pigHZT5YryuJu4fW95K +9LrMkWG1J49CcLKiz4jiDfVSEZDBvDZZ9XkHqFbIKBVSWJnJr/k76myalTMpZGTq +3RfdZ/soIeNU+yGscCGKKZvh2845oy6LJNotRMI0oy40bcfjzA7BTTipeZ9ChPRs +H8RoQH1eoPREwLOqtk77gXKDgQgpWqQeRDt5+vF8lapwcnIpt95LKAM2epp7FZZ0 +Tv7lywU2SeFTSi5Gi7jYtWTtXzwCepSl+VXZ5ai3ZuQiCV0b38Ijds56SKkfw/ZU +y8cZzs55SOAdamBTB0hB+6IVAKW0tVSTVCEBYLr+xu+Cfpj8tWEsFVs5e7t3EwBS +sUF3WTLD6rjPNtSwRUhwxRkh41kKzQhUBteyK+ruk0Z5mNBxOphKqIGeMB5ry32S +i/nnQiKNHaqX7t0RrTOlbxf8n0XvfUoEr4quRGrU1TzAGdoetF8ABR0P/j5FU5Iu +rlzhJS9SiIu6pYqkBfk+fKvtEFGXYQfIsJQSiCmj6YHQs1WVvnqc8TeiwwvH0J8D +fwDplkhUJ0VNz5KtehN8/uRhmJSpMc0Up0x/VajN2q7k5BNal0GC1/rOrYFTsdyi +YvGl50D4dG5M7VmrYmaXTKZ1HeC0vTqcH12a0lGEB9IYjma1Y6MWxUr79wWPS361 +lsxuwdXpKoHKQfT8LUYrXMLmIuuCtZvqPHJ0HblIp9UBNwTfqL/8g3A3KyEdT6iM +lF3cUPsigqISPVLFocQ3Q2LnypZkE3XoqM/JsK4sFfFwg4XDaIsORt+YG66vsZEt +STwM44wxyZm5pgjloKsdY+UwP1w/OBPhhNZq/QPCsL+pKbdbt3Nx9CO84GiyoxhQ +zVLKGmTFCvilyW3nauojV4WM+Evu3dVS56tq+oNk9ttOH+5UsLcvMiaA/hS3q/O2 +jf7wS8dHOdTn1elHqAJPUbe/HlReTrAU1ll4rujKWi8ZQabd4RoOTjfIPxAOHszO +pmFwHrpuL/7jH8TGcNK/6GbVeOVFlFU40klJIJ5O5He91IqSFyZoTMqFqloeGxuC +JJV+uVNEPEyM5LCLdag8BvCnfhZZbEiUd2A5th2Pc+yKwxZ3s/Np7HfYkkug5JZ5 +75g6DNinZfPFGga5g6DWgkdPyoxrS4zhU4qoiEkEGBEIAAkFAktYu+8CGwwACgkQ +2kQg7SiJlchkaACglP00AlVap07y7/0Ul7XNS+C/seUAnRp6f33KAyusDGes/tMH +xG6RQNuruQINBEtYvpkBEACvQ/WYV1nJRqyq/ffX7xIAxTGS4t80ra2bOaAURPNX +aRDOPK15Toz4/Ct89eiEohA8xLsrpssC5cEx7oS5g/XBfYDhOXnwtNf/LAgp44GY +jhTjP/Oc+1z3Q3SJvgmCJbEgKtAlzE7nXYcagjvNUIiQa9FgJ5M58vN284Evfm5C +omIbduiJEAWMjJk8is8gNoX8zCdX0TIoaznOiJ1g3dsuUygZ/oZ4UFx6Ph5Lsaaq +XGg6iBHA730Wk3xxOG4ndlkWuLbvKd0vb2I/umQOEcuPoxR6B1NLB2f6GulEY9/P +JCfdPX3NbZgJoI8nS7GqDaJB45DXiuAU02W1kHU9vTQOKp6UJKUiff4NF7zpefef +UNpMCWeyYKGyS3U4Bl17pEI1n9iH1oNyrLQljDlAGjhIAlT/oF/Qbm7nNl/iTO30 +HMXIuXy8yFfMtjolEhOiwRCCK+ooQ000BGwM17yppVNa1i6lDiiHvG3FdVoV1HF1 +hOgHXS4j6fbWmSOwKewKHK2+GBJBxppuedLzuxCtxmQOwFBhAoiJokCXDo1nGE85 ++/dJdyN/dZyBCQ3NyC8A7BB6nK3uUWO/b4wwWZjpxpyrHs6+SgrKIkj12rnaQuAI +SNbVD++y/QJcUVXlqsJkVj4v04aWAwdNWlOrYT770xUzc8xUmPbsd89QgLfqyMFo +hQARAQABiEkEGBEIAAkFAktYvpkCGwwACgkQ2kQg7SiJlcgRgwCghJWbAL8wv7Q0 +tS2vGOq9DWh1N28An2yMBPSf/WaHuo+mtsKqMk3enf6R +=6gus +-----END PGP PUBLIC KEY BLOCK----- diff --git a/man/mkcephfs.8 b/man/mkcephfs.8 index f066d789462..51fed771f90 100644 --- a/man/mkcephfs.8 +++ b/man/mkcephfs.8 @@ -61,7 +61,7 @@ executing commands via SSH. Use the given conf file instead of the default \fI/etc/ceph/ceph.conf\fP. .TP \fB\-k\fI /path/to/keyring\fR -When \fB\-a\fR is used, we can special a location to copy the +When \fB\-a\fR is used, we can specify a location to copy the client.admin keyring, which is used to administer the cluster. The default is \fI/etc/ceph/keyring\fP (or whatever is specified in the config file). diff --git a/qa/workunits/false.sh b/qa/workunits/false.sh new file mode 100644 index 00000000000..8a961b329d5 --- /dev/null +++ b/qa/workunits/false.sh @@ -0,0 +1,3 @@ +#!/bin/sh -ex + +false
\ No newline at end of file diff --git a/src/Makefile.am b/src/Makefile.am index e53b68def91..5139235121e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -317,12 +317,12 @@ my_radosgw_src = \ rgw/rgw_access.cc \ rgw/rgw_op.cc \ rgw/rgw_rest.cc \ - rgw/rgw_rest_os.cc \ + rgw/rgw_rest_swift.cc \ rgw/rgw_rest_s3.cc \ rgw/rgw_common.cc \ rgw/rgw_cache.cc \ - rgw/rgw_os.cc \ - rgw/rgw_os_auth.cc \ + rgw/rgw_swift.cc \ + rgw/rgw_swift_auth.cc \ rgw/rgw_formats.cc \ rgw/rgw_log.cc \ rgw/rgw_multi.cc \ @@ -885,6 +885,7 @@ libmds_a_SOURCES = \ mds/Dumper.cc \ mds/Resetter.cc \ mds/MDS.cc \ + mds/flock.cc \ mds/locks.c \ mds/journal.cc \ mds/Server.cc \ @@ -1118,8 +1119,8 @@ noinst_HEADERS = \ include/rbd/librbd.h\ include/rbd/librbd.hpp\ logrotate.conf\ - mds/flock.h\ mds/inode_backtrace.h\ + mds/flock.h\ mds/locks.c\ mds/locks.h\ mds/Anchor.h\ @@ -1333,11 +1334,11 @@ noinst_HEADERS = \ rgw/rgw_log.h\ rgw/rgw_multi.h\ rgw/rgw_op.h\ - rgw/rgw_os.h\ - rgw/rgw_os_auth.h\ + rgw/rgw_swift.h\ + rgw/rgw_swift_auth.h\ rgw/rgw_rados.h\ rgw/rgw_rest.h\ - rgw/rgw_rest_os.h\ + rgw/rgw_rest_swift.h\ rgw/rgw_rest_s3.h\ rgw/rgw_tools.h\ rgw/rgw_bucket.h\ diff --git a/src/client/Client.cc b/src/client/Client.cc index bb2cfb90c43..c5fa15abf79 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -2948,11 +2948,13 @@ void Client::handle_cap_import(Inode *in, MClientCaps *m) m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(), CEPH_CAP_FLAG_AUTH); - // reflush any/all caps - if (in->cap_snaps.size()) - flush_snaps(in, true); - if (in->flushing_caps) - flush_caps(in, mds); + if (in->auth_cap && in->auth_cap->session->mds_num == mds) { + // reflush any/all caps (if we are now the auth_cap) + if (in->cap_snaps.size()) + flush_snaps(in, true); + if (in->flushing_caps) + flush_caps(in, mds); + } if (m->get_mseq() > in->exporting_mseq) { ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq() @@ -3279,7 +3281,7 @@ void Client::unmount() // NOTE: i'm assuming all caches are already flushing (because all files are closed). - //clean up any unclosed files + // clean up any unclosed files if (!fd_map.empty()) std::cerr << "Warning: Some files were not closed prior to unmounting;\n" << "Ceph is closing them now.\n"; @@ -3287,7 +3289,8 @@ void Client::unmount() int fd = fd_map.begin()->first; assert(fd_map.count(fd)); Fh *fh = fd_map[fd]; - _release(fh); + lderr(cct) << " destroying lost open file " << fh << " on " << *fh->inode << dendl; + _release_fh(fh); fd_map.erase(fd); } @@ -4823,6 +4826,29 @@ Fh *Client::_create_fh(Inode *in, int flags, int cmode) return f; } +int Client::_release_fh(Fh *f) +{ + //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; + //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl; + Inode *in = f->inode; + ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl; + + if (in->snapid == CEPH_NOSNAP) { + if (in->put_open_ref(f->mode)) { + _flush(in); + check_caps(in, false); + } + } else { + assert(in->snap_cap_refs > 0); + in->snap_cap_refs--; + } + + put_inode( in ); + delete f; + + return 0; +} + int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid) { int cmode = ceph_flags_to_mode(flags); @@ -4866,9 +4892,6 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid, int gid) return result; } - - - int Client::close(int fd) { ldout(cct, 3) << "close enter(" << fd << ")" << dendl; @@ -4878,36 +4901,12 @@ int Client::close(int fd) assert(fd_map.count(fd)); Fh *fh = fd_map[fd]; - _release(fh); + _release_fh(fh); fd_map.erase(fd); ldout(cct, 3) << "close exit(" << fd << ")" << dendl; return 0; } -int Client::_release(Fh *f) -{ - //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; - //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl; - Inode *in = f->inode; - ldout(cct, 5) << "_release " << f << " mode " << f->mode << " on " << *in << dendl; - - if (in->snapid == CEPH_NOSNAP) { - if (in->put_open_ref(f->mode)) { - _flush(in); - check_caps(in, false); - } - } else { - assert(in->snap_cap_refs > 0); - in->snap_cap_refs--; - } - - put_inode( in ); - delete f; - - return 0; -} - - // ------------ // read, write @@ -6735,7 +6734,7 @@ int Client::ll_release(Fh *fh) tout(cct) << "ll_release" << std::endl; tout(cct) << (unsigned long)fh << std::endl; - _release(fh); + _release_fh(fh); return 0; } diff --git a/src/client/Client.h b/src/client/Client.h index c094082c92f..f4a5f1404f7 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -467,6 +467,7 @@ private: void _ll_drop_pins(); Fh *_create_fh(Inode *in, int flags, int cmode); + int _release_fh(Fh *fh); int _read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl); int _read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl); @@ -491,7 +492,6 @@ private: int _removexattr(Inode *in, const char *nm, int uid=-1, int gid=-1); int _open(Inode *in, int flags, mode_t mode, Fh **fhp, int uid=-1, int gid=-1); int _create(Inode *in, const char *name, int flags, mode_t mode, Inode **inp, Fh **fhp, int uid=-1, int gid=-1); - int _release(Fh *fh); loff_t _lseek(Fh *fh, loff_t offset, int whence); int _read(Fh *fh, int64_t offset, uint64_t size, bufferlist *bl); int _write(Fh *fh, int64_t offset, uint64_t size, const char *buf); diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 9ea7e16d59c..aeaceec3f10 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -317,7 +317,7 @@ static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, static void ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { const struct fuse_ctx *ctx = fuse_req_ctx(req); - Fh *fh; + Fh *fh = NULL; int r = client->ll_open(fino_vino(ino), fi->flags, &fh, ctx->uid, ctx->gid); if (r == 0) { fi->fh = (long)fh; @@ -443,7 +443,7 @@ static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, const struct fuse_ctx *ctx = fuse_req_ctx(req); struct fuse_entry_param fe; memset(&fe, 0, sizeof(fe)); - Fh *fh; + Fh *fh = NULL; int r = client->ll_create(fino_vino(parent), name, mode, fi->flags, &fe.attr, &fh, ctx->uid, ctx->gid); if (r == 0) { fi->fh = (long)fh; diff --git a/src/common/config.cc b/src/common/config.cc index c75e066b51f..f618d9d2e40 100644 --- a/src/common/config.cc +++ b/src/common/config.cc @@ -433,6 +433,11 @@ struct config_option config_optionsp[] = { OPTION(rgw_socket_path, OPT_STR, NULL), // path to unix domain socket, if not specified, rgw will not run as external fcgi OPTION(rgw_op_thread_timeout, OPT_INT, 10*60), OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 60*60), + OPTION(rgw_thread_pool_size, OPT_INT, 100), + OPTION(rgw_maintenance_tick_interval, OPT_DOUBLE, 10.0), + OPTION(rgw_pools_preallocate_max, OPT_INT, 100), + OPTION(rgw_pools_preallocate_threshold, OPT_INT, 70), + OPTION(rgw_log_nonexistent_bucket, OPT_BOOL, false), // see config.h OPTION(internal_safe_to_start_threads, OPT_BOOL, false), diff --git a/src/common/config.h b/src/common/config.h index 1d4bfbfc9be..b072edbff32 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -567,6 +567,11 @@ public: string rgw_socket_path; int rgw_op_thread_timeout; int rgw_op_thread_suicide_timeout; + int rgw_thread_pool_size; + double rgw_maintenance_tick_interval; + int rgw_pools_preallocate_max; + int rgw_pools_preallocate_threshold; + bool rgw_log_nonexistent_bucket; // This will be set to true when it is safe to start threads. // Once it is true, it will never change. diff --git a/src/cosd.cc b/src/cosd.cc index 82fa291901c..e2ce3cc347d 100644 --- a/src/cosd.cc +++ b/src/cosd.cc @@ -202,15 +202,13 @@ int main(int argc, const char **argv) exit(0); } - int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal); - if (err < 0) { - derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data - << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; - exit(1); - } if (convertfilestore) { - derr << "Converted Filestore " << g_conf->osd_data << dendl; - exit(0); + int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal); + if (err < 0) { + derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data + << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; + exit(1); + } } string magic; @@ -302,6 +300,14 @@ int main(int argc, const char **argv) // Leave stderr open in case we need to report errors. global_init_daemonize(g_ceph_context, CINIT_FLAG_NO_CLOSE_STDERR); common_init_finish(g_ceph_context); + + int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal); + if (err < 0) { + derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data + << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; + exit(1); + } + MonClient mc(g_ceph_context); if (mc.build_initial_monmap() < 0) return -1; diff --git a/src/doc/object_store.dot b/src/doc/object_store.dot deleted file mode 100644 index 08328f07a24..00000000000 --- a/src/doc/object_store.dot +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Rough outline of object store module dependencies - * - * build with - * dot -Tpng < ./object_store.dot > object_store.png - */ - -digraph object_store { - size="16,16"; - node [color=lightblue2, style=filled, fontname="Serif"]; - - "testrados" -> "librados" - "testradospp" -> "librados" - - "rbd" -> "librados" - - "radostool" -> "librados" - - "radosgw_admin" -> "rgw" - - "rgw" -> "librados" - - "radosacl" -> "librados" - - "librados" -> "objecter" - - "ObjectCacher" -> "Filer" - - "dumpjournal" -> "Journaler" - - "Journaler" -> "Filer" - - "SyntheticClient" -> "Filer" - "SyntheticClient" -> "objecter" - - "Filer" -> "objecter" - - "objecter" -> "OSDMap" - - "cosd" -> "PG" - "cosd" -> "ObjectStore" - - "crushtool" -> "CrushWrapper" - - "OSDMap" -> "CrushWrapper" - - "OSDMapTool" -> "OSDMap" - - "PG" -> "ReplicatedPG" - "PG" -> "ObjectStore" - "PG" -> "OSDMap" - - "ReplicatedPG" -> "ObjectStore" - "ReplicatedPG" -> "OSDMap" - - "ObjectStore" -> "FileStore" - - "FileStore" -> "ext3" - "FileStore" -> "ext4" - "FileStore" -> "btrfs" -} diff --git a/src/include/object.h b/src/include/object.h index bc56445fd1e..706caa6bffc 100644 --- a/src/include/object.h +++ b/src/include/object.h @@ -261,14 +261,15 @@ namespace __gnu_cxx { struct hobject_t { object_t oid; + string key; snapid_t snap; uint32_t hash; hobject_t() : snap(0), hash(0) {} - hobject_t(object_t oid, snapid_t snap, uint32_t hash) : - oid(oid), snap(snap), hash(hash) {} - hobject_t(const sobject_t &soid, uint32_t hash) : - oid(soid.oid), snap(soid.snap), hash(hash) {} + hobject_t(object_t oid, const string &key, snapid_t snap, uint32_t hash) : + oid(oid), key(key), snap(snap), hash(hash) {} + hobject_t(const sobject_t &soid, const string &key, uint32_t hash) : + oid(soid.oid), key(key), snap(soid.snap), hash(hash) {} /* Do not use when a particular hash function is needed */ explicit hobject_t(const sobject_t &o) : @@ -279,9 +280,11 @@ struct hobject_t { void swap(hobject_t &o) { hobject_t temp(o); o.oid = oid; + o.key = key; o.snap = snap; o.hash = hash; oid = temp.oid; + key = temp.key; snap = temp.snap; hash = temp.hash; } @@ -291,8 +294,9 @@ struct hobject_t { } void encode(bufferlist& bl) const { - __u8 version = 0; + __u8 version = 1; ::encode(version, bl); + ::encode(key, bl); ::encode(oid, bl); ::encode(snap, bl); ::encode(hash, bl); @@ -300,6 +304,8 @@ struct hobject_t { void decode(bufferlist::iterator& bl) { __u8 version; ::decode(version, bl); + if (version >= 1) + ::decode(key, bl); ::decode(oid, bl); ::decode(snap, bl); ::decode(hash, bl); diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h index 8389f98972a..886e04e8636 100644 --- a/src/include/rbd/librbd.h +++ b/src/include/rbd/librbd.h @@ -37,8 +37,7 @@ extern "C" { typedef void *rbd_snap_t; typedef void *rbd_image_t; -typedef int (*librbd_copy_progress_fn_t)(uint64_t offset, uint64_t src_size, - void *data); +typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr); typedef struct { uint64_t id; @@ -73,7 +72,7 @@ int rbd_resize(rbd_image_t image, uint64_t size); int rbd_stat(rbd_image_t image, rbd_image_info_t *info, size_t infosize); int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx, const char *destname); int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, - const char *destname, librbd_copy_progress_fn_t fn, void *data); + const char *destname, librbd_progress_fn_t fn, void *data); /* snapshots */ int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps, int *max_snaps); diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp index 956f072576e..6a685cccf8b 100644 --- a/src/include/rbd/librbd.hpp +++ b/src/include/rbd/librbd.hpp @@ -41,12 +41,12 @@ namespace librbd { typedef rbd_image_info_t image_info_t; -class ProgressContext -{ -public: - virtual ~ProgressContext(); - virtual int update_progress(uint64_t offset, uint64_t src_size) = 0; -}; + class ProgressContext + { + public: + virtual ~ProgressContext(); + virtual int update_progress(uint64_t offset, uint64_t total) = 0; + }; class RBD { diff --git a/src/librbd.cc b/src/librbd.cc index 3846cb7aeae..0a31e278c1f 100644 --- a/src/librbd.cc +++ b/src/librbd.cc @@ -1053,7 +1053,7 @@ public: class CProgressContext : public librbd::ProgressContext { public: - CProgressContext(librbd_copy_progress_fn_t fn, void *data) + CProgressContext(librbd_progress_fn_t fn, void *data) : m_fn(fn), m_data(data) { } @@ -1062,7 +1062,7 @@ public: return m_fn(offset, src_size, m_data); } private: - librbd_copy_progress_fn_t m_fn; + librbd_progress_fn_t m_fn; void *m_data; }; @@ -1761,7 +1761,7 @@ extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p, const char *des } extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, - const char *destname, librbd_copy_progress_fn_t fn, void *data) + const char *destname, librbd_progress_fn_t fn, void *data) { librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; librados::IoCtx dest_io_ctx; diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index aa306adfcb8..528d1f8b970 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -129,6 +129,10 @@ ostream& operator<<(ostream& out, CInode& in) if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; + inode_t *pi = in.get_projected_inode(); + if (pi->is_truncating()) + out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")"; + // anchors if (in.is_anchored()) out << " anc"; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index e07479e2fe4..5422f5b4992 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5232,6 +5232,8 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls) << pi->truncate_from << " -> " << pi->truncate_size << " on " << *in << dendl; + assert(pi->is_truncating()); + in->auth_pin(this); SnapRealm *realm = in->find_snaprealm(); diff --git a/src/mds/flock.cc b/src/mds/flock.cc new file mode 100644 index 00000000000..b97aa013118 --- /dev/null +++ b/src/mds/flock.cc @@ -0,0 +1,488 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <errno.h> + +#include "common/debug.h" +#include "mdstypes.h" +#include "mds/flock.h" + +bool ceph_lock_state_t::is_waiting(ceph_filelock &fl) +{ + multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start); + while (p != waiting_locks.end()) { + if (p->second.start > fl.start) + return false; + if (p->second.length == fl.length && + p->second.client == fl.client && + p->second.pid == fl.pid && + p->second.pid_namespace == fl.pid_namespace) + return true; + ++p; + } + return false; +} + +void ceph_lock_state_t::remove_waiting(ceph_filelock& fl) +{ + multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start); + while (p != waiting_locks.end()) { + if (p->second.start > fl.start) + return; + if (p->second.length == fl.length && + p->second.client == fl.client && + p->second.pid == fl.pid && + p->second.pid_namespace == fl.pid_namespace) { + waiting_locks.erase(p); + return; + } + ++p; + } +} + +bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock, bool wait_on_fail) +{ + dout(15) << "add_lock " << new_lock << dendl; + bool ret = false; + list<multimap<uint64_t, ceph_filelock>::iterator> + overlapping_locks, self_overlapping_locks, neighbor_locks; + + // first, get any overlapping locks and split them into owned-by-us and not + if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) { + dout(15) << "got overlapping lock, splitting by owner" << dendl; + split_by_owner(new_lock, overlapping_locks, self_overlapping_locks); + } + if (!overlapping_locks.empty()) { //overlapping locks owned by others :( + if (CEPH_LOCK_EXCL == new_lock.type) { + //can't set, we want an exclusive + dout(15) << "overlapping lock, and this lock is exclusive, can't set" + << dendl; + if (wait_on_fail) { + waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); + } + } else { //shared lock, check for any exclusive locks blocking us + if (contains_exclusive_lock(overlapping_locks)) { //blocked :( + dout(15) << " blocked by exclusive lock in overlapping_locks" << dendl; + if (wait_on_fail) { + waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); + } + } else { + //yay, we can insert a shared lock + dout(15) << "inserting shared lock" << dendl; + adjust_locks(self_overlapping_locks, new_lock, neighbor_locks); + held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); + ret = true; + } + } + } else { //no overlapping locks except our own + adjust_locks(self_overlapping_locks, new_lock, neighbor_locks); + dout(15) << "no conflicts, inserting " << new_lock << dendl; + held_locks.insert(pair<uint64_t, ceph_filelock> + (new_lock.start, new_lock)); + ret = true; + } + if (ret) + ++client_held_lock_counts[(client_t)new_lock.client]; + else if (wait_on_fail) + ++client_waiting_lock_counts[(client_t)new_lock.client]; + return ret; +} + +void ceph_lock_state_t::look_for_lock(ceph_filelock& testing_lock) +{ + list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks, + self_overlapping_locks; + if (get_overlapping_locks(testing_lock, overlapping_locks)) { + split_by_owner(testing_lock, overlapping_locks, self_overlapping_locks); + } + if (!overlapping_locks.empty()) { //somebody else owns overlapping lock + if (CEPH_LOCK_EXCL == testing_lock.type) { //any lock blocks it + testing_lock = (*overlapping_locks.begin())->second; + } else { + ceph_filelock *blocking_lock; + if ((blocking_lock = contains_exclusive_lock(overlapping_locks))) { + testing_lock = *blocking_lock; + } else { //nothing blocking! + testing_lock.type = CEPH_LOCK_UNLOCK; + } + } + return; + } + //if we get here, only our own locks block + testing_lock.type = CEPH_LOCK_UNLOCK; +} + +void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock, + list<ceph_filelock>& activated_locks) +{ + list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks, + self_overlapping_locks, crossed_waiting_locks; + if (get_overlapping_locks(removal_lock, overlapping_locks)) { + dout(15) << "splitting by owner" << dendl; + split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks); + } else dout(15) << "attempt to remove lock at " << removal_lock.start + << " but no locks there!" << dendl; + bool remove_to_end = (0 == removal_lock.length); + bool old_lock_to_end; + uint64_t removal_start = removal_lock.start; + uint64_t removal_end = removal_start + removal_lock.length - 1; + uint64_t old_lock_end; + __s64 old_lock_client = 0; + ceph_filelock *old_lock; + + dout(15) << "examining " << self_overlapping_locks.size() + << " self-overlapping locks for removal" << dendl; + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = self_overlapping_locks.begin(); + iter != self_overlapping_locks.end(); + ++iter) { + dout(15) << "self overlapping lock " << (*iter)->second << dendl; + old_lock = &(*iter)->second; + old_lock_to_end = (0 == old_lock->length); + old_lock_end = old_lock->start + old_lock->length - 1; + old_lock_client = old_lock->client; + if (remove_to_end) { + if (old_lock->start < removal_start) { + old_lock->length = removal_start - old_lock->start; + } else { + dout(15) << "erasing " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } else if (old_lock_to_end) { + ceph_filelock append_lock = *old_lock; + append_lock.start = removal_end+1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (append_lock.start, append_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + if (old_lock->start >= removal_start) { + dout(15) << "erasing " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } else old_lock->length = removal_start - old_lock->start; + } else { + if (old_lock_end > removal_end) { + ceph_filelock append_lock = *old_lock; + append_lock.start = removal_end + 1; + append_lock.length = old_lock_end - append_lock.start + 1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (append_lock.start, append_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + } + if (old_lock->start < removal_start) { + old_lock->length = removal_start - old_lock->start; + } else { + dout(15) << "erasing " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } + if (!client_held_lock_counts[old_lock_client]) { + client_held_lock_counts.erase(old_lock_client); + } + } +} + +bool ceph_lock_state_t::remove_all_from (client_t client) +{ + bool cleared_any = false; + if (client_held_lock_counts.count(client)) { + remove_all_from(client, held_locks); + client_held_lock_counts.erase(client); + cleared_any = true; + } + if (client_waiting_lock_counts.count(client)) { + remove_all_from(client, waiting_locks); + client_waiting_lock_counts.erase(client); + } + return cleared_any; +} + +void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks, + ceph_filelock& new_lock, + list<multimap<uint64_t, ceph_filelock>::iterator> + neighbor_locks) +{ + dout(15) << "adjust_locks" << dendl; + bool new_lock_to_end = (0 == new_lock.length); + bool old_lock_to_end; + uint64_t new_lock_start = new_lock.start; + uint64_t new_lock_end = new_lock.start + new_lock.length - 1; + uint64_t old_lock_start, old_lock_end; + __s64 old_lock_client = 0; + ceph_filelock *old_lock; + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = old_locks.begin(); + iter != old_locks.end(); + ++iter) { + old_lock = &(*iter)->second; + dout(15) << "adjusting lock: " << *old_lock << dendl; + old_lock_to_end = (0 == old_lock->length); + old_lock_start = old_lock->start; + old_lock_end = old_lock->start + old_lock->length - 1; + new_lock_start = new_lock.start; + new_lock_end = new_lock.start + new_lock.length - 1; + old_lock_client = old_lock->client; + if (new_lock_to_end || old_lock_to_end) { + //special code path to deal with a length set at 0 + dout(15) << "one lock extends forever" << dendl; + if (old_lock->type == new_lock.type) { + //just unify them in new lock, remove old lock + dout(15) << "same lock type, unifying" << dendl; + new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start : + old_lock_start; + new_lock.length = 0; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } else { //not same type, have to keep any remains of old lock around + dout(15) << "shrinking old lock" << dendl; + if (new_lock_to_end) { + if (old_lock_start < new_lock_start) { + old_lock->length = new_lock_start - old_lock_start; + } else { + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } else { //old lock extends past end of new lock + ceph_filelock appended_lock = *old_lock; + appended_lock.start = new_lock_end + 1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (appended_lock.start, appended_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + if (old_lock_start < new_lock_start) { + old_lock->length = new_lock_start - old_lock_start; + } else { + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } + } + } else { + if (old_lock->type == new_lock.type) { //just merge them! + dout(15) << "merging locks, they're the same type" << dendl; + new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start : + new_lock_start; + int new_end = (new_lock_end > old_lock_end) ? new_lock_end : + old_lock_end; + new_lock.length = new_end - new_lock.start + 1; + dout(15) << "erasing lock " << (*iter)->second << dendl; + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } else { //we'll have to update sizes and maybe make new locks + dout(15) << "locks aren't same type, changing sizes" << dendl; + if (old_lock_end > new_lock_end) { //add extra lock after new_lock + ceph_filelock appended_lock = *old_lock; + appended_lock.start = new_lock_end + 1; + appended_lock.length = old_lock_end - appended_lock.start + 1; + held_locks.insert(pair<uint64_t, ceph_filelock> + (appended_lock.start, appended_lock)); + ++client_held_lock_counts[(client_t)old_lock->client]; + } + if (old_lock_start < new_lock_start) { + old_lock->length = new_lock_start - old_lock_start; + } else { //old_lock starts inside new_lock, so remove it + //if it extended past new_lock_end it's been replaced + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + } + } + if (!client_held_lock_counts[old_lock_client]) { + client_held_lock_counts.erase(old_lock_client); + } + } + + //make sure to coalesce neighboring locks + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = neighbor_locks.begin(); + iter != neighbor_locks.end(); + ++iter) { + old_lock = &(*iter)->second; + old_lock_client = old_lock->client; + dout(15) << "lock to coalesce: " << *old_lock << dendl; + /* because if it's a neibhoring lock there can't be any self-overlapping + locks that covered it */ + if (old_lock->type == new_lock.type) { //merge them + if (0 == new_lock.length) { + if (old_lock->start + old_lock->length == new_lock.start) { + new_lock.start = old_lock->start; + } else assert(0); /* if there's no end to new_lock, the neighbor + HAS TO be to left side */ + } else if (0 == old_lock->length) { + if (new_lock.start + new_lock.length == old_lock->start) { + new_lock.length = 0; + } else assert(0); //same as before, but reversed + } else { + if (old_lock->start + old_lock->length == new_lock.start) { + new_lock.start = old_lock->start; + new_lock.length = old_lock->length + new_lock.length; + } else if (new_lock.start + new_lock.length == old_lock->start) { + new_lock.length = old_lock->length + new_lock.length; + } + } + held_locks.erase(*iter); + --client_held_lock_counts[old_lock_client]; + } + if (!client_held_lock_counts[old_lock_client]) { + client_held_lock_counts.erase(old_lock_client); + } + } +} + +void ceph_lock_state_t::remove_all_from(client_t client, + multimap<uint64_t, + ceph_filelock>& locks) +{ + multimap<uint64_t, ceph_filelock>::iterator iter = locks.begin(); + while (iter != locks.end()) { + if ((client_t)iter->second.client == client) { + locks.erase(iter++); + } else ++iter; + } +} + +multimap<uint64_t, ceph_filelock>::iterator +ceph_lock_state_t::get_lower_bound(uint64_t start, + multimap<uint64_t, ceph_filelock>& lock_map) +{ + multimap<uint64_t, ceph_filelock>::iterator lower_bound = + lock_map.lower_bound(start); + if ((lower_bound->first != start) + && (start != 0) + && (lower_bound != lock_map.begin())) --lower_bound; + if (lock_map.end() == lower_bound) + dout(15) << "get_lower_dout(15)eturning end()" << dendl; + else dout(15) << "get_lower_bound returning iterator pointing to " + << lower_bound->second << dendl; + return lower_bound; + } + +multimap<uint64_t, ceph_filelock>::iterator +ceph_lock_state_t::get_last_before(uint64_t end, + multimap<uint64_t, ceph_filelock>& lock_map) +{ + multimap<uint64_t, ceph_filelock>::iterator last = + lock_map.upper_bound(end); + if (last != lock_map.begin()) --last; + if (lock_map.end() == last) + dout(15) << "get_last_before returning end()" << dendl; + else dout(15) << "get_last_before returning iterator pointing to " + << last->second << dendl; + return last; +} + +bool ceph_lock_state_t::share_space( + multimap<uint64_t, ceph_filelock>::iterator& iter, + uint64_t start, uint64_t end) +{ + bool ret = ((iter->first >= start && iter->first <= end) || + ((iter->first < start) && + (((iter->first + iter->second.length - 1) >= start) || + (0 == iter->second.length)))); + dout(15) << "share_space got start: " << start << ", end: " << end + << ", lock: " << iter->second << ", returning " << ret << dendl; + return ret; +} + +bool ceph_lock_state_t::get_overlapping_locks(ceph_filelock& lock, + list<multimap<uint64_t, + ceph_filelock>::iterator> & overlaps, + list<multimap<uint64_t, + ceph_filelock>::iterator> *self_neighbors) +{ + dout(15) << "get_overlapping_locks" << dendl; + // create a lock starting one earlier and ending one later + // to check for neighbors + ceph_filelock neighbor_check_lock = lock; + if (neighbor_check_lock.start != 0) { + neighbor_check_lock.start = neighbor_check_lock.start - 1; + if (neighbor_check_lock.length) + neighbor_check_lock.length = neighbor_check_lock.length + 2; + } else { + if (neighbor_check_lock.length) + neighbor_check_lock.length = neighbor_check_lock.length + 1; + } + //find the last held lock starting at the point after lock + uint64_t endpoint = lock.start; + if (lock.length) { + endpoint += lock.length; + } else { + endpoint = uint64_t(-1); // max offset + } + multimap<uint64_t, ceph_filelock>::iterator iter = + get_last_before(endpoint, held_locks); + bool cont = iter != held_locks.end(); + while(cont) { + if (share_space(iter, lock)) { + overlaps.push_front(iter); + } else if (self_neighbors && + (neighbor_check_lock.client == iter->second.client) && + (neighbor_check_lock.pid == iter->second.pid) && + share_space(iter, neighbor_check_lock)) { + self_neighbors->push_front(iter); + } + if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) { + //can't be any more overlapping locks or they'd interfere with this one + cont = false; + } else if (held_locks.begin() == iter) cont = false; + else --iter; + } + return !overlaps.empty(); +} + +bool ceph_lock_state_t::get_waiting_overlaps(ceph_filelock& lock, + list<multimap<uint64_t, + ceph_filelock>::iterator>& + overlaps) +{ + dout(15) << "get_waiting_overlaps" << dendl; + multimap<uint64_t, ceph_filelock>::iterator iter = + get_last_before(lock.start + lock.length - 1, waiting_locks); + bool cont = iter != waiting_locks.end(); + while(cont) { + if (share_space(iter, lock)) overlaps.push_front(iter); + if (waiting_locks.begin() == iter) cont = false; + --iter; + } + return !overlaps.empty(); +} + +void ceph_lock_state_t::split_by_owner(ceph_filelock& owner, + list<multimap<uint64_t, + ceph_filelock>::iterator>& locks, + list<multimap<uint64_t, + ceph_filelock>::iterator>& + owned_locks) +{ + list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = locks.begin(); + dout(15) << "owner lock: " << owner << dendl; + while (iter != locks.end()) { + dout(15) << "comparing to " << (*iter)->second << dendl; + if ((*iter)->second.client == owner.client && + (*iter)->second.pid_namespace == owner.pid_namespace && + (*iter)->second.pid == owner.pid) { + dout(15) << "success, pushing to owned_locks" << dendl; + owned_locks.push_back(*iter); + iter = locks.erase(iter); + } else { + dout(15) << "failure, something not equal in this group " + << (*iter)->second.client << ":" << owner.client << "," + << (*iter)->second.pid_namespace << ":" << owner.pid_namespace + << "," << (*iter)->second.pid << ":" << owner.pid << dendl; + ++iter; + } + } +} + +ceph_filelock * +ceph_lock_state_t::contains_exclusive_lock(list<multimap<uint64_t, + ceph_filelock>::iterator>& locks) +{ + for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator + iter = locks.begin(); + iter != locks.end(); + ++iter) { + if (CEPH_LOCK_EXCL == (*iter)->second.type) return &(*iter)->second; + } + return NULL; +} diff --git a/src/mds/flock.h b/src/mds/flock.h index 69228b46c90..133feabee66 100644 --- a/src/mds/flock.h +++ b/src/mds/flock.h @@ -26,239 +26,63 @@ inline bool operator==(ceph_filelock& l, ceph_filelock& r) { l.type == r.type; } -struct ceph_lock_state_t { +class ceph_lock_state_t { +public: multimap<uint64_t, ceph_filelock> held_locks; // current locks multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks // both of the above are keyed by starting offset map<client_t, int> client_held_lock_counts; map<client_t, int> client_waiting_lock_counts; - bool is_waiting(ceph_filelock &fl) { - multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start); - while (p != waiting_locks.end()) { - if (p->second.start > fl.start) - return false; - if (p->second.length == fl.length && - p->second.client == fl.client && - p->second.pid == fl.pid && - p->second.pid_namespace == fl.pid_namespace) - return true; - ++p; - } - return false; - } - void remove_waiting(ceph_filelock& fl) { - multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start); - while (p != waiting_locks.end()) { - if (p->second.start > fl.start) - return; - if (p->second.length == fl.length && - p->second.client == fl.client && - p->second.pid == fl.pid && - p->second.pid_namespace == fl.pid_namespace) { - waiting_locks.erase(p); - return; - } - ++p; - } - } + /** + * Check if a lock is on the waiting_locks list. + * + * @param fl The filelock to check for + * @returns True if the lock is waiting, false otherwise + */ + bool is_waiting(ceph_filelock &fl); + /** + * Remove a lock from the waiting_locks list + * + * @param fl The filelock to remove + */ + void remove_waiting(ceph_filelock& fl); /* * Try to set a new lock. If it's blocked and wait_on_fail is true, * add the lock to waiting_locks. * The lock needs to be of type CEPH_LOCK_EXCL or CEPH_LOCK_SHARED. + * This may merge previous locks, or convert the type of already-owned + * locks. * - * If we already added ourselves to waiting_locks, did_wait will be - * true. If did_wait==true and we're not on the list, that means we - * were canceled and we should return an error. + * @param new_lock The lock to set + * @param wait_on_fail whether to wait until the lock can be set. + * Otherwise it fails immediately when blocked. * - * Returns true if set, false if not set. + * @returns true if set, false if not set. */ - bool add_lock(ceph_filelock& new_lock, bool wait_on_fail) { - dout(15) << "add_lock " << new_lock << dendl; - bool ret = false; - list<multimap<uint64_t, ceph_filelock>::iterator> - overlapping_locks, self_overlapping_locks, neighbor_locks; - - // first, get any overlapping locks and split them into owned-by-us and not - if (get_overlapping_locks(new_lock, overlapping_locks, &neighbor_locks)) { - dout(15) << "got overlapping lock, splitting by owner" << dendl; - split_by_owner(new_lock, overlapping_locks, self_overlapping_locks); - } - if (!overlapping_locks.empty()) { //overlapping locks owned by others :( - if (CEPH_LOCK_EXCL == new_lock.type) { - //can't set, we want an exclusive - dout(15) << "overlapping lock, and this lock is exclusive, can't set" - << dendl; - if (wait_on_fail) { - waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); - } - } else { //shared lock, check for any exclusive locks blocking us - if (contains_exclusive_lock(overlapping_locks)) { //blocked :( - dout(15) << " blocked by exclusive lock in overlapping_locks" << dendl; - if (wait_on_fail) { - waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); - } - } else { - //yay, we can insert a shared lock - dout(15) << "inserting shared lock" << dendl; - adjust_locks(self_overlapping_locks, new_lock, neighbor_locks); - held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock)); - ret = true; - } - } - } else { //no overlapping locks except our own - adjust_locks(self_overlapping_locks, new_lock, neighbor_locks); - dout(15) << "no conflicts, inserting " << new_lock << dendl; - held_locks.insert(pair<uint64_t, ceph_filelock> - (new_lock.start, new_lock)); - ret = true; - } - if (ret) - ++client_held_lock_counts[(client_t)new_lock.client]; - else if (wait_on_fail) - ++client_waiting_lock_counts[(client_t)new_lock.client]; - return ret; - } - - void look_for_lock(ceph_filelock& testing_lock) { - list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks, - self_overlapping_locks; - if (get_overlapping_locks(testing_lock, overlapping_locks)) { - split_by_owner(testing_lock, overlapping_locks, self_overlapping_locks); - } - if (!overlapping_locks.empty()) { //somebody else owns overlapping lock - if (CEPH_LOCK_EXCL == testing_lock.type) { //any lock blocks it - testing_lock = (*overlapping_locks.begin())->second; - } else { - ceph_filelock *blocking_lock; - if ((blocking_lock = contains_exclusive_lock(overlapping_locks))) { - testing_lock = *blocking_lock; - } else { //nothing blocking! - testing_lock.type = CEPH_LOCK_UNLOCK; - } - } - return; - } - //if we get here, only our own locks block - testing_lock.type = CEPH_LOCK_UNLOCK; - } + bool add_lock(ceph_filelock& new_lock, bool wait_on_fail); + /** + * See if a lock is blocked by existing locks. If the lock is blocked, + * it will be set to the value of the first blocking lock. Otherwise, + * it will be returned unchanged, except for setting the type field + * to CEPH_LOCK_UNLOCK. + * + * @param testing_lock The lock to check for conflicts on. + */ + void look_for_lock(ceph_filelock& testing_lock); /* * Remove lock(s) described in old_lock. This may involve splitting a * previous lock or making a previous lock smaller. + * + * @param removal_lock The lock to remove + * @param activated_locks A return parameter, holding activated wait locks. */ void remove_lock(ceph_filelock removal_lock, - list<ceph_filelock>& activated_locks) { - list<multimap<uint64_t, ceph_filelock>::iterator> overlapping_locks, - self_overlapping_locks, crossed_waiting_locks; - if (get_overlapping_locks(removal_lock, overlapping_locks)) { - dout(15) << "splitting by owner" << dendl; - split_by_owner(removal_lock, overlapping_locks, self_overlapping_locks); - } else dout(15) << "attempt to remove lock at " << removal_lock.start - << " but no locks there!" << dendl; - bool remove_to_end = (0 == removal_lock.length); - bool old_lock_to_end; - uint64_t removal_start = removal_lock.start; - uint64_t removal_end = removal_start + removal_lock.length - 1; - uint64_t old_lock_end; - __s64 old_lock_client = 0; - ceph_filelock *old_lock; - - dout(15) << "examining " << self_overlapping_locks.size() - << " self-overlapping locks for removal" << dendl; - for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator - iter = self_overlapping_locks.begin(); - iter != self_overlapping_locks.end(); - ++iter) { - dout(15) << "self overlapping lock " << (*iter)->second << dendl; - old_lock = &(*iter)->second; - old_lock_to_end = (0 == old_lock->length); - old_lock_end = old_lock->start + old_lock->length - 1; - old_lock_client = old_lock->client; - if (remove_to_end) { - if (old_lock->start < removal_start) { - old_lock->length = removal_start - old_lock->start; - } else { - dout(15) << "erasing " << (*iter)->second << dendl; - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } - } else if (old_lock_to_end) { - ceph_filelock append_lock = *old_lock; - append_lock.start = removal_end+1; - held_locks.insert(pair<uint64_t, ceph_filelock> - (append_lock.start, append_lock)); - ++client_held_lock_counts[(client_t)old_lock->client]; - if (old_lock->start >= removal_start) { - dout(15) << "erasing " << (*iter)->second << dendl; - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } else old_lock->length = removal_start - old_lock->start; - } else { - if (old_lock_end > removal_end) { - ceph_filelock append_lock = *old_lock; - append_lock.start = removal_end + 1; - append_lock.length = old_lock_end - append_lock.start + 1; - held_locks.insert(pair<uint64_t, ceph_filelock> - (append_lock.start, append_lock)); - ++client_held_lock_counts[(client_t)old_lock->client]; - } - if (old_lock->start < removal_start) { - old_lock->length = removal_start - old_lock->start; - } else { - dout(15) << "erasing " << (*iter)->second << dendl; - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } - } - if (!client_held_lock_counts[old_lock_client]) { - client_held_lock_counts.erase(old_lock_client); - } - } - - /* okay, we've removed the locks, but removing them might allow some - * other waiting locks to come through */ - if (get_waiting_overlaps(removal_lock, crossed_waiting_locks)) { - /*let's do this the SUPER lazy way for now. Should work out something - that's slightly less slow and wasteful, though. - 1) Remove lock from waiting_locks. - 2) attempt to insert lock via add_lock - 3) Add to success list if we get back "true" - - In the future, should probably set this up to detect some - guaranteed blocks and do fewer map lookups. - */ - for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator - iter = crossed_waiting_locks.begin(); - iter != crossed_waiting_locks.end(); - ++iter) { - ceph_filelock cur_lock = (*iter)->second; - waiting_locks.erase(*iter); - --client_waiting_lock_counts[(client_t)cur_lock.client]; - if (!client_waiting_lock_counts[(client_t)cur_lock.client]) { - client_waiting_lock_counts.erase((client_t)cur_lock.client); - } - if (add_lock(cur_lock, true)) - activated_locks.push_back(cur_lock); - } - } - } - - bool remove_all_from (client_t client) { - bool cleared_any = false; - if (client_held_lock_counts.count(client)) { - remove_all_from(client, held_locks); - client_held_lock_counts.erase(client); - cleared_any = true; - } - if (client_waiting_lock_counts.count(client)) { - remove_all_from(client, waiting_locks); - client_waiting_lock_counts.erase(client); - } - return cleared_any; - } + list<ceph_filelock>& activated_locks); + bool remove_all_from(client_t client); private: /** * Adjust old locks owned by a single process so that process can set @@ -270,182 +94,32 @@ private: * This function should only be called once you know the lock will be * inserted, as it DOES adjust new_lock. You can call this function * on an empty list, in which case it does nothing. - * This function does not remove elements from the list, so regard the list + * This function does not remove elements from old_locks, so regard the list * as bad information following function invocation. * - * new_lock: The new lock the process has requested. - * old_locks: list of all locks currently held by same + * @param new_lock The new lock the process has requested. + * @param old_locks list of all locks currently held by same * client/process that overlap new_lock. - * neighbor_locks: locks owned by same process that neighbor new_lock on + * @param neighbor_locks locks owned by same process that neighbor new_lock on * left or right side. */ void adjust_locks(list<multimap<uint64_t, ceph_filelock>::iterator> old_locks, - ceph_filelock& new_lock, - list<multimap<uint64_t, ceph_filelock>::iterator> - neighbor_locks) { - dout(15) << "adjust_locks" << dendl; - bool new_lock_to_end = (0 == new_lock.length); - bool old_lock_to_end; - uint64_t new_lock_start = new_lock.start; - uint64_t new_lock_end = new_lock.start + new_lock.length - 1; - uint64_t old_lock_start, old_lock_end; - __s64 old_lock_client = 0; - ceph_filelock *old_lock; - for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator - iter = old_locks.begin(); - iter != old_locks.end(); - ++iter) { - old_lock = &(*iter)->second; - dout(15) << "adjusting lock: " << *old_lock << dendl; - old_lock_to_end = (0 == old_lock->length); - old_lock_start = old_lock->start; - old_lock_end = old_lock->start + old_lock->length - 1; - new_lock_start = new_lock.start; - new_lock_end = new_lock.start + new_lock.length - 1; - old_lock_client = old_lock->client; - if (new_lock_to_end || old_lock_to_end) { - //special code path to deal with a length set at 0 - dout(15) << "one lock extends forever" << dendl; - if (old_lock->type == new_lock.type) { - //just unify them in new lock, remove old lock - dout(15) << "same lock type, unifying" << dendl; - new_lock.start = (new_lock_start < old_lock_start) ? new_lock_start : - old_lock_start; - new_lock.length = 0; - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } else { //not same type, have to keep any remains of old lock around - dout(15) << "shrinking old lock" << dendl; - if (new_lock_to_end) { - if (old_lock_start < new_lock_start) { - old_lock->length = new_lock_start - old_lock_start; - } else { - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } - } else { //old lock extends past end of new lock - ceph_filelock appended_lock = *old_lock; - appended_lock.start = new_lock_end + 1; - held_locks.insert(pair<uint64_t, ceph_filelock> - (appended_lock.start, appended_lock)); - ++client_held_lock_counts[(client_t)old_lock->client]; - if (old_lock_start < new_lock_start) { - old_lock->length = new_lock_start - old_lock_start; - } else { - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } - } - } - } else { - if (old_lock->type == new_lock.type) { //just merge them! - dout(15) << "merging locks, they're the same type" << dendl; - new_lock.start = (old_lock_start < new_lock_start ) ? old_lock_start : - new_lock_start; - int new_end = (new_lock_end > old_lock_end) ? new_lock_end : - old_lock_end; - new_lock.length = new_end - new_lock.start + 1; - dout(15) << "erasing lock " << (*iter)->second << dendl; - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } else { //we'll have to update sizes and maybe make new locks - dout(15) << "locks aren't same type, changing sizes" << dendl; - if (old_lock_end > new_lock_end) { //add extra lock after new_lock - ceph_filelock appended_lock = *old_lock; - appended_lock.start = new_lock_end + 1; - appended_lock.length = old_lock_end - appended_lock.start + 1; - held_locks.insert(pair<uint64_t, ceph_filelock> - (appended_lock.start, appended_lock)); - ++client_held_lock_counts[(client_t)old_lock->client]; - } - if (old_lock_start < new_lock_start) { - old_lock->length = new_lock_start - old_lock_start; - } else { //old_lock starts inside new_lock, so remove it - //if it extended past new_lock_end it's been replaced - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } - } - } - if (!client_held_lock_counts[old_lock_client]) { - client_held_lock_counts.erase(old_lock_client); - } - } - - //make sure to coalesce neighboring locks - for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator - iter = neighbor_locks.begin(); - iter != neighbor_locks.end(); - ++iter) { - old_lock = &(*iter)->second; - old_lock_client = old_lock->client; - dout(15) << "lock to coalesce: " << *old_lock << dendl; - /* because if it's a neibhoring lock there can't be any self-overlapping - locks that covered it */ - if (old_lock->type == new_lock.type) { //merge them - if (0 == new_lock.length) { - if (old_lock->start + old_lock->length == new_lock.start) { - new_lock.start = old_lock->start; - } else assert(0); /* if there's no end to new_lock, the neighbor - HAS TO be to left side */ - } else if (0 == old_lock->length) { - if (new_lock.start + new_lock.length == old_lock->start) { - new_lock.length = 0; - } else assert(0); //same as before, but reversed - } else { - if (old_lock->start + old_lock->length == new_lock.start) { - new_lock.start = old_lock->start; - new_lock.length = old_lock->length + new_lock.length; - } else if (new_lock.start + new_lock.length == old_lock->start) { - new_lock.length = old_lock->length + new_lock.length; - } - } - held_locks.erase(*iter); - --client_held_lock_counts[old_lock_client]; - } - if (!client_held_lock_counts[old_lock_client]) { - client_held_lock_counts.erase(old_lock_client); - } - } - } + ceph_filelock& new_lock, + list<multimap<uint64_t, ceph_filelock>::iterator> + neighbor_locks); //this won't reset the counter map value, do that yourself - void remove_all_from(client_t client, multimap<uint64_t, ceph_filelock>& locks) { - multimap<uint64_t, ceph_filelock>::iterator iter = locks.begin(); - while (iter != locks.end()) { - if ((client_t)iter->second.client == client) { - locks.erase(iter++); - } else ++iter; - } - } + void remove_all_from(client_t client, + multimap<uint64_t, ceph_filelock>& locks); //get last lock prior to start position multimap<uint64_t, ceph_filelock>::iterator - get_lower_bound(uint64_t start, multimap<uint64_t, ceph_filelock>& lock_map) { - multimap<uint64_t, ceph_filelock>::iterator lower_bound = - lock_map.lower_bound(start); - if ((lower_bound->first != start) - && (start != 0) - && (lower_bound != lock_map.begin())) --lower_bound; - if (lock_map.end() == lower_bound) - dout(15) << "get_lower_dout(15)eturning end()" << dendl; - else dout(15) << "get_lower_bound returning iterator pointing to " - << lower_bound->second << dendl; - return lower_bound; - } - + get_lower_bound(uint64_t start, + multimap<uint64_t, ceph_filelock>& lock_map); //get latest-starting lock that goes over the byte "end" multimap<uint64_t, ceph_filelock>::iterator - get_last_before(uint64_t end, multimap<uint64_t, ceph_filelock>& lock_map) { - multimap<uint64_t, ceph_filelock>::iterator last = - lock_map.upper_bound(end); - if (last != lock_map.begin()) --last; - if (lock_map.end() == last) - dout(15) << "get_last_before returning end()" << dendl; - else dout(15) << "get_last_before returning iterator pointing to " - << last->second << dendl; - return last; - } + get_last_before(uint64_t end, + multimap<uint64_t, ceph_filelock>& lock_map); /* * See if an iterator's lock covers any of the same bounds as a given range @@ -454,25 +128,18 @@ private: * If the length is 0, the lock covers from "start" to the end of the file. */ bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter, - uint64_t start, uint64_t end) { - bool ret = ((iter->first >= start && iter->first <= end) || - ((iter->first < start) && - (((iter->first + iter->second.length - 1) >= start) || - (0 == iter->second.length)))); - dout(15) << "share_space got start: " << start << ", end: " << end - << ", lock: " << iter->second << ", returning " << ret << dendl; - return ret; - } + uint64_t start, uint64_t end); + bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter, - ceph_filelock& lock) { + ceph_filelock &lock) { uint64_t end = lock.start; - if (lock.length) + if (lock.length) { end += lock.length - 1; - else // zero length means to end-of-file + } else { // zero length means end of file end = uint64_t(-1); + } return share_space(iter, lock.start, end); } - /* *get a list of all locks overlapping with the given lock's range * lock: the lock to compare with. @@ -480,47 +147,11 @@ private: * Returns: true if at least one lock overlaps. */ bool get_overlapping_locks(ceph_filelock& lock, - list<multimap<uint64_t, ceph_filelock>::iterator> & overlaps, - list<multimap<uint64_t, ceph_filelock>::iterator> *self_neighbors) { - dout(15) << "get_overlapping_locks" << dendl; - // create a lock starting one earlier and ending one later - // to check for neighbors - ceph_filelock neighbor_check_lock = lock; - if (neighbor_check_lock.start != 0) { - neighbor_check_lock.start = neighbor_check_lock.start - 1; - if (neighbor_check_lock.length) - neighbor_check_lock.length = neighbor_check_lock.length + 2; - } else { - if (neighbor_check_lock.length) - neighbor_check_lock.length = neighbor_check_lock.length + 1; - } - //find the last held lock starting at the point after lock - uint64_t endpoint = lock.start; - if (lock.length) { - endpoint += lock.length; - } else { - endpoint = uint64_t(-1); // max offset - } - multimap<uint64_t, ceph_filelock>::iterator iter = - get_last_before(endpoint, held_locks); - bool cont = iter != held_locks.end(); - while(cont) { - if (share_space(iter, lock)) { - overlaps.push_front(iter); - } else if (self_neighbors && - (neighbor_check_lock.client == iter->second.client) && - (neighbor_check_lock.pid == iter->second.pid) && - share_space(iter, neighbor_check_lock)) { - self_neighbors->push_front(iter); - } - if ((iter->first < lock.start) && (CEPH_LOCK_EXCL == iter->second.type)) { - //can't be any more overlapping locks or they'd interfere with this one - cont = false; - } else if (held_locks.begin() == iter) cont = false; - else --iter; - } - return !overlaps.empty(); - } + list<multimap<uint64_t, + ceph_filelock>::iterator> & overlaps, + list<multimap<uint64_t, + ceph_filelock>::iterator> *self_neighbors); + bool get_overlapping_locks(ceph_filelock& lock, list<multimap<uint64_t, ceph_filelock>::iterator>& overlaps) { @@ -534,20 +165,8 @@ private: * Returns: true if at least one waiting_lock overlaps */ bool get_waiting_overlaps(ceph_filelock& lock, - list<multimap<uint64_t, ceph_filelock>::iterator>& - overlaps) { - dout(15) << "get_waiting_overlaps" << dendl; - multimap<uint64_t, ceph_filelock>::iterator iter = - get_last_before(lock.start + lock.length - 1, waiting_locks); - bool cont = iter != waiting_locks.end(); - while(cont) { - if (share_space(iter, lock)) overlaps.push_front(iter); - if (waiting_locks.begin() == iter) cont = false; - --iter; - } - return !overlaps.empty(); - } - + list<multimap<uint64_t, + ceph_filelock>::iterator>& overlaps); /* * split a list of locks up by whether they're owned by same * process as given lock @@ -557,38 +176,13 @@ private: * owned_locks: an empty list, to be filled with the locks owned by owner */ void split_by_owner(ceph_filelock& owner, - list<multimap<uint64_t, ceph_filelock>::iterator> & locks, - list<multimap<uint64_t, ceph_filelock>::iterator> & owned_locks) { - list<multimap<uint64_t, ceph_filelock>::iterator>::iterator - iter = locks.begin(); - dout(15) << "owner lock: " << owner << dendl; - while (iter != locks.end()) { - dout(15) << "comparing to " << (*iter)->second << dendl; - if ((*iter)->second.client == owner.client && - (*iter)->second.pid_namespace == owner.pid_namespace && - (*iter)->second.pid == owner.pid) { - dout(15) << "success, pushing to owned_locks" << dendl; - owned_locks.push_back(*iter); - iter = locks.erase(iter); - } else { - dout(15) << "failure, something not equal in this group " - << (*iter)->second.client << ":" << owner.client << "," - << (*iter)->second.pid_namespace << ":" << owner.pid_namespace - << "," << (*iter)->second.pid << ":" << owner.pid << dendl; - ++iter; - } - } - } + list<multimap<uint64_t, + ceph_filelock>::iterator> & locks, + list<multimap<uint64_t, + ceph_filelock>::iterator> & owned_locks); - ceph_filelock *contains_exclusive_lock(list<multimap<uint64_t, ceph_filelock>::iterator>& locks) { - for (list<multimap<uint64_t, ceph_filelock>::iterator>::iterator - iter = locks.begin(); - iter != locks.end(); - ++iter) { - if (CEPH_LOCK_EXCL == (*iter)->second.type) return &(*iter)->second; - } - return NULL; - } + ceph_filelock *contains_exclusive_lock(list<multimap<uint64_t, + ceph_filelock>::iterator>& locks); public: void encode(bufferlist& bl) const { diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 70f029740d0..32f597277a1 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -430,6 +430,7 @@ struct inode_t { bool is_truncating() const { return (truncate_pending > 0); } void truncate(uint64_t old_size, uint64_t new_size) { + assert(new_size < old_size); truncate_from = old_size; size = new_size; rstat.rbytes = new_size; diff --git a/src/mkcephfs.in b/src/mkcephfs.in index d4d5092bf93..fa43155c173 100644 --- a/src/mkcephfs.in +++ b/src/mkcephfs.in @@ -437,7 +437,7 @@ if [ $allhosts -eq 1 ]; then do_root_cmd "$0 -d $rdir --prepare-osdfs $name" fi - do_cmd "$0 -d $rdir --init-daemon $name" + do_root_cmd "$0 -d $rdir --init-daemon $name" # collect the key if [ -n "$ssh" ]; then @@ -468,7 +468,7 @@ if [ $allhosts -eq 1 ]; then rdir=$dir fi - do_cmd "$0 -d $rdir --init-daemon $name" + do_root_cmd "$0 -d $rdir --init-daemon $name" done # admin keyring diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 6db2324eebd..abb878dea9d 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1664,16 +1664,21 @@ bool OSDMonitor::prepare_command(MMonCommand *m) i = atoi(m->cmd[2].c_str()); if (i < 0 || i >= osdmap.get_max_osd()) { ss << i << " is not a valid osd id"; - getline(ss, rs); - return -ERANGE; + err = -ERANGE; + goto out; + } + if (osdmap.exists(i)) { + ss << i << " already exists"; + err = -EEXIST; + goto out; } - if (osdmap.exists(i) || - pending_inc.new_up_client.count(i) || + if (pending_inc.new_up_client.count(i) || (pending_inc.new_state.count(i) && (pending_inc.new_state[i] & CEPH_OSD_EXISTS))) { ss << i << " already exists"; getline(ss, rs); - return -EEXIST; + paxos->wait_for_commit(new Monitor::C_Command(mon, m, -EEXIST, rs, paxos->get_version())); + return true; } } else { // allocate a new id diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h index f8846a78cd6..2fca26fb0a6 100644 --- a/src/os/CollectionIndex.h +++ b/src/os/CollectionIndex.h @@ -58,6 +58,9 @@ protected: /// Type of returned paths typedef std::tr1::shared_ptr<Path> IndexedPath; + static const uint32_t FLAT_INDEX_TAG = 0; + static const uint32_t HASH_INDEX_TAG = 1; + static const uint32_t HASH_INDEX_TAG_2 = 2; /** * For tracking Filestore collection versions. * diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 9d3ebef3c75..86281ab9982 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -1268,7 +1268,7 @@ int FileStore::version_stamp_is_valid(uint32_t *version) } bufferptr bp(PATH_MAX); int ret = safe_read(fd, bp.c_str(), bp.length()); - TEMP_FAILURE_RETRY(::close(op_fd)); + TEMP_FAILURE_RETRY(::close(fd)); if (ret < 0) return -errno; bufferlist bl; @@ -1292,7 +1292,7 @@ int FileStore::write_version_stamp() ::encode(on_disk_version, bl); int ret = safe_write(fd, bl.c_str(), bl.length()); - TEMP_FAILURE_RETRY(::close(op_fd)); + TEMP_FAILURE_RETRY(::close(fd)); if (ret < 0) return -errno; return 0; @@ -2985,7 +2985,11 @@ void FileStore::sync_entry() sync_epoch++; dout(15) << "sync_entry committing " << cp << " sync_epoch " << sync_epoch << dendl; - write_op_seq(op_fd, cp); + if (write_op_seq(op_fd, cp) < 0) { + derr << "Error: " << cpp_strerror(errno) + << " during write_op_seq" << dendl; + assert(0); + } bool do_snap = btrfs && g_conf->filestore_btrfs_snap; diff --git a/src/os/FileStore.h b/src/os/FileStore.h index 7f01c897407..d17d5e6b82f 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -39,7 +39,7 @@ using namespace __gnu_cxx; // fake attributes in memory, if we need to. class FileStore : public JournalingObjectStore { - static const uint32_t on_disk_version = 1; + static const uint32_t on_disk_version = 2; string basedir, journalpath; std::string current_fn; std::string current_op_seq_fn; diff --git a/src/os/FlatIndex.h b/src/os/FlatIndex.h index d94edf3f69e..53e27f5ec08 100644 --- a/src/os/FlatIndex.h +++ b/src/os/FlatIndex.h @@ -35,7 +35,7 @@ public: FlatIndex(string base_path) : base_path(base_path) {} /// @see CollectionIndex - uint32_t collection_version() { return 0; } + uint32_t collection_version() { return FLAT_INDEX_TAG; } /// @see CollectionIndex void set_ref(std::tr1::shared_ptr<CollectionIndex> ref); diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h index fb8fde599ab..bbcbc9904ad 100644 --- a/src/os/HashIndex.h +++ b/src/os/HashIndex.h @@ -128,12 +128,13 @@ public: HashIndex( const char *base_path, ///< [in] Path to the index root. int merge_at, ///< [in] Merge threshhold. - int split_at) ///< [in] Split threshhold. - : LFNIndex(base_path), merge_threshold(merge_at), + int split_at, ///< [in] Split threshhold. + uint32_t index_version)///< [in] Index version + : LFNIndex(base_path, index_version), merge_threshold(merge_at), split_threshold(split_at) {} /// @see CollectionIndex - uint32_t collection_version() { return 1; } + uint32_t collection_version() { return index_version; } /// @see CollectionIndex int cleanup(); diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc index 251599b1175..504cd8f5217 100644 --- a/src/os/IndexManager.cc +++ b/src/os/IndexManager.cc @@ -70,7 +70,8 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) { if (r < 0) return r; HashIndex index(path, g_conf->filestore_merge_threshold, - g_conf->filestore_split_multiple); + g_conf->filestore_split_multiple, + CollectionIndex::HASH_INDEX_TAG_2); return index.init(); } @@ -84,15 +85,16 @@ int IndexManager::build_index(coll_t c, const char *path, Index *index) { return r; switch (version) { - case 0: { + case CollectionIndex::FLAT_INDEX_TAG: { *index = Index(new FlatIndex(path), RemoveOnDelete(c, this)); return 0; } - case 1: { + case CollectionIndex::HASH_INDEX_TAG: // fall through + case CollectionIndex::HASH_INDEX_TAG_2: { // Must be a HashIndex *index = Index(new HashIndex(path, g_conf->filestore_merge_threshold, - g_conf->filestore_split_multiple), + g_conf->filestore_split_multiple, version), RemoveOnDelete(c, this)); return 0; } @@ -102,7 +104,8 @@ int IndexManager::build_index(coll_t c, const char *path, Index *index) { } else { // No need to check *index = Index(new HashIndex(path, g_conf->filestore_merge_threshold, - g_conf->filestore_split_multiple), + g_conf->filestore_split_multiple, + CollectionIndex::HASH_INDEX_TAG_2), RemoveOnDelete(c, this)); return 0; } diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc index d6903032e3c..557b69a564d 100644 --- a/src/os/LFNIndex.cc +++ b/src/os/LFNIndex.cc @@ -256,6 +256,23 @@ int LFNIndex::get_mangled_name(const vector<string> &from, return lfn_get_name(from, hoid, mangled_name, 0, exists); } +static int get_hobject_from_oinfo(const char *dir, const char *file, + hobject_t *o) { + char path[PATH_MAX]; + bufferptr bp(PATH_MAX); + snprintf(path, sizeof(path), "%s/%s", dir, file); + // Hack, user.ceph._ is the attribute used to store the object info + int r = do_getxattr(path, "user.ceph._", bp.c_str(), bp.length()); + if (r < 0) + return r; + bufferlist bl; + bl.push_back(bp); + object_info_t oi(bl); + *o = oi.soid; + return 0; +} + + int LFNIndex::list_objects(const vector<string> &to_list, int max_objs, long *handle, map<string, hobject_t> *out) { string to_list_path = get_full_path_subdir(to_list); @@ -295,6 +312,9 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs, if (!lfn_must_hash(long_name)) { assert(long_name == short_name); } + if (index_version == HASH_INDEX_TAG) + get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj); + out->insert(pair<string, hobject_t>(short_name, obj)); ++listed; } else { @@ -413,7 +433,7 @@ int LFNIndex::remove_attr_path(const vector<string> &path, return do_removexattr(full_path.c_str(), mangled_attr_name.c_str()); } -string LFNIndex::lfn_generate_object_name(const hobject_t &hoid) { +string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid) { char s[FILENAME_MAX_LEN]; char *end = s + sizeof(s); char *t = s; @@ -449,7 +469,55 @@ string LFNIndex::lfn_generate_object_name(const hobject_t &hoid) { snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash); return string(s); - } +} + +static void append_escaped(string::const_iterator begin, + string::const_iterator end, + string *out) { + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '\\') { + out->append("\\\\"); + } else if (*i == '/') { + out->append("\\s"); + } else if (*i == '_') { + out->append("\\u"); + } else { + out->append(i, i+1); + } + } +} + +string LFNIndex::lfn_generate_object_name(const hobject_t &hoid) { + if (index_version == HASH_INDEX_TAG) + return lfn_generate_object_name_keyless(hoid); + + string full_name; + string::const_iterator i = hoid.oid.name.begin(); + if (hoid.oid.name.substr(0, 4) == "DIR_") { + full_name.append("\\d"); + i += 4; + } else if (hoid.oid.name[0] == '.') { + full_name.append("\\."); + ++i; + } + append_escaped(i, hoid.oid.name.end(), &full_name); + full_name.append("_"); + append_escaped(hoid.key.begin(), hoid.key.end(), &full_name); + full_name.append("_"); + + char snap_with_hash[PATH_MAX]; + char *t = snap_with_hash; + char *end = t + sizeof(snap_with_hash); + if (hoid.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (hoid.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap); + snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash); + full_name += string(snap_with_hash); + return full_name; +} int LFNIndex::lfn_get_name(const vector<string> &path, const hobject_t &hoid, @@ -487,7 +555,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path, for ( ; ; ++i) { candidate = lfn_get_short_name(hoid, i); candidate_path = get_full_path(path, candidate); - r = do_getxattr(candidate_path.c_str(), LFN_ATTR.c_str(), buf, sizeof(buf)); + r = do_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf)); if (r < 0) { if (errno != ENODATA && errno != ENOENT) return -errno; @@ -528,7 +596,7 @@ int LFNIndex::lfn_created(const vector<string> &path, return 0; string full_path = get_full_path(path, mangled_name); string full_name = lfn_generate_object_name(hoid); - return do_setxattr(full_path.c_str(), LFN_ATTR.c_str(), + return do_setxattr(full_path.c_str(), get_lfn_attr().c_str(), full_name.c_str(), full_name.size()); } @@ -585,15 +653,15 @@ int LFNIndex::lfn_unlink(const vector<string> &path, } int LFNIndex::lfn_translate(const vector<string> &path, - const string &short_name, - hobject_t *out) { + const string &short_name, + hobject_t *out) { if (!lfn_is_hashed_filename(short_name)) { return lfn_parse_object_name(short_name, out); } // Get lfn_attr string full_path = get_full_path(path, short_name); char attr[PATH_MAX]; - int r = do_getxattr(full_path.c_str(), LFN_ATTR.c_str(), attr, sizeof(attr) - 1); + int r = do_getxattr(full_path.c_str(), get_lfn_attr().c_str(), attr, sizeof(attr) - 1); if (r < 0) return -errno; if (r < (int)sizeof(attr)) @@ -666,13 +734,90 @@ static int parse_object(const char *s, hobject_t& o) return 0; } -bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out) { +bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t *out) { bool r = parse_object(long_name.c_str(), *out); if (!r) return r; string temp = lfn_generate_object_name(*out); return r; } +static bool append_unescaped(string::const_iterator begin, + string::const_iterator end, + string *out) { + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '\\') { + ++i; + if (*i == '\\') + out->append("\\"); + else if (*i == 's') + out->append("/"); + else if (*i == 'u') + out->append("_"); + else + return false; + } else { + out->append(i, i+1); + } + } + return true; +} + +bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out) { + if (index_version == HASH_INDEX_TAG) + return lfn_parse_object_name_keyless(long_name, out); + + string::const_iterator current = long_name.begin(); + if (*current == '\\') { + ++current; + if (current == long_name.end()) { + return false; + } else if (*current == 'd') { + out->oid.name.append("DIR_"); + ++current; + } else if (*current == '.') { + out->oid.name.append("."); + ++current; + } else { + --current; + } + } + + string::const_iterator end = current; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &(out->oid.name))) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &(out->key))) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end == long_name.end()) + return false; + string snap(current, end); + + current = ++end; + for ( ; end != long_name.end() && *end != '_'; ++end) ; + if (end != long_name.end()) + return false; + string hash(current, end); + + if (snap == "head") + out->snap = CEPH_NOSNAP; + else if (snap == "snapdir") + out->snap = CEPH_SNAPDIR; + else + out->snap = strtoull(snap.c_str(), NULL, 16); + sscanf(hash.c_str(), "%X", &out->hash); + return true; +} + bool LFNIndex::lfn_is_hashed_filename(const string &name) { if (name.size() < (unsigned)FILENAME_SHORT_LEN) { return 0; @@ -776,13 +921,13 @@ string LFNIndex::demangle_path_component(const string &component) { } int LFNIndex::decompose_full_path(const char *in, vector<string> *out, - hobject_t *hoid, string *shortname) { + hobject_t *hoid, string *shortname) { const char *beginning = in + get_base_path().size(); const char *end = beginning; while (1) { end++; beginning = end++; - for (; *end != '\0' && *end != '/'; ++end); + for ( ; *end != '\0' && *end != '/'; ++end) ; if (*end != '\0') { out->push_back(demangle_path_component(string(beginning, end - beginning))); continue; diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h index dedd8465a28..e0321b45dbf 100644 --- a/src/os/LFNIndex.h +++ b/src/os/LFNIndex.h @@ -76,10 +76,26 @@ class LFNIndex : public CollectionIndex { /// For reference counting the collection @see Path std::tr1::weak_ptr<CollectionIndex> self_ref; +protected: + const uint32_t index_version; + +private: + string lfn_attribute; + public: /// Constructor - LFNIndex(const char *base_path) ///< [in] path to Index root - : base_path(base_path) {} + LFNIndex( + const char *base_path, ///< [in] path to Index root + uint32_t index_version) + : base_path(base_path), index_version(index_version) { + if (index_version == HASH_INDEX_TAG) { + lfn_attribute = LFN_ATTR; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", index_version); + lfn_attribute = LFN_ATTR + string(buf); + } + } /// Virtual destructor virtual ~LFNIndex() {} @@ -306,6 +322,14 @@ protected: private: /* lfn translation functions */ + + /** + * Gets the version specific lfn attribute tag + */ + const string &get_lfn_attr() const { + return lfn_attribute; + } + /** * Gets the filename corresponsing to hoid in path. * @@ -359,11 +383,22 @@ private: ); ///< @return True if short_name is a subdir, false otherwise /// Generate object name + string lfn_generate_object_name_keyless( + const hobject_t &hoid ///< [in] Object for which to generate. + ); ///< @return Generated object name. + + /// Generate object name string lfn_generate_object_name( const hobject_t &hoid ///< [in] Object for which to generate. ); ///< @return Generated object name. /// Parse object name + bool lfn_parse_object_name_keyless( + const string &long_name, ///< [in] Name to parse + hobject_t *out ///< [out] Resulting Object + ); ///< @return True if successfull, False otherwise. + + /// Parse object name bool lfn_parse_object_name( const string &long_name, ///< [in] Name to parse hobject_t *out ///< [out] Resulting Object diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 95adc3ff8ac..85d922cab11 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -193,16 +193,15 @@ static int do_convertfs(ObjectStore *store) g_ceph_context->_conf->filestore_update_collections = true; int r = store->mount(); if (r < 0) - return -r; + return r; uint32_t version; r = store->version_stamp_is_valid(&version); if (r < 0) - return -r; + return r; if (r == 1) { derr << "FileStore is up to date." << dendl; - store->umount(); - return 0; + return store->umount(); } else { derr << "FileStore is old at version " << version << ". Updating..." << dendl; @@ -212,7 +211,7 @@ static int do_convertfs(ObjectStore *store) vector<coll_t> collections; r = store->list_collections(collections); if (r < 0) - return -r; + return r; derr << collections.size() << " to process." << dendl; int processed = 0; @@ -242,8 +241,7 @@ static int do_convertfs(ObjectStore *store) store->sync_and_flush(); store->sync(); cerr << "Version stamp updated, done!" << std::endl; - store->umount(); - return 0; + return store->umount(); } int OSD::convertfs(const std::string &dev, const std::string &jdev) @@ -5069,7 +5067,8 @@ void OSD::handle_op(MOSDOp *op) if ((op->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) { // missing object? - hobject_t head(op->get_oid(), CEPH_NOSNAP, op->get_pg().ps()); + hobject_t head(op->get_oid(), op->get_object_locator().key, + CEPH_NOSNAP, op->get_pg().ps()); if (pg->is_missing_object(head)) { pg->wait_for_missing_object(head, op); pg->unlock(); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index c0f383b6e0f..d77639c2a3b 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2250,6 +2250,7 @@ void PG::read_log(ObjectStore *store) ++i) { if (i->oid == e.soid.oid && i->snap == e.soid.snap) { e.soid.hash = i->hash; + e.soid.key = i->key; found = true; break; } diff --git a/src/osd/PG.h b/src/osd/PG.h index 61e8778fd14..d5f810d5130 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -393,7 +393,7 @@ public: } void encode(bufferlist &bl) const { - __u8 struct_v = 2; + __u8 struct_v = 3; ::encode(struct_v, bl); ::encode(op, bl); ::encode(soid, bl); @@ -417,6 +417,8 @@ public: } else { ::decode(soid, bl); } + if (struct_v < 3) + invalid_hash = true; ::decode(version, bl); ::decode(prior_version, bl); ::decode(reqid, bl); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index b9a50f95936..700b02c01c3 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -404,7 +404,9 @@ void ReplicatedPG::do_op(MOSDOp *op) ObjectContext *obc; bool can_create = op->may_write(); snapid_t snapid; - int r = find_object_context(hobject_t(op->get_oid(), op->get_snapid(), op->get_pg().ps()), + int r = find_object_context(hobject_t(op->get_oid(), + op->get_object_locator().key, + op->get_snapid(), op->get_pg().ps()), op->get_object_locator(), &obc, can_create, &snapid); @@ -416,7 +418,8 @@ void ReplicatedPG::do_op(MOSDOp *op) if (is_primary() || (!(op->get_rmw_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) { // missing the specific snap we need; requeue and wait. assert(!can_create); // only happens on a read - hobject_t soid(op->get_oid(), snapid, op->get_pg().ps()); + hobject_t soid(op->get_oid(), op->get_object_locator().key, + snapid, op->get_pg().ps()); wait_for_missing_object(soid, op); return; } @@ -476,17 +479,19 @@ void ReplicatedPG::do_op(MOSDOp *op) map<hobject_t,ObjectContext*> src_obc; for (vector<OSDOp>::iterator p = op->ops.begin(); p != op->ops.end(); p++) { OSDOp& osd_op = *p; - hobject_t toid(osd_op.soid, op->get_pg().ps()); + hobject_t toid(osd_op.soid, op->get_object_locator().key, op->get_pg().ps()); if (osd_op.soid.oid.name.length()) { if (!src_obc.count(toid)) { ObjectContext *sobc; snapid_t ssnapid; - int r = find_object_context(hobject_t(toid.oid, toid.snap, op->get_pg().ps()), + int r = find_object_context(hobject_t(toid.oid, op->get_object_locator().key, + toid.snap, op->get_pg().ps()), op->get_object_locator(), &sobc, false, &ssnapid); if (r == -EAGAIN) { // missing the specific snap we need; requeue and wait. - hobject_t wait_oid(osd_op.soid.oid, ssnapid, op->get_pg().ps()); + hobject_t wait_oid(osd_op.soid.oid, op->get_object_locator().key, + ssnapid, op->get_pg().ps()); wait_for_missing_object(wait_oid, op); } else if (r) { osd->reply_op_error(op, r); @@ -816,7 +821,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid, // load clone info bufferlist bl; ObjectContext *obc = 0; - int r = find_object_context(hobject_t(coid.oid, sn, coid.hash), + int r = find_object_context(hobject_t(coid.oid, coid.key, sn, coid.hash), OLOC_BLANK, &obc, false, NULL); if (r == -ENOENT || coid.snap != obc->obs.oi.soid.snap) { if (obc) put_object_context(obc); @@ -829,7 +834,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid, // get snap set context if (!obc->ssc) - obc->ssc = get_snapset_context(coid.oid, coid.hash, false); + obc->ssc = get_snapset_context(coid.oid, coid.key, coid.hash, false); SnapSetContext *ssc = obc->ssc; assert(ssc); SnapSet& snapset = ssc->snapset; @@ -926,7 +931,7 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid, // save head snapset dout(10) << coid << " new snapset " << snapset << dendl; - hobject_t snapoid(coid.oid, snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash); + hobject_t snapoid(coid.oid, coid.key, snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash); ctx->snapset_obc = get_object_context(snapoid, coi.oloc, false); assert(ctx->snapset_obc->registered); if (snapset.clones.empty() && !snapset.head_exists) { @@ -1157,7 +1162,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops, ObjectContext *src_obc = 0; if (ceph_osd_op_type_multi(op.op)) { - src_obc = ctx->src_obc[hobject_t(osd_op.soid, soid.hash)]; + src_obc = ctx->src_obc[hobject_t(osd_op.soid, soid.key, soid.hash)]; assert(src_obc); } @@ -1678,8 +1683,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops, result = -EINVAL; break; } - t.clone_range(coll, hobject_t(osd_op.soid, obs.oi.soid.hash), obs.oi.soid, - op.clonerange.src_offset, op.clonerange.length, op.clonerange.offset); + t.clone_range(coll, hobject_t(osd_op.soid, obs.oi.soid.key, + obs.oi.soid.hash), + obs.oi.soid, op.clonerange.src_offset, + op.clonerange.length, op.clonerange.offset); + write_update_size_and_usage(ctx->delta_stats, oi, ssc->snapset, ctx->modified_ranges, op.clonerange.offset, op.clonerange.length, false); @@ -2096,7 +2104,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl; ObjectContext *rollback_to; - int ret = find_object_context(hobject_t(soid.oid, snapid, soid.hash), + int ret = find_object_context(hobject_t(soid.oid, oi.oloc.key, snapid, soid.hash), oi.oloc, &rollback_to, false, &cloneid); if (ret) { if (-ENOENT == ret) { @@ -2109,7 +2117,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) /* a different problem, like degraded pool * with not-yet-restored object. We shouldn't have been able * to get here; recovery should have completed first! */ - hobject_t rollback_target(soid.oid, cloneid, soid.hash); + hobject_t rollback_target(soid.oid, soid.key, cloneid, soid.hash); assert(is_missing_object(rollback_target)); dout(20) << "_rollback_to attempted to roll back to a missing object " << rollback_target << " (requested snapid: ) " << snapid << dendl; @@ -2546,7 +2554,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx) ctx->op_t.setattr(coll, soid, SS_ATTR, bss); if (!head_existed) { // if we logically recreated the head, remove old _snapdir object - hobject_t snapoid(soid.oid, CEPH_SNAPDIR, soid.hash); + hobject_t snapoid(soid.oid, soid.key, CEPH_SNAPDIR, soid.hash); ctx->snapset_obc = get_object_context(snapoid, ctx->new_obs.oi.oloc, false); if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) { @@ -2563,7 +2571,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx) } } else if (ctx->new_snapset.clones.size()) { // save snapset on _snap - hobject_t snapoid(soid.oid, CEPH_SNAPDIR, soid.hash); + hobject_t snapoid(soid.oid, soid.key, CEPH_SNAPDIR, soid.hash); dout(10) << " final snapset " << ctx->new_snapset << " in " << snapoid << dendl; ctx->at_version.version++; @@ -2809,7 +2817,8 @@ void ReplicatedPG::eval_repop(RepGather *repop) reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), 0); reply->add_flags(CEPH_OSD_FLAG_ACK); dout(10) << " sending ack on " << *repop << " " << reply << dendl; - osd->cluster_messenger->send_message(reply, op->get_connection()); + assert(entity_name_t::TYPE_OSD != op->get_connection()->peer_type); + osd->client_messenger->send_message(reply, op->get_connection()); repop->sent_ack = true; } @@ -3017,13 +3026,13 @@ ReplicatedPG::ObjectContext *ReplicatedPG::get_object_context(const hobject_t& s SnapSetContext *ssc = NULL; if (can_create) - ssc = get_snapset_context(soid.oid, soid.hash, true); + ssc = get_snapset_context(soid.oid, soid.key, soid.hash, true); obc = new ObjectContext(oi, true, ssc); } register_object_context(obc); if (can_create && !obc->ssc) - obc->ssc = get_snapset_context(soid.oid, soid.hash, true); + obc->ssc = get_snapset_context(soid.oid, soid.key, soid.hash, true); if (r >= 0) { obc->obs.oi.decode(bv); @@ -3058,7 +3067,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid, snapid_t *psnapid) { // want the head? - hobject_t head(oid.oid, CEPH_NOSNAP, oid.hash); + hobject_t head(oid.oid, oid.key, CEPH_NOSNAP, oid.hash); if (oid.snap == CEPH_NOSNAP) { ObjectContext *obc = get_object_context(head, oloc, can_create); if (!obc) @@ -3067,13 +3076,13 @@ int ReplicatedPG::find_object_context(const hobject_t& oid, *pobc = obc; if (can_create && !obc->ssc) - obc->ssc = get_snapset_context(oid.oid, oid.hash, true); + obc->ssc = get_snapset_context(oid.oid, oid.key, oid.hash, true); return 0; } // we want a snap - SnapSetContext *ssc = get_snapset_context(oid.oid, oid.hash, can_create); + SnapSetContext *ssc = get_snapset_context(oid.oid, oid.key, oid.hash, can_create); if (!ssc) return -ENOENT; @@ -3115,7 +3124,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid, put_snapset_context(ssc); return -ENOENT; } - hobject_t soid(oid.oid, ssc->snapset.clones[k], oid.hash); + hobject_t soid(oid.oid, oid.key, ssc->snapset.clones[k], oid.hash); put_snapset_context(ssc); // we're done with ssc ssc = 0; @@ -3179,7 +3188,9 @@ void ReplicatedPG::put_object_contexts(map<hobject_t,ObjectContext*>& obcv) obcv.clear(); } -ReplicatedPG::SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid, ps_t seed, +ReplicatedPG::SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid, + const string& key, + ps_t seed, bool can_create) { SnapSetContext *ssc; @@ -3188,11 +3199,11 @@ ReplicatedPG::SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& ssc = p->second; } else { bufferlist bv; - hobject_t head(oid, CEPH_NOSNAP, seed); + hobject_t head(oid, key, CEPH_NOSNAP, seed); int r = osd->store->getattr(coll, head, SS_ATTR, bv); if (r < 0) { // try _snapset - hobject_t snapdir(oid, CEPH_SNAPDIR, seed); + hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed); r = osd->store->getattr(coll, snapdir, SS_ATTR, bv); if (r < 0 && !can_create) return NULL; @@ -3602,7 +3613,7 @@ int ReplicatedPG::pull(const hobject_t& soid) } // check snapset - SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false); + SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false); dout(10) << " snapset " << ssc->snapset << dendl; calc_clone_subsets(ssc->snapset, soid, missing, data_subset, clone_subsets); @@ -3710,7 +3721,7 @@ void ReplicatedPG::push_to_replica(ObjectContext *obc, const hobject_t& soid, in return push_start(snapdir, peer); } - SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false); + SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_clone_subsets(ssc->snapset, soid, peer_missing[peer], data_subset, clone_subsets); @@ -3718,7 +3729,7 @@ void ReplicatedPG::push_to_replica(ObjectContext *obc, const hobject_t& soid, in } else if (soid.snap == CEPH_NOSNAP) { // pushing head or unversioned object. // base this on partially on replica's clones? - SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false); + SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_head_subsets(ssc->snapset, soid, peer_missing[peer], data_subset, clone_subsets); put_snapset_context(ssc); @@ -4062,7 +4073,7 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) if (soid.snap && soid.snap < CEPH_NOSNAP) { // clone. make sure we have enough data. - SnapSetContext *ssc = get_snapset_context(soid.oid, soid.hash, false); + SnapSetContext *ssc = get_snapset_context(soid.oid, soid.key, soid.hash, false); assert(ssc); clone_subsets.clear(); // forget what pusher said; recalculate cloning. diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index faadd3f03bf..f9295e4e13c 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -510,7 +510,8 @@ protected: ObjectContext **pobc, bool can_create, snapid_t *psnapid=NULL); - SnapSetContext *get_snapset_context(const object_t& oid, ps_t seed, bool can_create); + SnapSetContext *get_snapset_context(const object_t& oid, const string &key, + ps_t seed, bool can_create); void register_snapset_context(SnapSetContext *ssc) { if (!ssc->registered) { ssc->registered = true; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 75a93ca73fd..2bdf099d0cf 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -574,7 +574,7 @@ ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid, void object_info_t::encode(bufferlist& bl) const { - const __u8 v = 6; + const __u8 v = 7; ::encode(v, bl); ::encode(soid, bl); ::encode(oloc, bl); @@ -603,12 +603,15 @@ void object_info_t::decode(bufferlist::iterator& bl) sobject_t obj; ::decode(obj, bl); ::decode(oloc, bl); - soid = hobject_t(obj.oid, obj.snap, 0); + soid = hobject_t(obj.oid, oloc.key, obj.snap, 0); soid.hash = legacy_object_locator_to_ps(soid.oid, oloc); } else if (v >= 6) { ::decode(soid, bl); ::decode(oloc, bl); + if (v == 6) + soid.key = oloc.key; } + if (v >= 5) ::decode(category, bl); ::decode(version, bl); diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 19ef1224b57..a4dad0ddf1a 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -632,7 +632,6 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start, << dendl; if (objects[poolid].count(oid) == 0) { ldout(cct, 7) << "bh_write_commit no object cache" << dendl; - assert(0); } else { Object *ob = objects[poolid][oid]; @@ -642,7 +641,8 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start, p++) { BufferHead *bh = p->second; - if (bh->start() > start+(loff_t)length) break; + if (bh->start() > start+(loff_t)length) + break; if (bh->start() < start && bh->end() > start+(loff_t)length) { diff --git a/src/rados.cc b/src/rados.cc index 39cc1e2f1a2..8b43bb08375 100644 --- a/src/rados.cc +++ b/src/rados.cc @@ -753,6 +753,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, "KB", "objects", "clones", "degraded", "unfound", "rd", "rd KB", "wr", "wr KB"); } else { + formatter->open_object_section("stats"); formatter->open_array_section("pools"); } for (map<string, librados::stats_map>::iterator c = stats.begin(); c != stats.end(); ++c) { @@ -825,6 +826,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, printf(" total avail %12lld\n", (long long unsigned)tstats.kb_avail); printf(" total space %12lld\n", (long long unsigned)tstats.kb); } else { + formatter->close_section(); formatter->dump_format("total_objects", "%lld", (long long unsigned)tstats.num_objects); formatter->dump_format("total_used", "%lld", (long long unsigned)tstats.kb_used); formatter->dump_format("total_avail", "%lld", (long long unsigned)tstats.kb_avail); diff --git a/src/rgw/rgw_access.h b/src/rgw/rgw_access.h index 63e199df168..9a7a19623a7 100644 --- a/src/rgw/rgw_access.h +++ b/src/rgw/rgw_access.h @@ -232,7 +232,7 @@ public: virtual bool supports_tmap() { return false; } - virtual int tmap_get(rgw_obj& obj, bufferlist& bl) { return -ENOTSUP; } + virtual int tmap_get(rgw_obj& obj, bufferlist& header, std::map<string, bufferlist>& m) { return -ENOTSUP; } virtual int tmap_set(rgw_obj& obj, std::string& key, bufferlist& bl) { return -ENOTSUP; } virtual int tmap_set(rgw_obj& obj, map<std::string, bufferlist>& m) { return -ENOTSUP; } virtual int tmap_create(rgw_obj& obj, std::string& key, bufferlist& bl) { return -ENOTSUP; } diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 7ed07881956..dd6d31f9f85 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -57,11 +57,11 @@ void _usage() cerr << " --uid=<id> user id\n"; cerr << " --subuser=<name> subuser name\n"; cerr << " --access-key=<key> S3 access key\n"; - cerr << " --os-user=<group:name> OpenStack user\n"; + cerr << " --swift-user=<group:name> Swift user\n"; cerr << " --email=<email>\n"; cerr << " --auth_uid=<auid> librados uid\n"; cerr << " --secret=<key> S3 key\n"; - cerr << " --os-secret=<key> OpenStack key\n"; + cerr << " --swift-secret=<key> Swift key\n"; cerr << " --gen-access-key generate random access key\n"; cerr << " --gen-secret generate random secret key\n"; cerr << " --access=<access> Set access permissions for sub-user, should be one\n"; @@ -266,32 +266,76 @@ string escape_str(string& src, char c) return dest; } -static void show_user_info( RGWUserInfo& info) +static void show_user_info(RGWUserInfo& info, const char *format, Formatter *formatter) { map<string, RGWAccessKey>::iterator kiter; map<string, RGWSubUser>::iterator uiter; - cout << "User ID: " << info.user_id << std::endl; - cout << "RADOS UID: " << info.auid << std::endl; - cout << "Keys:" << std::endl; - for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) { - RGWAccessKey& k = kiter->second; - cout << " User: " << info.user_id << (k.subuser.empty() ? "" : ":") << k.subuser << std::endl; - cout << " Access Key: " << k.id << std::endl; - cout << " Secret Key: " << k.key << std::endl; - } - cout << "Users: " << std::endl; - for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) { - RGWSubUser& u = uiter->second; - cout << " Name: " << info.user_id << ":" << u.name << std::endl; - char buf[256]; - perm_to_str(u.perm_mask, buf, sizeof(buf)); - cout << " Permissions: " << buf << std::endl; + + if (!format) { + cout << "User ID: " << info.user_id << std::endl; + cout << "RADOS UID: " << info.auid << std::endl; + cout << "Keys:" << std::endl; + for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) { + RGWAccessKey& k = kiter->second; + cout << " User: " << info.user_id << (k.subuser.empty() ? "" : ":") << k.subuser << std::endl; + cout << " Access Key: " << k.id << std::endl; + cout << " Secret Key: " << k.key << std::endl; + } + cout << "Users: " << std::endl; + for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) { + RGWSubUser& u = uiter->second; + cout << " Name: " << info.user_id << ":" << u.name << std::endl; + char buf[256]; + perm_to_str(u.perm_mask, buf, sizeof(buf)); + cout << " Permissions: " << buf << std::endl; + } + cout << "Display Name: " << info.display_name << std::endl; + cout << "Email: " << info.user_email << std::endl; + cout << "Swift User: " << (info.swift_name.size() ? info.swift_name : "<undefined>")<< std::endl; + cout << "Swift Key: " << (info.swift_key.size() ? info.swift_key : "<undefined>")<< std::endl; + } else { + formatter->open_object_section("user_info"); + + formatter->dump_string("user_id", info.user_id.c_str()); + formatter->dump_format("rados_uid", "%lld", info.auid); + formatter->dump_string("display_name", info.display_name.c_str()); + formatter->dump_string("email", info.user_email.c_str()); + formatter->dump_string("swift_user", info.swift_name.c_str()); + formatter->dump_string("swift_key", info.swift_key.c_str()); + + // keys + formatter->open_array_section("keys"); + for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) { + RGWAccessKey& k = kiter->second; + const char *sep = (k.subuser.empty() ? "" : ":"); + const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str()); + formatter->open_object_section("key"); + formatter->dump_format("user", "%s%s%s", info.user_id.c_str(), sep, subuser); + formatter->dump_string("access_key", k.id); + formatter->dump_string("secret_key", k.key); + formatter->close_section(); + formatter->flush(cout); + } + formatter->close_section(); + + // subusers + formatter->open_array_section("subusers"); + for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) { + RGWSubUser& u = uiter->second; + formatter->open_object_section("user"); + formatter->dump_format("id", "%s:%s", info.user_id.c_str(), u.name.c_str()); + char buf[256]; + perm_to_str(u.perm_mask, buf, sizeof(buf)); + formatter->dump_string("permissions", buf); + formatter->close_section(); + formatter->flush(cout); + } + formatter->close_section(); + + formatter->close_section(); + formatter->flush(cout); } - cout << "Display Name: " << info.display_name << std::endl; - cout << "Email: " << info.user_email << std::endl; - cout << "OpenStack User: " << (info.openstack_name.size() ? info.openstack_name : "<undefined>")<< std::endl; - cout << "OpenStack Key: " << (info.openstack_key.size() ? info.openstack_key : "<undefined>")<< std::endl; } static int create_bucket(string bucket_str, string& user_id, string& display_name, uint64_t auid) @@ -358,11 +402,11 @@ static void remove_old_indexes(RGWUserInfo& old_info, RGWUserInfo new_info) } } - if (!old_info.openstack_name.empty() && - old_info.openstack_name.compare(new_info.openstack_name) != 0) { - ret = rgw_remove_openstack_name_index(new_info.user_id, old_info.openstack_name); + if (!old_info.swift_name.empty() && + old_info.swift_name.compare(new_info.swift_name) != 0) { + ret = rgw_remove_swift_name_index(new_info.user_id, old_info.swift_name); if (ret < 0 && ret != -ENOENT) { - cerr << "ERROR: could not remove index for openstack_name " << old_info.openstack_name << " return code: " << ret << std::endl; + cerr << "ERROR: could not remove index for swift_name " << old_info.swift_name << " return code: " << ret << std::endl; success = false; } } @@ -421,7 +465,12 @@ int process_intent_log(rgw_bucket& bucket, string& oid, time_t epoch, int flags, try { while (!iter.end()) { struct rgw_intent_log_entry entry; - ::decode(entry, iter); + try { + ::decode(entry, iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: " << __func__ << "(): caught buffer::error" << dendl; + return -EIO; + } if (entry.op_time.sec() > epoch) { cerr << "skipping entry for obj=" << obj << " entry.op_time=" << entry.op_time.sec() << " requested epoch=" << epoch << std::endl; cerr << "skipping intent log" << std::endl; // no use to continue @@ -484,7 +533,7 @@ int main(int argc, char **argv) common_init_finish(g_ceph_context); std::string user_id, access_key, secret_key, user_email, display_name; - std::string bucket_name, object, openstack_user, openstack_key; + std::string bucket_name, object, swift_user, swift_key; std::string date, time, subuser, access, format; rgw_bucket bucket; uint32_t perm_mask = 0; @@ -540,9 +589,9 @@ int main(int argc, char **argv) } auid = tmp; } else if (ceph_argparse_witharg(args, i, &val, "--os-user", (char*)NULL)) { - openstack_user = val; + swift_user = val; } else if (ceph_argparse_witharg(args, i, &val, "--os-secret", (char*)NULL)) { - openstack_key = val; + swift_key = val; } else if (ceph_argparse_witharg(args, i, &val, "--date", (char*)NULL)) { date = val; } else if (ceph_argparse_witharg(args, i, &val, "--time", (char*)NULL)) { @@ -654,12 +703,12 @@ int main(int argc, char **argv) cerr << "could not find user by specified access key" << std::endl; } } - if (!found && (!openstack_user.empty())) { - s = openstack_user; - if (rgw_get_user_info_by_openstack(s, info) >= 0) { + if (!found && (!swift_user.empty())) { + s = swift_user; + if (rgw_get_user_info_by_swift(s, info) >= 0) { found = true; } else - cerr << "could not find user by specified openstack username" << std::endl; + cerr << "could not find user by specified swift username" << std::endl; } if (found) user_id = info.user_id.c_str(); @@ -799,10 +848,10 @@ int main(int argc, char **argv) info.user_email = user_email; if (auid != (uint64_t)-1) info.auid = auid; - if (!openstack_user.empty()) - info.openstack_name = openstack_user; - if (!openstack_key.empty()) - info.openstack_key = openstack_key; + if (!swift_user.empty()) + info.swift_name = swift_user; + if (!swift_key.empty()) + info.swift_key = swift_key; if (!subuser.empty()) { RGWSubUser u; u.name = subuser; @@ -817,7 +866,7 @@ int main(int argc, char **argv) remove_old_indexes(old_info, info); - show_user_info(info); + show_user_info(info, format.c_str(), formatter); break; case OPT_SUBUSER_RM: @@ -828,7 +877,7 @@ int main(int argc, char **argv) cerr << "error storing user info: " << cpp_strerror(-err) << std::endl; break; } - show_user_info(info); + show_user_info(info, format.c_str(), formatter); break; case OPT_KEY_RM: @@ -843,11 +892,11 @@ int main(int argc, char **argv) break; } } - show_user_info(info); + show_user_info(info, format.c_str(), formatter); break; case OPT_USER_INFO: - show_user_info(info); + show_user_info(info, format.c_str(), formatter); break; } @@ -859,7 +908,12 @@ int main(int argc, char **argv) RGWAccessControlPolicy policy; if (ret >= 0) { bufferlist::iterator iter = bl.begin(); - policy.decode(iter); + try { + policy.decode(iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: caught buffer::error, could not decode policy" << dendl; + return -EIO; + } policy.to_xml(cout); cout << std::endl; } diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index a1c440fee3b..94eebd8f045 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -15,8 +15,6 @@ static rgw_bucket pi_buckets(BUCKETS_POOL_NAME); static string avail_pools = ".pools.avail"; static string pool_name_prefix = "p"; -#define POOLS_PREALLOCATE_NUM 100 - int rgw_store_bucket_info(string& bucket_name, RGWBucketInfo& info) { @@ -49,7 +47,12 @@ int rgw_get_bucket_info(string& bucket_name, RGWBucketInfo& info) } bufferlist::iterator iter = bl.begin(); - ::decode(info, iter); + try { + ::decode(info, iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl; + return -EIO; + } RGW_LOG(0) << "rgw_get_bucket_info: bucket=" << info.bucket << dendl; @@ -64,11 +67,11 @@ int rgw_remove_bucket_info(string& bucket_name) return ret; } -static int generate_preallocated_pools(vector<string>& pools) +static int generate_preallocated_pools(vector<string>& pools, int num) { vector<string> names; - for (int i = 0; i < POOLS_PREALLOCATE_NUM; i++) { + for (int i = 0; i < num; i++) { string name = pool_name_prefix; append_rand_alpha(pool_name_prefix, name, 8); names.push_back(name); @@ -98,18 +101,8 @@ static int generate_preallocated_pools(vector<string>& pools) return 0; } -static int generate_pool(string& bucket_name, rgw_bucket& bucket) +static int register_available_pools(vector<string>& pools) { - vector<string> pools; - int ret = generate_preallocated_pools(pools); - if (ret < 0) { - RGW_LOG(0) << "generate_preallocad_pools returned " << ret << dendl; - return ret; - } - bucket.pool = pools.back(); - pools.pop_back(); - bucket.name = bucket_name; - map<string, bufferlist> m; vector<string>::iterator iter; @@ -119,7 +112,7 @@ static int generate_pool(string& bucket_name, rgw_bucket& bucket) m[name] = bl; } rgw_obj obj(pi_buckets, avail_pools); - ret = rgwstore->tmap_set(obj, m); + int ret = rgwstore->tmap_set(obj, m); if (ret == -ENOENT) { rgw_bucket new_bucket; map<string,bufferlist> attrs; @@ -136,6 +129,26 @@ static int generate_pool(string& bucket_name, rgw_bucket& bucket) return 0; } +static int generate_pool(string& bucket_name, rgw_bucket& bucket) +{ + vector<string> pools; + int ret = generate_preallocated_pools(pools, g_conf->rgw_pools_preallocate_max); + if (ret < 0) { + RGW_LOG(0) << "generate_preallocad_pools returned " << ret << dendl; + return ret; + } + bucket.pool = pools.back(); + pools.pop_back(); + bucket.name = bucket_name; + + ret = register_available_pools(pools); + if (ret < 0) { + return ret; + } + + return 0; +} + static int withdraw_pool(string& pool_name) { rgw_obj obj(pi_buckets, avail_pools); @@ -143,15 +156,45 @@ static int withdraw_pool(string& pool_name) return rgwstore->tmap_set(obj, pool_name, bl); } +int rgw_bucket_maintain_pools() +{ + bufferlist header; + map<string, bufferlist> m; + string pool_name; + + rgw_obj obj(pi_buckets, avail_pools); + int ret = rgwstore->tmap_get(obj, header, m); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + + if ((int)m.size() < g_conf->rgw_pools_preallocate_threshold) { + RGW_LOG(0) << "rgw_bucket_maintain_pools allocating pools (m.size()=" << m.size() << " threshold=" + << g_conf->rgw_pools_preallocate_threshold << ")" << dendl; + vector<string> pools; + ret = generate_preallocated_pools(pools, g_conf->rgw_pools_preallocate_max - m.size()); + if (ret < 0) { + RGW_LOG(0) << "failed to preallocate pools" << dendl; + return ret; + } + ret = register_available_pools(pools); + if (ret < 0) { + RGW_LOG(0) << "failed to register available pools" << dendl; + return ret; + } + } + + return 0; +} + int rgw_bucket_allocate_pool(string& bucket_name, rgw_bucket& bucket) { - bufferlist bl; bufferlist header; map<string, bufferlist> m; string pool_name; rgw_obj obj(pi_buckets, avail_pools); - int ret = rgwstore->tmap_get(obj, bl); + int ret = rgwstore->tmap_get(obj, header, m); if (ret < 0) { if (ret == -ENOENT) { return generate_pool(bucket_name, bucket); @@ -159,10 +202,6 @@ int rgw_bucket_allocate_pool(string& bucket_name, rgw_bucket& bucket) return ret; } - bufferlist::iterator iter = bl.begin(); - ::decode(header, iter); - ::decode(m, iter); - if (!m.size()) { return generate_pool(bucket_name, bucket); } diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h index 63c339c1402..91706f56455 100644 --- a/src/rgw/rgw_bucket.h +++ b/src/rgw/rgw_bucket.h @@ -17,6 +17,7 @@ extern int rgw_bucket_allocate_pool(string& bucket_name, rgw_bucket& bucket); extern int rgw_create_bucket(std::string& id, string& bucket_name, rgw_bucket& bucket, map<std::string, bufferlist>& attrs, bool exclusive = true, uint64_t auid = 0); +extern int rgw_bucket_maintain_pools(void); #endif diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h index d5c8c11ed13..8a2ab0dbaa4 100644 --- a/src/rgw/rgw_cache.h +++ b/src/rgw/rgw_cache.h @@ -320,9 +320,12 @@ int RGWCache<T>::watch_cb(int opcode, uint64_t ver, bufferlist& bl) try { bufferlist::iterator iter = bl.begin(); ::decode(info, iter); - } catch (buffer::end_of_buffer *err) { + } catch (buffer::end_of_buffer& err) { RGW_LOG(0) << "ERROR: got bad notification" << dendl; return -EIO; + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: buffer::error" << dendl; + return -EIO; } string name = normal_name(info.obj); diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index abb644efeac..c1db8fa59c6 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -263,7 +263,10 @@ bool verify_permission(RGWAccessControlPolicy *policy, string& uid, int user_per if (!policy) return false; - int acl_perm = policy->get_perm(g_ceph_context, uid, perm) & user_perm_mask; + int policy_perm = policy->get_perm(g_ceph_context, uid, perm); + int acl_perm = policy_perm & user_perm_mask; + + RGW_LOG(10) << " uid=" << uid << " requested perm (type)=" << perm << ", policy perm=" << policy_perm << ", user_perm_mask=" << user_perm_mask << ", acl perm=" << acl_perm << dendl; return (perm == acl_perm); } diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 6f4b77e2094..4f309bc3e1d 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -67,8 +67,8 @@ extern string rgw_obj_category_none; #define RGW_FORMAT_XML 1 #define RGW_FORMAT_JSON 2 -#define RGW_REST_OPENSTACK 0x1 -#define RGW_REST_OPENSTACK_AUTH 0x2 +#define RGW_REST_SWIFT 0x1 +#define RGW_REST_SWIFT_AUTH 0x2 #define RGW_SUSPENDED_USER_AUID (uint64_t)-2 @@ -112,6 +112,7 @@ extern string rgw_obj_category_none; #define ERR_LENGTH_REQUIRED 2011 #define ERR_REQUEST_TIME_SKEWED 2012 #define ERR_USER_SUSPENDED 2100 +#define ERR_INTERNAL_ERROR 2200 typedef void *RGWAccessHandle; @@ -271,8 +272,8 @@ struct RGWUserInfo string user_id; string display_name; string user_email; - string openstack_name; - string openstack_key; + string swift_name; + string swift_key; map<string, RGWAccessKey> access_keys; map<string, RGWSubUser> subusers; __u8 suspended; @@ -295,8 +296,8 @@ struct RGWUserInfo ::encode(secret_key, bl); ::encode(display_name, bl); ::encode(user_email, bl); - ::encode(openstack_name, bl); - ::encode(openstack_key, bl); + ::encode(swift_name, bl); + ::encode(swift_key, bl); ::encode(user_id, bl); ::encode(access_keys, bl); ::encode(subusers, bl); @@ -319,8 +320,8 @@ struct RGWUserInfo } ::decode(display_name, bl); ::decode(user_email, bl); - if (ver >= 3) ::decode(openstack_name, bl); - if (ver >= 4) ::decode(openstack_key, bl); + if (ver >= 3) ::decode(swift_name, bl); + if (ver >= 4) ::decode(swift_key, bl); if (ver >= 5) ::decode(user_id, bl); else @@ -481,7 +482,7 @@ struct req_state { string bucket_name_str; string object_str; - map<string, string> x_amz_map; + map<string, string> x_meta_map; RGWUserInfo user; RGWAccessControlPolicy *acl; diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h index 5d7ee0139f4..c43b24b050d 100644 --- a/src/rgw/rgw_formats.h +++ b/src/rgw/rgw_formats.h @@ -13,7 +13,7 @@ struct plain_stack_entry { }; /* FIXME: this class is mis-named. - * FIXME: This was a hack to send certain openstack messages. + * FIXME: This was a hack to send certain swift messages. * There is a much better way to do this. */ class RGWFormatter_Plain : public Formatter { diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc index 8836c52a2cc..a610e82d8da 100644 --- a/src/rgw/rgw_log.cc +++ b/src/rgw/rgw_log.cc @@ -17,6 +17,7 @@ static void set_param_str(struct req_state *s, const char *name, string& str) int rgw_log_op(struct req_state *s) { struct rgw_log_entry entry; + uint64_t pool_id; if (!s->should_log) return 0; @@ -26,8 +27,13 @@ int rgw_log_op(struct req_state *s) return -EINVAL; } if (s->err.ret == -ERR_NO_SUCH_BUCKET) { - RGW_LOG(0) << "bucket " << s->bucket << " doesn't exist, not logging" << dendl; - return 0; + if (!g_conf->rgw_log_nonexistent_bucket) { + RGW_LOG(0) << "bucket " << s->bucket << " doesn't exist, not logging" << dendl; + return 0; + } + pool_id = 0; + } else { + pool_id = s->pool_id; } entry.bucket = s->bucket_name; @@ -67,7 +73,7 @@ int rgw_log_op(struct req_state *s) entry.http_status = "200"; // default entry.error_code = s->err.s3_code; - entry.pool_id = s->pool_id; + entry.pool_id = pool_id; bufferlist bl; ::encode(entry, bl); diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 3a6b833a524..0a2e108fabc 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -17,14 +17,16 @@ #include "common/config.h" #include "common/errno.h" #include "common/WorkQueue.h" +#include "common/Timer.h" #include "rgw_common.h" #include "rgw_access.h" #include "rgw_acl.h" #include "rgw_user.h" #include "rgw_op.h" #include "rgw_rest.h" -#include "rgw_os.h" +#include "rgw_swift.h" #include "rgw_log.h" +#include "rgw_bucket.h" #include <map> #include <string> @@ -218,6 +220,16 @@ done: RGW_LOG(0) << "====== req done fcgx=" << hex << fcgx << dec << " http_status=" << http_ret << " ======" << dendl; } +class C_RGWMaintenanceTick : public Context { + SafeTimer *timer; +public: + C_RGWMaintenanceTick(SafeTimer *t) : timer(t) {} + void finish(int r) { + rgw_bucket_maintain_pools(); + RGW_LOG(20) << "C_RGWMaintenanceTick::finish()" << dendl; + timer->add_event_after(g_conf->rgw_maintenance_tick_interval, new C_RGWMaintenanceTick(timer)); + } +}; /* * start up the RADOS connection and then handle HTTP messages as they come in */ @@ -253,10 +265,22 @@ int main(int argc, const char **argv) return EIO; } - RGWProcess process(g_ceph_context, 20); + RGWProcess process(g_ceph_context, g_conf->rgw_thread_pool_size); + + Mutex lock("rgw_timer_lock"); + SafeTimer timer(g_ceph_context, lock); + + lock.Lock(); + timer.init(); + timer.add_event_after(g_conf->rgw_maintenance_tick_interval, new C_RGWMaintenanceTick(&timer)); + lock.Unlock(); process.run(); + lock.Lock(); + timer.shutdown(); + lock.Unlock(); + return 0; } diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index b357fb2683a..2fe571d56ed 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -114,7 +114,7 @@ static void format_xattr(std::string &xattr) void get_request_metadata(struct req_state *s, map<string, bufferlist>& attrs) { map<string, string>::iterator iter; - for (iter = s->x_amz_map.begin(); iter != s->x_amz_map.end(); ++iter) { + for (iter = s->x_meta_map.begin(); iter != s->x_meta_map.end(); ++iter) { const string &name(iter->first); string &xattr(iter->second); #define X_AMZ_META "x-amz-meta" @@ -148,7 +148,12 @@ static int get_policy_from_attr(void *ctx, RGWAccessControlPolicy *policy, rgw_o if (ret >= 0) { bufferlist::iterator iter = bl.begin(); - policy->decode(iter); + try { + policy->decode(iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "error: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } if (g_conf->rgw_log >= 15) { RGW_LOG(15) << "Read AccessControlPolicy" << dendl; policy->to_xml(cerr); @@ -327,7 +332,7 @@ int RGWListBuckets::verify_permission() void RGWListBuckets::execute() { - ret = rgw_read_user_buckets(s->user.user_id, buckets, !!(s->prot_flags & RGW_REST_OPENSTACK)); + ret = rgw_read_user_buckets(s->user.user_id, buckets, !!(s->prot_flags & RGW_REST_SWIFT)); if (ret < 0) { /* hmm.. something wrong here.. the user was authenticated, so it should exist, just try to recreate */ @@ -407,7 +412,7 @@ void RGWListBucket::execute() } url_decode(s->args.get("delimiter"), delimiter); - if (s->prot_flags & RGW_REST_OPENSTACK) { + if (s->prot_flags & RGW_REST_SWIFT) { string path_args; url_decode(s->args.get("path"), path_args); if (!path_args.empty()) { @@ -421,7 +426,7 @@ void RGWListBucket::execute() } ret = rgwstore->list_objects(s->user.user_id, s->bucket, max, prefix, delimiter, marker, objs, common_prefixes, - !!(s->prot_flags & RGW_REST_OPENSTACK), no_ns, &is_truncated, NULL); + !!(s->prot_flags & RGW_REST_SWIFT), no_ns, &is_truncated, NULL); done: send_response(); @@ -749,7 +754,6 @@ void RGWPutObj::execute() goto done_err; bl.clear(); - map<string, bufferlist> meta_attrs; RGWUploadPartInfo info; string p = "part."; p.append(part_num); @@ -758,13 +762,10 @@ void RGWPutObj::execute() info.size = s->obj_size; info.modified = ceph_clock_now(g_ceph_context); ::encode(info, bl); - meta_attrs[p] = bl; rgw_obj meta_obj(s->bucket, multipart_meta_obj, s->object_str, mp_ns); - // we don't set a category, since by now a category should have already been assigned - string nocategory; - ret = rgwstore->put_obj_meta(s->obj_ctx, s->user.user_id, meta_obj, NULL, meta_attrs, nocategory, false); + ret = rgwstore->tmap_set(meta_obj, p, bl); } } done: @@ -1191,11 +1192,12 @@ done: } static int get_multiparts_info(struct req_state *s, string& meta_oid, map<uint32_t, RGWUploadPartInfo>& parts, - RGWAccessControlPolicy& policy, map<string, bufferlist>& new_attrs) + RGWAccessControlPolicy& policy, map<string, bufferlist>& attrs) { void *handle; - map<string, bufferlist> attrs; + map<string, bufferlist> parts_map; map<string, bufferlist>::iterator iter; + bufferlist header; rgw_obj obj(s->bucket, meta_oid, s->object_str, mp_ns); @@ -1206,24 +1208,35 @@ static int get_multiparts_info(struct req_state *s, string& meta_oid, map<uint32 if (ret < 0) return ret; + ret = rgwstore->tmap_get(obj, header, parts_map); + if (ret < 0) + return ret; + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { string name = iter->first; if (name.compare(RGW_ATTR_ACL) == 0) { bufferlist& bl = iter->second; bufferlist::iterator bli = bl.begin(); - ::decode(policy, bli); - new_attrs[RGW_ATTR_ACL] = bl; - continue; - } - if (name.compare(0, 5, "part.") != 0) { - new_attrs[iter->first] = iter->second; - continue; + try { + ::decode(policy, bli); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + break; } + } + + for (iter = parts_map.begin(); iter != parts_map.end(); ++iter) { bufferlist& bl = iter->second; bufferlist::iterator bli = bl.begin(); RGWUploadPartInfo info; - ::decode(info, bli); + try { + ::decode(info, bli); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: could not decode policy, caught buffer::error" << dendl; + } parts[info.num] = info; } return 0; @@ -1454,7 +1467,7 @@ void RGWListBucketMultiparts::execute() if (ret < 0) goto done; - if (s->prot_flags & RGW_REST_OPENSTACK) { + if (s->prot_flags & RGW_REST_SWIFT) { string path_args; url_decode(s->args.get("path"), path_args); if (!path_args.empty()) { @@ -1468,7 +1481,7 @@ void RGWListBucketMultiparts::execute() } marker_meta = marker.get_meta(); ret = rgwstore->list_objects(s->user.user_id, s->bucket, max_uploads, prefix, delimiter, marker_meta, objs, common_prefixes, - !!(s->prot_flags & RGW_REST_OPENSTACK), mp_ns, &is_truncated, &mp_filter); + !!(s->prot_flags & RGW_REST_SWIFT), mp_ns, &is_truncated, &mp_filter); if (objs.size()) { vector<RGWObjEnt>::iterator iter; RGWMultipartUploadEntry entry; diff --git a/src/rgw/rgw_os_auth.h b/src/rgw/rgw_os_auth.h deleted file mode 100644 index 4a31b3dca5c..00000000000 --- a/src/rgw/rgw_os_auth.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef CEPH_RGW_OS_AUTH_H -#define CEPH_RGW_OS_AUTH_H - -#include "rgw_op.h" - -#define RGW_OS_TOKEN_EXPIRATION (15 * 60) - -extern int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info); - -class RGW_OS_Auth_Get : public RGWOp { -public: - RGW_OS_Auth_Get() {} - ~RGW_OS_Auth_Get() {} - - int verify_permission() { return 0; } - void execute(); -}; - -class RGWHandler_OS_Auth : public RGWHandler { -public: - RGWHandler_OS_Auth() {} - ~RGWHandler_OS_Auth() {} - RGWOp *get_op(); - void put_op(RGWOp *op); - - int authorize(); - int read_permissions() { return 0; } -}; - -#endif diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index f9bc8e6c5a2..404bddb33fb 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -280,7 +280,12 @@ int RGWRados::list_objects(string& id, rgw_bucket& bucket, int max, string& pref bufferlist& bl = iter->second; bufferlist::iterator i = bl.begin(); RGWAccessControlPolicy policy; - policy.decode_owner(i); + try { + policy.decode_owner(i); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: could not decode policy for oid=" << oid << ", caught buffer::error" << dendl; + continue; + } ACLOwner& owner = policy.get_owner(); obj.owner = owner.get_id(); obj.owner_display_name = owner.get_display_name(); @@ -793,16 +798,26 @@ int RGWRados::get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, librados::IoCtx& io s->exists = true; bufferlist::iterator oiter = outbl.begin(); - ::decode(s->attrset, oiter); + try { + ::decode(s->attrset, oiter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: failed decoding s->attrset (obj=" << obj << "), aborting" << dendl; + return -EIO; + } map<string, bufferlist>::iterator aiter; for (aiter = s->attrset.begin(); aiter != s->attrset.end(); ++aiter) { RGW_LOG(0) << "iter->first=" << aiter->first << dendl; } - ::decode(s->size, oiter); - utime_t ut; - ::decode(ut, oiter); - s->mtime = ut.sec(); + + try { + ::decode(s->size, oiter); + utime_t ut; + ::decode(ut, oiter); + s->mtime = ut.sec(); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: failed decoding object (obj=" << obj << ") info (either size or mtime), aborting" << dendl; + } s->has_attrs = true; map<string, bufferlist>::iterator iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); @@ -1473,8 +1488,9 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, map<string, RGWBucketStats>& return 0; } -int RGWRados::tmap_get(rgw_obj& obj, bufferlist& bl) +int RGWRados::tmap_get(rgw_obj& obj, bufferlist& header, std::map<string, bufferlist>& m) { + bufferlist bl; librados::IoCtx io_ctx; rgw_bucket& bucket = obj.bucket; std::string& oid = obj.object; @@ -1485,8 +1501,19 @@ int RGWRados::tmap_get(rgw_obj& obj, bufferlist& bl) io_ctx.locator_set_key(obj.key); r = io_ctx.tmap_get(oid, bl); + if (r < 0) + return r; - return r; + try { + bufferlist::iterator iter = bl.begin(); + ::decode(header, iter); + ::decode(m, iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: tmap_get failed, caught buffer::error" << dendl; + return -EIO; + } + + return 0; } @@ -1590,13 +1617,13 @@ int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m) int count = 0; map<string, RGWBucketEnt>::iterator iter; - list<string> buckets_list; + list<string> pools_list; for (iter = m.begin(); iter != m.end(); ++iter) { - string bucket_name = iter->first; - buckets_list.push_back(bucket_name); + string pool_name = iter->second.bucket.pool; + pools_list.push_back(pool_name); } map<std::string,librados::stats_map> sm; - int r = rados->get_pool_stats(buckets_list, rgw_obj_category_main, sm); + int r = rados->get_pool_stats(pools_list, rgw_obj_category_main, sm); if (r < 0) return r; @@ -1605,6 +1632,8 @@ int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m) for (miter = sm.begin(), iter = m.begin(); miter != sm.end(), iter != m.end(); ++iter, ++miter) { stats_map stats = miter->second; stats_map::iterator stats_iter = stats.begin(); + if (stats_iter == stats.end()) + continue; string bucket_name = miter->first; RGWBucketEnt& ent = iter->second; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index a78ff5a5d9a..bcc96835e29 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -2,10 +2,12 @@ #define CEPH_RGWRADOS_H #include "include/rados/librados.hpp" +#include "include/Context.h" #include "rgw_access.h" #include "rgw_common.h" class RGWWatcher; +class SafeTimer; struct RGWObjState { bool is_atomic; @@ -77,6 +79,19 @@ class RGWRados : public RGWAccess int set_buckets_auid(vector<rgw_bucket>& buckets, uint64_t auid); + Mutex lock; + SafeTimer *timer; + + class C_Tick : public Context { + RGWRados *rados; + public: + C_Tick(RGWRados *_r) : rados(_r) {} + void finish(int r) { + rados->tick(); + } + }; + + RGWWatcher *watcher; uint64_t watch_handle; librados::IoCtx root_pool_ctx; @@ -106,7 +121,9 @@ class RGWRados : public RGWAccess pair<string, bufferlist> *cmp_xattr); int delete_obj_impl(void *ctx, std::string& id, rgw_obj& src_obj, bool sync); public: - RGWRados() : watcher(NULL), watch_handle(0) {} + RGWRados() : lock("rados_timer_lock"), timer(NULL), watcher(NULL), watch_handle(0) {} + + void tick(); /** Initialize the RADOS instance and prepare to do other ops */ virtual int initialize(CephContext *cct); @@ -228,7 +245,7 @@ public: virtual int get_bucket_id(rgw_bucket& bucket, uint64_t *bucket_id); virtual bool supports_tmap() { return true; } - virtual int tmap_get(rgw_obj& obj, bufferlist& bl); + virtual int tmap_get(rgw_obj& obj, bufferlist& header, std::map<string, bufferlist>& m); virtual int tmap_set(rgw_obj& obj, std::string& key, bufferlist& bl); virtual int tmap_set(rgw_obj& obj, map<std::string, bufferlist>& m); virtual int tmap_create(rgw_obj& obj, std::string& key, bufferlist& bl); diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index 13be0dcaa83..9cf35c555af 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -7,9 +7,9 @@ #include "rgw_formats.h" #include "rgw_op.h" #include "rgw_rest.h" -#include "rgw_rest_os.h" +#include "rgw_rest_swift.h" #include "rgw_rest_s3.h" -#include "rgw_os_auth.h" +#include "rgw_swift_auth.h" #include "rgw_formats.h" @@ -52,6 +52,7 @@ const static struct rgw_html_errors RGW_HTML_ERRORS[] = { { EEXIST, 409, "BucketAlreadyExists" }, { ENOTEMPTY, 409, "BucketNotEmpty" }, { ERANGE, 416, "InvalidRange" }, + { ERR_INTERNAL_ERROR, 500, "InternalError" }, }; void set_req_state_err(struct req_state *s, int err_no) @@ -97,7 +98,7 @@ void dump_content_length(struct req_state *s, size_t len) void dump_etag(struct req_state *s, const char *etag) { - if (s->prot_flags & RGW_REST_OPENSTACK) + if (s->prot_flags & RGW_REST_SWIFT) CGI_PRINTF(s,"etag: %s\n", etag); else CGI_PRINTF(s,"ETag: \"%s\"\n", etag); @@ -428,13 +429,13 @@ void init_entities_from_header(struct req_state *s) pos = req.find('/'); if (pos >= 0) { - const char *openstack_url_prefix = s->env->get("RGW_OPENSTACK_URL_PREFIX"); - bool cut_url = (openstack_url_prefix != NULL); - if (!openstack_url_prefix) - openstack_url_prefix = "v1"; + const char *swift_url_prefix = s->env->get("RGW_SWIFT_URL_PREFIX"); + bool cut_url = (swift_url_prefix != NULL); + if (!swift_url_prefix) + swift_url_prefix = "v1"; first = req.substr(0, pos); - if (first.compare(openstack_url_prefix) == 0) { - s->prot_flags |= RGW_REST_OPENSTACK; + if (first.compare(swift_url_prefix) == 0) { + s->prot_flags |= RGW_REST_SWIFT; if (cut_url) { next_tok(req, first, '/'); } @@ -443,7 +444,7 @@ void init_entities_from_header(struct req_state *s) first = req; } - if (s->prot_flags & RGW_REST_OPENSTACK) { + if (s->prot_flags & RGW_REST_SWIFT) { s->format = 0; delete s->formatter; s->formatter = new RGWFormatter_Plain; @@ -459,7 +460,7 @@ void init_entities_from_header(struct req_state *s) } } - if (s->prot_flags & RGW_REST_OPENSTACK) { + if (s->prot_flags & RGW_REST_SWIFT) { string ver; string auth_key; @@ -495,7 +496,7 @@ void init_entities_from_header(struct req_state *s) } if (strcmp(s->bucket_name, "auth") == 0) - s->prot_flags |= RGW_REST_OPENSTACK_AUTH; + s->prot_flags |= RGW_REST_SWIFT_AUTH; if (pos >= 0) { string encoded_obj_str = req.substr(pos+1); @@ -545,44 +546,44 @@ static void init_auth_info(struct req_state *s) { const char *p; - s->x_amz_map.clear(); + s->x_meta_map.clear(); for (int i=0; (p = s->fcgx->envp[i]); ++i) { #define HTTP_X_AMZ "HTTP_X_AMZ" if (strncmp(p, HTTP_X_AMZ, sizeof(HTTP_X_AMZ) - 1) == 0) { - RGW_LOG(10) << "amz>> " << p << dendl; - const char *amz = p+5; /* skip the HTTP_ part */ - const char *eq = strchr(amz, '='); + RGW_LOG(10) << "meta>> " << p << dendl; + const char *name = p+5; /* skip the HTTP_ part */ + const char *eq = strchr(name, '='); if (!eq) /* shouldn't happen! */ continue; - int len = eq - amz; - char amz_low[len + 1]; + int len = eq - name; + char name_low[len + 1]; int j; for (j=0; j<len; j++) { - amz_low[j] = tolower(amz[j]); - if (amz_low[j] == '_') - amz_low[j] = '-'; + name_low[j] = tolower(name[j]); + if (name_low[j] == '_') + name_low[j] = '-'; } - amz_low[j] = 0; + name_low[j] = 0; string val; line_unfold(eq + 1, val); map<string, string>::iterator iter; - iter = s->x_amz_map.find(amz_low); - if (iter != s->x_amz_map.end()) { + iter = s->x_meta_map.find(name_low); + if (iter != s->x_meta_map.end()) { string old = iter->second; int pos = old.find_last_not_of(" \t"); /* get rid of any whitespaces after the value */ old = old.substr(0, pos + 1); old.append(","); old.append(val); - s->x_amz_map[amz_low] = old; + s->x_meta_map[name_low] = old; } else { - s->x_amz_map[amz_low] = val; + s->x_meta_map[name_low] = val; } } } map<string, string>::iterator iter; - for (iter = s->x_amz_map.begin(); iter != s->x_amz_map.end(); ++iter) { + for (iter = s->x_meta_map.begin(); iter != s->x_meta_map.end(); ++iter) { RGW_LOG(10) << "x>> " << iter->first << ":" << iter->second << dendl; } @@ -808,8 +809,8 @@ RGWOp *RGWHandler_REST::get_op() RGWRESTMgr::RGWRESTMgr() { - m_os_handler = new RGWHandler_REST_OS; - m_os_auth_handler = new RGWHandler_OS_Auth; + m_os_handler = new RGWHandler_REST_SWIFT; + m_os_auth_handler = new RGWHandler_SWIFT_Auth; m_s3_handler = new RGWHandler_REST_S3; } @@ -827,9 +828,9 @@ RGWHandler *RGWRESTMgr::get_handler(struct req_state *s, FCGX_Request *fcgx, *init_error = RGWHandler_REST::preprocess(s, fcgx); - if (s->prot_flags & RGW_REST_OPENSTACK) + if (s->prot_flags & RGW_REST_SWIFT) handler = m_os_handler; - else if (s->prot_flags & RGW_REST_OPENSTACK_AUTH) + else if (s->prot_flags & RGW_REST_SWIFT_AUTH) handler = m_os_auth_handler; else handler = m_s3_handler; diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h index 5bac7e74470..d94d29952d8 100644 --- a/src/rgw/rgw_rest.h +++ b/src/rgw/rgw_rest.h @@ -147,13 +147,13 @@ public: virtual int authorize() = 0; }; -class RGWHandler_REST_OS; -class RGWHandler_OS_Auth; +class RGWHandler_REST_SWIFT; +class RGWHandler_SWIFT_Auth; class RGWHandler_REST_S3; class RGWRESTMgr { - RGWHandler_REST_OS *m_os_handler; - RGWHandler_OS_Auth *m_os_auth_handler; + RGWHandler_REST_SWIFT *m_os_handler; + RGWHandler_SWIFT_Auth *m_os_auth_handler; RGWHandler_REST_S3 *m_s3_handler; public: diff --git a/src/rgw/rgw_rest_os.h b/src/rgw/rgw_rest_os.h deleted file mode 100644 index 3c5c346c345..00000000000 --- a/src/rgw/rgw_rest_os.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef CEPH_RGW_REST_OS_H -#define CEPH_RGW_REST_OS_H -#define TIME_BUF_SIZE 128 - -#include "rgw_op.h" -#include "rgw_rest.h" - -class RGWGetObj_REST_OS : public RGWGetObj_REST { -public: - RGWGetObj_REST_OS() {} - ~RGWGetObj_REST_OS() {} - - int send_response(void *handle); -}; - -class RGWListBuckets_REST_OS : public RGWListBuckets_REST { -public: - RGWListBuckets_REST_OS() {} - ~RGWListBuckets_REST_OS() {} - - void send_response(); -}; - -class RGWListBucket_REST_OS : public RGWListBucket_REST { -public: - RGWListBucket_REST_OS() { - limit_opt_name = "limit"; - default_max = 10000; - } - ~RGWListBucket_REST_OS() {} - - void send_response(); -}; - -class RGWStatBucket_REST_OS : public RGWStatBucket_REST { -public: - RGWStatBucket_REST_OS() {} - ~RGWStatBucket_REST_OS() {} - - void send_response(); -}; - -class RGWCreateBucket_REST_OS : public RGWCreateBucket_REST { -public: - RGWCreateBucket_REST_OS() {} - ~RGWCreateBucket_REST_OS() {} - - void send_response(); -}; - -class RGWDeleteBucket_REST_OS : public RGWDeleteBucket_REST { -public: - RGWDeleteBucket_REST_OS() {} - ~RGWDeleteBucket_REST_OS() {} - - void send_response(); -}; - -class RGWPutObj_REST_OS : public RGWPutObj_REST { -public: - RGWPutObj_REST_OS() {} - ~RGWPutObj_REST_OS() {} - - void send_response(); -}; - -class RGWDeleteObj_REST_OS : public RGWDeleteObj_REST { -public: - RGWDeleteObj_REST_OS() {} - ~RGWDeleteObj_REST_OS() {} - - void send_response(); -}; - -class RGWCopyObj_REST_OS : public RGWCopyObj_REST { -public: - RGWCopyObj_REST_OS() {} - ~RGWCopyObj_REST_OS() {} - - void send_response() {} -}; - -class RGWGetACLs_REST_OS : public RGWGetACLs_REST { -public: - RGWGetACLs_REST_OS() {} - ~RGWGetACLs_REST_OS() {} - - void send_response() {} -}; - -class RGWPutACLs_REST_OS : public RGWPutACLs_REST { -public: - RGWPutACLs_REST_OS() : RGWPutACLs_REST() {} - virtual ~RGWPutACLs_REST_OS() {} - - void send_response() {} -}; - - -class RGWHandler_REST_OS : public RGWHandler_REST { -protected: - - RGWOp *get_retrieve_obj_op(bool get_data); - RGWOp *get_retrieve_op(bool get_data); - RGWOp *get_create_op(); - RGWOp *get_delete_op(); - RGWOp *get_post_op() { return NULL; } - -public: - RGWHandler_REST_OS() : RGWHandler_REST() {} - virtual ~RGWHandler_REST_OS() {} - - int authorize(); -}; - -#endif diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index e74e7b3ccee..4aa379ce4e9 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -489,7 +489,7 @@ static void get_canon_amz_hdr(struct req_state *s, string& dest) { dest = ""; map<string, string>::iterator iter; - for (iter = s->x_amz_map.begin(); iter != s->x_amz_map.end(); ++iter) { + for (iter = s->x_meta_map.begin(); iter != s->x_meta_map.end(); ++iter) { dest.append(iter->first); dest.append(":"); dest.append(iter->second); diff --git a/src/rgw/rgw_rest_os.cc b/src/rgw/rgw_rest_swift.cc index 3c346bb3ff9..694e47fff95 100644 --- a/src/rgw/rgw_rest_os.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -1,11 +1,11 @@ #include "common/Formatter.h" -#include "rgw_os.h" -#include "rgw_rest_os.h" +#include "rgw_swift.h" +#include "rgw_rest_swift.h" #include <sstream> -void RGWListBuckets_REST_OS::send_response() +void RGWListBuckets_REST_SWIFT::send_response() { set_req_state_err(s, ret); dump_errno(s); @@ -57,7 +57,7 @@ void RGWListBuckets_REST_OS::send_response() s->formatter->reset(); } -void RGWListBucket_REST_OS::send_response() +void RGWListBucket_REST_SWIFT::send_response() { set_req_state_err(s, (ret < 0 ? ret : 0)); dump_errno(s); @@ -125,7 +125,7 @@ static void dump_container_metadata(struct req_state *s, RGWBucketEnt& bucket) CGI_PRINTF(s,"X-Container-Bytes-Used: %s\n", buf); } -void RGWStatBucket_REST_OS::send_response() +void RGWStatBucket_REST_SWIFT::send_response() { if (ret >= 0) dump_container_metadata(s, bucket); @@ -139,7 +139,7 @@ void RGWStatBucket_REST_OS::send_response() flush_formatter_to_req_state(s, s->formatter); } -void RGWCreateBucket_REST_OS::send_response() +void RGWCreateBucket_REST_SWIFT::send_response() { if (ret) set_req_state_err(s, ret); @@ -148,7 +148,7 @@ void RGWCreateBucket_REST_OS::send_response() flush_formatter_to_req_state(s, s->formatter); } -void RGWDeleteBucket_REST_OS::send_response() +void RGWDeleteBucket_REST_SWIFT::send_response() { int r = ret; if (!r) @@ -160,7 +160,7 @@ void RGWDeleteBucket_REST_OS::send_response() flush_formatter_to_req_state(s, s->formatter); } -void RGWPutObj_REST_OS::send_response() +void RGWPutObj_REST_SWIFT::send_response() { if (!ret) ret = 201; // "created" @@ -171,7 +171,7 @@ void RGWPutObj_REST_OS::send_response() flush_formatter_to_req_state(s, s->formatter); } -void RGWDeleteObj_REST_OS::send_response() +void RGWDeleteObj_REST_SWIFT::send_response() { int r = ret; if (!r) @@ -183,7 +183,7 @@ void RGWDeleteObj_REST_OS::send_response() flush_formatter_to_req_state(s, s->formatter); } -int RGWGetObj_REST_OS::send_response(void *handle) +int RGWGetObj_REST_SWIFT::send_response(void *handle) { const char *content_type = NULL; int orig_ret = ret; @@ -239,14 +239,14 @@ send_data: return 0; } -RGWOp *RGWHandler_REST_OS::get_retrieve_obj_op(bool get_data) +RGWOp *RGWHandler_REST_SWIFT::get_retrieve_obj_op(bool get_data) { if (is_acl_op()) { - return new RGWGetACLs_REST_OS; + return new RGWGetACLs_REST_SWIFT; } if (s->object) { - RGWGetObj_REST_OS *get_obj_op = new RGWGetObj_REST_OS; + RGWGetObj_REST_SWIFT *get_obj_op = new RGWGetObj_REST_SWIFT; get_obj_op->set_get_data(get_data); return get_obj_op; } else if (!s->bucket_name) { @@ -254,54 +254,56 @@ RGWOp *RGWHandler_REST_OS::get_retrieve_obj_op(bool get_data) } if (get_data) - return new RGWListBucket_REST_OS; + return new RGWListBucket_REST_SWIFT; else - return new RGWStatBucket_REST_OS; + return new RGWStatBucket_REST_SWIFT; } -RGWOp *RGWHandler_REST_OS::get_retrieve_op(bool get_data) +RGWOp *RGWHandler_REST_SWIFT::get_retrieve_op(bool get_data) { if (s->bucket_name) { if (is_acl_op()) { - return new RGWGetACLs_REST_OS; + return new RGWGetACLs_REST_SWIFT; } return get_retrieve_obj_op(get_data); } - return new RGWListBuckets_REST_OS; + return new RGWListBuckets_REST_SWIFT; } -RGWOp *RGWHandler_REST_OS::get_create_op() +RGWOp *RGWHandler_REST_SWIFT::get_create_op() { if (is_acl_op()) { - return new RGWPutACLs_REST_OS; + return new RGWPutACLs_REST_SWIFT; } else if (s->object) { if (!s->copy_source) - return new RGWPutObj_REST_OS; + return new RGWPutObj_REST_SWIFT; else - return new RGWCopyObj_REST_OS; + return new RGWCopyObj_REST_SWIFT; } else if (s->bucket_name) { - return new RGWCreateBucket_REST_OS; + return new RGWCreateBucket_REST_SWIFT; } return NULL; } -RGWOp *RGWHandler_REST_OS::get_delete_op() +RGWOp *RGWHandler_REST_SWIFT::get_delete_op() { if (s->object) - return new RGWDeleteObj_REST_OS; + return new RGWDeleteObj_REST_SWIFT; else if (s->bucket_name) - return new RGWDeleteBucket_REST_OS; + return new RGWDeleteBucket_REST_SWIFT; return NULL; } -int RGWHandler_REST_OS::authorize() +int RGWHandler_REST_SWIFT::authorize() { bool authorized = rgw_verify_os_token(s); if (!authorized) return -EPERM; + s->perm_mask = RGW_PERM_FULL_CONTROL; + return 0; } diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h new file mode 100644 index 00000000000..c9bb6ff51aa --- /dev/null +++ b/src/rgw/rgw_rest_swift.h @@ -0,0 +1,116 @@ +#ifndef CEPH_RGW_REST_SWIFT_H +#define CEPH_RGW_REST_SWIFT_H +#define TIME_BUF_SIZE 128 + +#include "rgw_op.h" +#include "rgw_rest.h" + +class RGWGetObj_REST_SWIFT : public RGWGetObj_REST { +public: + RGWGetObj_REST_SWIFT() {} + ~RGWGetObj_REST_SWIFT() {} + + int send_response(void *handle); +}; + +class RGWListBuckets_REST_SWIFT : public RGWListBuckets_REST { +public: + RGWListBuckets_REST_SWIFT() {} + ~RGWListBuckets_REST_SWIFT() {} + + void send_response(); +}; + +class RGWListBucket_REST_SWIFT : public RGWListBucket_REST { +public: + RGWListBucket_REST_SWIFT() { + limit_opt_name = "limit"; + default_max = 10000; + } + ~RGWListBucket_REST_SWIFT() {} + + void send_response(); +}; + +class RGWStatBucket_REST_SWIFT : public RGWStatBucket_REST { +public: + RGWStatBucket_REST_SWIFT() {} + ~RGWStatBucket_REST_SWIFT() {} + + void send_response(); +}; + +class RGWCreateBucket_REST_SWIFT : public RGWCreateBucket_REST { +public: + RGWCreateBucket_REST_SWIFT() {} + ~RGWCreateBucket_REST_SWIFT() {} + + void send_response(); +}; + +class RGWDeleteBucket_REST_SWIFT : public RGWDeleteBucket_REST { +public: + RGWDeleteBucket_REST_SWIFT() {} + ~RGWDeleteBucket_REST_SWIFT() {} + + void send_response(); +}; + +class RGWPutObj_REST_SWIFT : public RGWPutObj_REST { +public: + RGWPutObj_REST_SWIFT() {} + ~RGWPutObj_REST_SWIFT() {} + + void send_response(); +}; + +class RGWDeleteObj_REST_SWIFT : public RGWDeleteObj_REST { +public: + RGWDeleteObj_REST_SWIFT() {} + ~RGWDeleteObj_REST_SWIFT() {} + + void send_response(); +}; + +class RGWCopyObj_REST_SWIFT : public RGWCopyObj_REST { +public: + RGWCopyObj_REST_SWIFT() {} + ~RGWCopyObj_REST_SWIFT() {} + + void send_response() {} +}; + +class RGWGetACLs_REST_SWIFT : public RGWGetACLs_REST { +public: + RGWGetACLs_REST_SWIFT() {} + ~RGWGetACLs_REST_SWIFT() {} + + void send_response() {} +}; + +class RGWPutACLs_REST_SWIFT : public RGWPutACLs_REST { +public: + RGWPutACLs_REST_SWIFT() : RGWPutACLs_REST() {} + virtual ~RGWPutACLs_REST_SWIFT() {} + + void send_response() {} +}; + + +class RGWHandler_REST_SWIFT : public RGWHandler_REST { +protected: + + RGWOp *get_retrieve_obj_op(bool get_data); + RGWOp *get_retrieve_op(bool get_data); + RGWOp *get_create_op(); + RGWOp *get_delete_op(); + RGWOp *get_post_op() { return NULL; } + +public: + RGWHandler_REST_SWIFT() : RGWHandler_REST() {} + virtual ~RGWHandler_REST_SWIFT() {} + + int authorize(); +}; + +#endif diff --git a/src/rgw/rgw_os.cc b/src/rgw/rgw_swift.cc index edecf55f70b..29894c00c4a 100644 --- a/src/rgw/rgw_os.cc +++ b/src/rgw/rgw_swift.cc @@ -6,8 +6,8 @@ #include <curl/easy.h> #include "rgw_common.h" -#include "rgw_os.h" -#include "rgw_os_auth.h" +#include "rgw_swift.h" +#include "rgw_swift_auth.h" #include "rgw_user.h" @@ -15,7 +15,7 @@ static size_t read_http_header(void *ptr, size_t size, size_t nmemb, void *_info { size_t len = size * nmemb; char line[len + 1]; - struct rgw_os_auth_info *info = (struct rgw_os_auth_info *)_info; + struct rgw_swift_auth_info *info = (struct rgw_swift_auth_info *)_info; char *s = (char *)ptr, *end = (char *)ptr + len; char *p = line; @@ -56,14 +56,14 @@ static size_t read_http_header(void *ptr, size_t size, size_t nmemb, void *_info return len; } -static int rgw_os_validate_token(const char *token, struct rgw_os_auth_info *info) +static int rgw_swift_validate_token(const char *token, struct rgw_swift_auth_info *info) { CURL *curl_handle; string auth_url = "http://127.0.0.1:11000/token"; char url_buf[auth_url.size() + 1 + strlen(token) + 1]; sprintf(url_buf, "%s/%s", auth_url.c_str(), token); - RGW_LOG(10) << "rgw_os_validate_token url=" << url_buf << dendl; + RGW_LOG(10) << "rgw_swift_validate_token url=" << url_buf << dendl; curl_handle = curl_easy_init(); @@ -83,37 +83,37 @@ static int rgw_os_validate_token(const char *token, struct rgw_os_auth_info *inf bool rgw_verify_os_token(req_state *s) { if (strncmp(s->os_auth_token, "AUTH_rgwtk", 10) == 0) { - int ret = rgw_os_verify_signed_token(s->os_auth_token, s->user); + int ret = rgw_swift_verify_signed_token(s->os_auth_token, s->user); if (ret < 0) return false; return true; } - struct rgw_os_auth_info info; + struct rgw_swift_auth_info info; memset(&info, 0, sizeof(info)); info.status = 401; // start with access denied, validate_token might change that - int ret = rgw_os_validate_token(s->os_auth_token, &info); + int ret = rgw_swift_validate_token(s->os_auth_token, &info); if (ret < 0) return ret; if (!info.user) { - RGW_LOG(0) << "openstack auth didn't authorize a user" << dendl; + RGW_LOG(0) << "swift auth didn't authorize a user" << dendl; return false; } s->os_user = info.user; s->os_groups = info.auth_groups; - string openstack_user = s->os_user; + string swift_user = s->os_user; - RGW_LOG(10) << "openstack user=" << s->os_user << dendl; + RGW_LOG(10) << "swift user=" << s->os_user << dendl; - if (rgw_get_user_info_by_openstack(openstack_user, s->user) < 0) { - RGW_LOG(0) << "couldn't map openstack user" << dendl; + if (rgw_get_user_info_by_swift(swift_user, s->user) < 0) { + RGW_LOG(0) << "couldn't map swift user" << dendl; return false; } diff --git a/src/rgw/rgw_os.h b/src/rgw/rgw_swift.h index 2114ecaa294..16204bcb2d9 100644 --- a/src/rgw/rgw_os.h +++ b/src/rgw/rgw_swift.h @@ -1,11 +1,11 @@ -#ifndef CEPH_RGW_OS_H -#define CEPH_RGW_OS_H +#ifndef CEPH_RGW_SWIFT_H +#define CEPH_RGW_SWIFT_H #include "rgw_common.h" -struct rgw_os_auth_info { +struct rgw_swift_auth_info { int status; char *auth_groups; char *user; diff --git a/src/rgw/rgw_os_auth.cc b/src/rgw/rgw_swift_auth.cc index a382ea6f102..afcd03d68bc 100644 --- a/src/rgw/rgw_os_auth.cc +++ b/src/rgw/rgw_swift_auth.cc @@ -1,4 +1,4 @@ -#include "rgw_os_auth.h" +#include "rgw_swift_auth.h" #include "rgw_rest.h" #include "common/ceph_crypto.h" @@ -8,7 +8,7 @@ using namespace ceph::crypto; -static RGW_OS_Auth_Get rgw_os_auth_get; +static RGW_SWIFT_Auth_Get rgw_swift_auth_get; static int build_token(string& os_user, string& key, uint64_t nonce, utime_t& expiration, bufferlist& bl) { @@ -45,14 +45,14 @@ static int encode_token(string& os_user, string& key, bufferlist& bl) return ret; utime_t expiration = ceph_clock_now(g_ceph_context); - expiration += RGW_OS_TOKEN_EXPIRATION; // 15 minutes + expiration += RGW_SWIFT_TOKEN_EXPIRATION; // 15 minutes ret = build_token(os_user, key, nonce, expiration, bl); return ret; } -int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info) +int rgw_swift_verify_signed_token(const char *token, RGWUserInfo& info) { if (strncmp(token, "AUTH_rgwtk", 10) != 0) return -EINVAL; @@ -83,7 +83,7 @@ int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info) ::decode(os_user, iter); ::decode(nonce, iter); ::decode(expiration, iter); - } catch (buffer::error *err) { + } catch (buffer::error& err) { RGW_LOG(0) << "failed to decode token: caught exception" << dendl; return -EINVAL; } @@ -92,13 +92,13 @@ int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info) return -EPERM; } - if ((ret = rgw_get_user_info_by_openstack(os_user, info)) < 0) + if ((ret = rgw_get_user_info_by_swift(os_user, info)) < 0) return ret; RGW_LOG(10) << "os_user=" << os_user << dendl; bufferlist tok; - ret = build_token(os_user, info.openstack_key, nonce, expiration, tok); + ret = build_token(os_user, info.swift_key, nonce, expiration, tok); if (ret < 0) return ret; @@ -117,23 +117,23 @@ int rgw_os_verify_signed_token(const char *token, RGWUserInfo& info) return 0; } -void RGW_OS_Auth_Get::execute() +void RGW_SWIFT_Auth_Get::execute() { int ret = -EPERM; - RGW_LOG(20) << "RGW_OS_Auth_Get::execute()" << dendl; + RGW_LOG(20) << "RGW_SWIFT_Auth_Get::execute()" << dendl; const char *key = s->env->get("HTTP_X_AUTH_KEY"); const char *user = s->env->get("HTTP_X_AUTH_USER"); - const char *url_prefix = s->env->get("RGW_OPENSTACK_URL_PREFIX"); - const char *os_url = s->env->get("RGW_OPENSTACK_URL"); + const char *url_prefix = s->env->get("RGW_SWIFT_URL_PREFIX"); + const char *os_url = s->env->get("RGW_SWIFT_URL"); string user_str = user; RGWUserInfo info; bufferlist bl; if (!os_url || !url_prefix) { - RGW_LOG(0) << "server is misconfigured, missing RGW_OPENSTACK_URL_PREFIX or RGW_OPENSTACK_URL" << dendl; + RGW_LOG(0) << "server is misconfigured, missing RGW_SWIFT_URL_PREFIX or RGW_SWIFT_URL" << dendl; ret = -EINVAL; goto done; } @@ -141,18 +141,18 @@ void RGW_OS_Auth_Get::execute() if (!key || !user) goto done; - if ((ret = rgw_get_user_info_by_openstack(user_str, info)) < 0) + if ((ret = rgw_get_user_info_by_swift(user_str, info)) < 0) goto done; - if (info.openstack_key.compare(key) != 0) { - RGW_LOG(0) << "RGW_OS_Auth_Get::execute(): bad openstack key" << dendl; + if (info.swift_key.compare(key) != 0) { + RGW_LOG(0) << "RGW_SWIFT_Auth_Get::execute(): bad swift key" << dendl; ret = -EPERM; goto done; } CGI_PRINTF(s, "X-Storage-Url: %s/%s/v1/AUTH_rgw\n", os_url, url_prefix); - if ((ret = encode_token(info.openstack_name, info.openstack_key, bl)) < 0) + if ((ret = encode_token(info.swift_name, info.swift_key, bl)) < 0) goto done; { @@ -170,17 +170,17 @@ done: end_header(s); } -int RGWHandler_OS_Auth::authorize() +int RGWHandler_SWIFT_Auth::authorize() { return 0; } -RGWOp *RGWHandler_OS_Auth::get_op() +RGWOp *RGWHandler_SWIFT_Auth::get_op() { RGWOp *op; switch (s->op) { case OP_GET: - op = &rgw_os_auth_get; + op = &rgw_swift_auth_get; break; default: return NULL; @@ -192,7 +192,7 @@ RGWOp *RGWHandler_OS_Auth::get_op() return op; } -void RGWHandler_OS_Auth::put_op(RGWOp *op) +void RGWHandler_SWIFT_Auth::put_op(RGWOp *op) { } diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h new file mode 100644 index 00000000000..95909a96015 --- /dev/null +++ b/src/rgw/rgw_swift_auth.h @@ -0,0 +1,30 @@ +#ifndef CEPH_RGW_SWIFT_AUTH_H +#define CEPH_RGW_SWIFT_AUTH_H + +#include "rgw_op.h" + +#define RGW_SWIFT_TOKEN_EXPIRATION (15 * 60) + +extern int rgw_swift_verify_signed_token(const char *token, RGWUserInfo& info); + +class RGW_SWIFT_Auth_Get : public RGWOp { +public: + RGW_SWIFT_Auth_Get() {} + ~RGW_SWIFT_Auth_Get() {} + + int verify_permission() { return 0; } + void execute(); +}; + +class RGWHandler_SWIFT_Auth : public RGWHandler { +public: + RGWHandler_SWIFT_Auth() {} + ~RGWHandler_SWIFT_Auth() {} + RGWOp *get_op(); + void put_op(RGWOp *op); + + int authorize(); + int read_permissions() { return 0; } +}; + +#endif diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc index 98317ca2015..f47123144fe 100644 --- a/src/rgw/rgw_user.cc +++ b/src/rgw/rgw_user.cc @@ -15,7 +15,7 @@ using namespace std; static rgw_bucket ui_key_bucket(USER_INFO_POOL_NAME); static rgw_bucket ui_email_bucket(USER_INFO_EMAIL_POOL_NAME); -static rgw_bucket ui_openstack_bucket(USER_INFO_OPENSTACK_POOL_NAME); +static rgw_bucket ui_swift_bucket(USER_INFO_SWIFT_POOL_NAME); static rgw_bucket ui_uid_bucket(USER_INFO_UID_POOL_NAME); static rgw_bucket pi_pool_bucket(POOL_INFO_POOL_NAME); @@ -49,12 +49,12 @@ int rgw_store_user_info(RGWUserInfo& info) int ret; map<string,bufferlist> attrs; - if (info.openstack_name.size()) { - /* check if openstack mapping exists */ + if (info.swift_name.size()) { + /* check if swift mapping exists */ RGWUserInfo inf; - int r = rgw_get_user_info_by_openstack(info.openstack_name, inf); + int r = rgw_get_user_info_by_swift(info.swift_name, inf); if (r >= 0 && inf.user_id.compare(info.user_id) != 0) { - RGW_LOG(0) << "can't store user info, openstack id already mapped to another user" << dendl; + RGW_LOG(0) << "can't store user info, swift id already mapped to another user" << dendl; return -EEXIST; } } @@ -99,8 +99,8 @@ int rgw_store_user_info(RGWUserInfo& info) } } - if (info.openstack_name.size()) - ret = rgw_put_obj(info.user_id, ui_openstack_bucket, info.openstack_name, uid_bl.c_str(), uid_bl.length()); + if (info.swift_name.size()) + ret = rgw_put_obj(info.user_id, ui_swift_bucket, info.swift_name, uid_bl.c_str(), uid_bl.length()); return ret; } @@ -115,9 +115,14 @@ int rgw_get_user_info_from_index(string& key, rgw_bucket& bucket, RGWUserInfo& i return ret; bufferlist::iterator iter = bl.begin(); - ::decode(uid, iter); - if (!iter.end()) - info.decode(iter); + try { + ::decode(uid, iter); + if (!iter.end()) + info.decode(iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; + return -EIO; + } return 0; } @@ -141,12 +146,12 @@ int rgw_get_user_info_by_email(string& email, RGWUserInfo& info) } /** - * Given an openstack username, finds the user_info associated with it. + * Given an swift username, finds the user_info associated with it. * returns: 0 on success, -ERR# on failure (including nonexistence) */ -extern int rgw_get_user_info_by_openstack(string& openstack_name, RGWUserInfo& info) +extern int rgw_get_user_info_by_swift(string& swift_name, RGWUserInfo& info) { - return rgw_get_user_info_from_index(openstack_name, ui_openstack_bucket, info); + return rgw_get_user_info_from_index(swift_name, ui_swift_bucket, info); } /** @@ -173,7 +178,12 @@ static int rgw_read_buckets_from_attr(string& user_id, RGWUserBuckets& buckets) return ret; bufferlist::iterator iter = bl.begin(); - buckets.decode(iter); + try { + buckets.decode(iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: failed to decode buckets info, caught buffer::error" << dendl; + return -EIO; + } return 0; } @@ -230,13 +240,18 @@ int rgw_read_user_buckets(string user_id, RGWUserBuckets& buckets, bool need_sta bufferlist::iterator p = bl.begin(); bufferlist header; map<string,bufferlist> m; - ::decode(header, p); - ::decode(m, p); - for (map<string,bufferlist>::iterator q = m.begin(); q != m.end(); q++) { - bufferlist::iterator iter = q->second.begin(); - RGWBucketEnt bucket; - ::decode(bucket, iter); - buckets.add(bucket); + try { + ::decode(header, p); + ::decode(m, p); + for (map<string,bufferlist>::iterator q = m.begin(); q != m.end(); q++) { + bufferlist::iterator iter = q->second.begin(); + RGWBucketEnt bucket; + ::decode(bucket, iter); + buckets.add(bucket); + } + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: failed to decode bucket information, caught buffer::error" << dendl; + return -EIO; } } else { ret = rgw_read_buckets_from_attr(user_id, buckets); @@ -392,9 +407,9 @@ int rgw_remove_email_index(string& uid, string& email) return ret; } -int rgw_remove_openstack_name_index(string& uid, string& openstack_name) +int rgw_remove_swift_name_index(string& uid, string& swift_name) { - rgw_obj obj(ui_openstack_bucket, openstack_name); + rgw_obj obj(ui_swift_bucket, swift_name); int ret = rgwstore->delete_obj(NULL, uid, obj); return ret; } @@ -421,34 +436,49 @@ int rgw_delete_user(RGWUserInfo& info, bool purge_data) { } map<string, RGWAccessKey>::iterator kiter = info.access_keys.begin(); for (; kiter != info.access_keys.end(); ++kiter) { + RGW_LOG(0) << "removing key index: " << kiter->first << dendl; ret = rgw_remove_key_index(kiter->second); - if (ret < 0 && ret != -ENOENT) - RGW_LOG(0) << "ERROR: could not remove " << kiter->first << " (access key object), should be fixed manually (err=" << ret << ")" << dendl; + if (ret < 0 && ret != -ENOENT) { + RGW_LOG(0) << "ERROR: could not remove " << kiter->first << " (access key object), should be fixed (err=" << ret << ")" << dendl; + return ret; + } } - rgw_obj uid_obj(ui_uid_bucket, info.user_id); - ret = rgwstore->delete_obj(NULL, info.user_id, uid_obj); - if (ret < 0 && ret != -ENOENT) - RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed manually (err=" << ret << ")" << dendl; - - string buckets_obj_id; - get_buckets_obj(info.user_id, buckets_obj_id); - rgw_obj uid_bucks(ui_uid_bucket, buckets_obj_id); - ret = rgwstore->delete_obj(NULL, info.user_id, uid_bucks); - if (ret < 0 && ret != -ENOENT) - RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed manually (err=" << ret << ")" << dendl; - - rgw_obj email_obj(ui_email_bucket, info.user_email); + RGW_LOG(0) << "removing email index: " << info.user_email << dendl; ret = rgwstore->delete_obj(NULL, info.user_id, email_obj); - if (ret < 0 && ret != -ENOENT) - RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << email_obj << ", should be fixed manually (err=" << ret << ")" << dendl; + if (ret < 0 && ret != -ENOENT) { + RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << email_obj << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } if (purge_data) { + RGW_LOG(0) << "purging user buckets" << dendl; ret = rgwstore->purge_buckets(info.user_id, buckets_vec); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) { RGW_LOG(0) << "ERROR: delete_buckets returned " << ret << dendl; + return ret; + } + } + + string buckets_obj_id; + get_buckets_obj(info.user_id, buckets_obj_id); + rgw_obj uid_bucks(ui_uid_bucket, buckets_obj_id); + RGW_LOG(0) << "removing user buckets index" << dendl; + ret = rgwstore->delete_obj(NULL, info.user_id, uid_bucks); + if (ret < 0 && ret != -ENOENT) { + RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed (err=" << ret << ")" << dendl; + return ret; + } + + rgw_obj uid_obj(ui_uid_bucket, info.user_id); + RGW_LOG(0) << "removing user index: " << info.user_id << dendl; + ret = rgwstore->delete_obj(NULL, info.user_id, uid_obj); + if (ret < 0 && ret != -ENOENT) { + RGW_LOG(0) << "ERROR: could not remove " << info.user_id << ":" << uid_obj << ", should be fixed (err=" << ret << ")" << dendl; + return ret; } + return 0; } @@ -485,7 +515,12 @@ int rgw_retrieve_pool_info(int64_t pool_id, RGWPoolInfo& pool_info) return ret; } bufferlist::iterator iter = bl.begin(); - ::decode(pool_info, iter); + try { + ::decode(pool_info, iter); + } catch (buffer::error& err) { + RGW_LOG(0) << "ERROR: failed to decode pool information, caught buffer::error" << dendl; + return -EIO; + } return 0; } diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h index c87ec8cf277..92b964f6941 100644 --- a/src/rgw/rgw_user.h +++ b/src/rgw/rgw_user.h @@ -11,7 +11,7 @@ using namespace std; #define USER_INFO_POOL_NAME ".users" #define USER_INFO_EMAIL_POOL_NAME ".users.email" -#define USER_INFO_OPENSTACK_POOL_NAME ".users.openstack" +#define USER_INFO_SWIFT_POOL_NAME ".users.swift" #define USER_INFO_UID_POOL_NAME ".users.uid" #define RGW_USER_ANON_ID "anonymous" @@ -53,15 +53,15 @@ extern int rgw_store_user_info(RGWUserInfo& info); */ extern int rgw_get_user_info_by_uid(string& user_id, RGWUserInfo& info); /** - * Given an openstack username, finds the user info associated with it. + * Given an swift username, finds the user info associated with it. * returns: 0 on success, -ERR# on failure (including nonexistence) */ extern int rgw_get_user_info_by_email(string& email, RGWUserInfo& info); /** - * Given an openstack username, finds the user info associated with it. + * Given an swift username, finds the user info associated with it. * returns: 0 on success, -ERR# on failure (including nonexistence) */ -extern int rgw_get_user_info_by_openstack(string& openstack_name, RGWUserInfo& info); +extern int rgw_get_user_info_by_swift(string& swift_name, RGWUserInfo& info); /** * Given an access key, finds the user info associated with it. * returns: 0 on success, -ERR# on failure (including nonexistence) @@ -147,7 +147,7 @@ extern int rgw_remove_bucket(string user_id, rgw_bucket& bucket, bool purge_data extern int rgw_remove_key_index(RGWAccessKey& access_key); extern int rgw_remove_uid_index(string& uid); extern int rgw_remove_email_index(string& uid, string& email); -extern int rgw_remove_openstack_name_index(string& uid, string& openstack_name); +extern int rgw_remove_swift_name_index(string& uid, string& swift_name); extern int rgw_store_pool_info(int64_t pool_id, RGWPoolInfo& pool_info); extern int rgw_retrieve_pool_info(int64_t pool_id, RGWPoolInfo& pool_info); diff --git a/src/test/cli/radosgw_admin/help.t b/src/test/cli/radosgw_admin/help.t index adc6259534c..8860c3cdd02 100644 --- a/src/test/cli/radosgw_admin/help.t +++ b/src/test/cli/radosgw_admin/help.t @@ -27,11 +27,11 @@ --uid=<id> user id --subuser=<name> subuser name --access-key=<key> S3 access key - --os-user=<group:name> OpenStack user + --swift-user=<group:name> Swift user --email=<email> --auth_uid=<auid> librados uid --secret=<key> S3 key - --os-secret=<key> OpenStack key + --swift-secret=<key> Swift key --gen-access-key generate random access key --gen-secret generate random secret key --access=<access> Set access permissions for sub-user, should be one diff --git a/src/test/store_test.cc b/src/test/store_test.cc index a3c9f529e56..10fb6564dc2 100644 --- a/src/test/store_test.cc +++ b/src/test/store_test.cc @@ -253,7 +253,7 @@ public: // hash //boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5); ++seq; - return hobject_t(name, CEPH_NOSNAP, rand()); + return hobject_t(name, string(), CEPH_NOSNAP, rand()); } }; @@ -469,7 +469,7 @@ TEST_F(StoreTest, HashCollisionTest) { if (!(i % 5)) { cerr << "Object " << i << std::endl; } - hobject_t hoid(string(buf) + base, CEPH_NOSNAP, 0); + hobject_t hoid(string(buf) + base, string(), CEPH_NOSNAP, 0); { ObjectStore::Transaction t; t.touch(cid, hoid); |