summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGreg Farnum <greg@inktank.com>2013-08-27 17:26:36 -0700
committerGreg Farnum <greg@inktank.com>2013-08-27 17:26:36 -0700
commit9101433a889e0221529a16bf2a8a5ed0890e1a10 (patch)
tree764ce395cfb3f0590a3aa144dde23581fdfdd69c
parent6c432f1932f2a6361fb439ed0b95b35ba0f711e6 (diff)
parent7cc2eb246df14925ca27b8dee19b32e0bdb505a8 (diff)
downloadceph-9101433a889e0221529a16bf2a8a5ed0890e1a10.tar.gz
Merge remote-tracking branch 'origin/master' into wip-6029
Conflicts: src/librados/AioCompletionImpl.h
-rw-r--r--.gitignore1
-rw-r--r--COPYING5
-rw-r--r--README28
-rw-r--r--ceph.spec.in4
-rw-r--r--configure.ac17
-rw-r--r--debian/control3
-rw-r--r--debian/copyright5
-rwxr-xr-xdo_autogen.sh6
-rw-r--r--doc/changelog/v0.67.2.txt207
-rw-r--r--doc/dev/osd_internals/erasure_coding.rst5
-rw-r--r--doc/dev/osd_internals/erasure_coding/PGBackend-h.rst5
-rw-r--r--doc/dev/osd_internals/erasure_coding/developer_notes.rst21
-rw-r--r--doc/install/upgrading-ceph.rst57
-rw-r--r--doc/man/8/monmaptool.rst2
-rw-r--r--doc/rados/deployment/ceph-deploy-mon.rst7
-rw-r--r--doc/rados/operations/operating.rst31
-rw-r--r--doc/release-notes.rst24
-rwxr-xr-xqa/workunits/suites/fsstress.sh15
-rw-r--r--src/Makefile.am62
-rw-r--r--src/arch/intel.c46
-rw-r--r--src/arch/intel.h16
-rw-r--r--src/arch/probe.cc20
-rw-r--r--src/arch/probe.h16
-rw-r--r--src/auth/cephx/CephxKeyServer.cc6
-rwxr-xr-xsrc/ceph-disk15
-rw-r--r--src/ceph_osd.cc2
-rw-r--r--src/client/fuse_ll.cc2
-rw-r--r--src/common/config_opts.h8
-rw-r--r--src/common/crc32c-intel.c113
-rw-r--r--src/common/crc32c.cc41
-rw-r--r--src/common/crc32c_intel_baseline.c126
-rw-r--r--src/common/crc32c_intel_baseline.h14
-rw-r--r--src/common/crc32c_intel_fast.c30
-rw-r--r--src/common/crc32c_intel_fast.h28
-rw-r--r--src/common/crc32c_intel_fast_asm.S664
-rw-r--r--src/common/sctp_crc32.c2
-rw-r--r--src/common/sctp_crc32.h14
-rw-r--r--src/common/sharedptr_registry.hpp25
-rw-r--r--src/gtest/.gitignore1
-rw-r--r--src/include/buffer.h2
-rw-r--r--src/include/crc32c.h30
-rw-r--r--src/init-ceph.in4
-rwxr-xr-xsrc/init-rbdmap2
-rw-r--r--src/json_spirit/json_spirit_writer_template.h2
-rw-r--r--src/librados-config.cc3
-rw-r--r--src/librados/AioCompletionImpl.h22
-rw-r--r--src/librbd/internal.cc2
-rw-r--r--src/mds/MDCache.cc2
-rw-r--r--src/mds/flock.cc5
-rw-r--r--src/mon/DataHealthService.cc42
-rw-r--r--src/mon/DataHealthService.h1
-rw-r--r--src/mon/MonCap.cc2
-rw-r--r--src/mon/MonCommands.h2
-rw-r--r--src/mon/Monitor.cc1
-rw-r--r--src/mon/MonitorDBStore.h8
-rw-r--r--src/mon/PGMap.cc19
-rw-r--r--src/mon/PGMap.h1
-rw-r--r--src/mon/PGMonitor.cc6
-rw-r--r--src/mon/Paxos.cc37
-rw-r--r--src/mon/Paxos.h3
-rw-r--r--src/mon/mon_types.h68
-rw-r--r--src/msg/Message.h4
-rw-r--r--src/msg/Pipe.cc8
-rw-r--r--src/os/BtrfsFileStoreBackend.cc34
-rw-r--r--src/os/FileStore.cc8
-rw-r--r--src/os/FlatIndex.cc5
-rw-r--r--src/os/KeyValueDB.h2
-rw-r--r--src/os/LFNIndex.cc4
-rw-r--r--src/os/LevelDBStore.h64
-rw-r--r--src/os/WBThrottle.cc2
-rw-r--r--src/osd/OSD.cc75
-rw-r--r--src/osd/OSD.h1
-rw-r--r--src/osd/OSDCap.cc2
-rw-r--r--src/osd/PG.cc3
-rw-r--r--src/osd/PG.h9
-rw-r--r--src/osd/ReplicatedPG.cc340
-rw-r--r--src/osd/ReplicatedPG.h105
-rw-r--r--src/osd/Watch.cc18
-rw-r--r--src/osd/Watch.h10
-rw-r--r--src/osd/osd_types.h29
-rw-r--r--src/osdc/ObjectCacher.cc31
-rw-r--r--src/osdc/ObjectCacher.h18
-rw-r--r--src/osdc/Objecter.cc17
-rwxr-xr-xsrc/pybind/ceph_rest_api.py13
-rw-r--r--src/rgw/rgw_admin.cc4
-rw-r--r--src/rgw/rgw_bucket.cc7
-rw-r--r--src/rgw/rgw_cache.h9
-rw-r--r--src/rgw/rgw_rados.cc11
-rw-r--r--src/rgw/rgw_rados.h1
-rw-r--r--src/test/ObjectMap/KeyValueDBMemory.h18
-rw-r--r--src/test/ObjectMap/test_store_tool/test_store_tool.cc14
-rw-r--r--src/test/cli/radosgw-admin/help.t1
-rw-r--r--src/test/common/test_sharedptr_registry.cc24
-rw-r--r--src/test/mon/moncap.cc1
-rw-r--r--src/test/osd/osdcap.cc1
-rw-r--r--src/test/pybind/test_rados.py8
-rw-r--r--src/test/test_osd_types.cc9
-rw-r--r--src/tools/ceph-monstore-tool.cc3
-rwxr-xr-xsrc/vstart.sh89
-rwxr-xr-xsrc/yasm-wrapper38
100 files changed, 2317 insertions, 681 deletions
diff --git a/.gitignore b/.gitignore
index 6dc8d49b3ae..211c09cbba7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,4 @@ web/*.html
# dir from coverity tools
cov-int/
+/test-driver \ No newline at end of file
diff --git a/COPYING b/COPYING
index 5e18e66bf5a..28d88ebb7fa 100644
--- a/COPYING
+++ b/COPYING
@@ -26,6 +26,11 @@ Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
+Files: src/common/crc32c_intel*:
+Copyright:
+ Copyright 2012-2013 Intel Corporation All Rights Reserved.
+License: BSD 3-clause
+
Files: src/common/sctp_crc32.c:
Copyright:
Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
diff --git a/README b/README
index 1dcf94512ac..3662d0ea2cb 100644
--- a/README
+++ b/README
@@ -97,7 +97,11 @@ To build the documentation, ensure that you are in the top-level `/ceph director
Build Prerequisites
--------------------
+===================
+
+
+debian-based
+------------
To build the source code, you must install the following:
- automake
@@ -132,3 +136,25 @@ To build the source code, you must install the following:
For example:
$ apt-get install automake autoconf pkg-config gcc g++ make libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libaio-dev libgoogle-perftools-dev libkeyutils-dev uuid-dev libatomic-ops-dev libboost-program-options-dev libboost-thread-dev libexpat1-dev libleveldb-dev libsnappy-dev libcurl4-gnutls-dev python-argparse python-flask
+
+rpm-based
+---------
+These are the rpm packages needed to install in an rpm-based OS:
+
+ autoconf
+ automake
+ gcc
+ make
+ libtool
+ python-argparse
+ python-flask
+ libuuid-devel
+ nss-devel
+ fuse-devel
+ gperftools-devel
+ libedit-devel
+ libatomic_ops-devel
+ snappy-devel
+ leveldb-devel
+ libaio-devel
+ boost-devel
diff --git a/ceph.spec.in b/ceph.spec.in
index 82c9d073980..8091018c1dc 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -42,6 +42,7 @@ BuildRequires: libcurl-devel
BuildRequires: libxml2-devel
BuildRequires: libuuid-devel
BuildRequires: leveldb-devel > 1.2
+BuildRequires: yasm
%if 0%{?rhel_version} || 0%{?centos_version} || 0%{?fedora}
BuildRequires: snappy-devel
%endif
@@ -125,7 +126,6 @@ Requires: apache2-mod_fcgid
%else
BuildRequires: expat-devel
BuildRequires: fcgi-devel
-Requires: mod_fcgid
%endif
%description radosgw
radosgw is an S3 HTTP REST gateway for the RADOS object store. It is
@@ -249,7 +249,7 @@ BuildRequires: junit
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
-%if (0%{?centos} || 0%{?opensuse} || 0%{?suse_version})
+%if 0%{?opensuse} || 0%{?suse_version}
%debug_package
%endif
diff --git a/configure.ac b/configure.ac
index 6f3cc6a9f5f..38b6782a46a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -33,6 +33,8 @@ AC_CANONICAL_TARGET
AM_INIT_AUTOMAKE
AM_PROG_CC_C_O
AM_PROG_LIBTOOL
+AM_PROG_AS
+
# enable make V=0 (if automake >1.11)
AM_INIT_AUTOMAKE([foreign])
@@ -59,6 +61,8 @@ if test "$CXX" = no || test "$CXX:$GXX" = "g++:"; then
AC_MSG_ERROR([no C++ compiler found])
fi
+AM_CONDITIONAL(CLANG, test x"$CXX" = x"clang++")
+
#AC_PROG_CC
AC_PROG_MAKE_SET
AC_PROG_LIBTOOL
@@ -68,6 +72,19 @@ AC_PROG_LIBTOOL
AC_SUBST(AM_CXXFLAGS)
AM_CXXFLAGS="${AM_CXXFLAGS}"
+# Check for yasm
+if yasm -f elf64 src/common/crc32c_intel_fast_asm.S -o /dev/null; then
+ echo 'we have a modern and working yasm'
+ if test `arch` = "x86_64"; then
+ echo 'we are x86_64'
+ AC_DEFINE([HAVE_GOOD_YASM_ELF64], [1], [we have a recent yasm and are x86_64])
+ with_good_yasm=yes
+ fi
+else
+ echo 'we do not have a modern/working yasm'
+fi
+AM_CONDITIONAL(WITH_GOOD_YASM_ELF64, test "$with_good_yasm" = "yes")
+
# Checks for compiler warning types
# AC_CHECK_CC_FLAG(FLAG_TO_TEST, VARIABLE_TO_SET_IF_SUPPORTED)
diff --git a/debian/control b/debian/control
index 195cb37fe62..44ee725efd4 100644
--- a/debian/control
+++ b/debian/control
@@ -34,7 +34,8 @@ Build-Depends: autoconf,
libxml2-dev,
pkg-config,
python (>= 2.6.6-3~),
- uuid-dev
+ uuid-dev,
+ yasm
Standards-Version: 3.9.3
Package: ceph
diff --git a/debian/copyright b/debian/copyright
index 0dc2160ae9c..aa91a149853 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -27,6 +27,11 @@ Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
+Files: src/common/crc32c_intel*:
+Copyright:
+ Copyright 2012-2013 Intel Corporation All Rights Reserved.
+License: BSD 3-clause
+
Files: src/common/sctp_crc32.c:
Copyright:
Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
diff --git a/do_autogen.sh b/do_autogen.sh
index baf2dc1eba7..32e9df4623b 100755
--- a/do_autogen.sh
+++ b/do_autogen.sh
@@ -82,7 +82,11 @@ if [ "${debug_level}" -ge 3 ]; then
-Wno-missing-field-initializers -Wno-missing-declarations"
fi
if [ "${debug_level}" -ge 4 ]; then
- CXXFLAGS="${CXXFLAGS} -Wstrict-null-sentinel -Woverloaded-virtual"
+ if [ "${CXX}" -ne "clang++" ]; then
+ CXXFLAGS="${CXXFLAGS} -Wstrict-null-sentinel -Woverloaded-virtual"
+ else
+ CXXFLAGS="${CXXFLAGS} -Woverloaded-virtual"
+ fi
CFLAGS="${CFLAGS} \
-Wuninitialized -Winit-self \
-Wformat=2 -Wunused -Wfloat-equal \
diff --git a/doc/changelog/v0.67.2.txt b/doc/changelog/v0.67.2.txt
new file mode 100644
index 00000000000..e5d0f3d5e5b
--- /dev/null
+++ b/doc/changelog/v0.67.2.txt
@@ -0,0 +1,207 @@
+commit eb4380dd036a0b644c6283869911d615ed729ac8
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Thu Aug 22 19:10:55 2013 -0700
+
+ v0.67.2
+
+commit 242e43dae5b7c935b8f92c09e8dfe4704ba13787
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 9 12:49:57 2013 -0700
+
+ .gitignore: ignore test-driver
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit edf2c3449ec96d91d3d7ad01c50f7a79b7b2f7cc)
+
+commit 88aef702fb77c0a176caf37646a11ef480621412
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 9 12:42:49 2013 -0700
+
+ fuse: fix warning when compiled against old fuse versions
+
+ client/fuse_ll.cc: In function 'void invalidate_cb(void*, vinodeno_t, int64_t, int64_t)':
+ warning: client/fuse_ll.cc:540: unused variable 'fino'
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 9833e9dabe010e538cb98c51d79b6df58ce28f9e)
+
+commit 48e104c9486f7a532455df108dbc225c00796097
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 9 12:40:34 2013 -0700
+
+ json_spirit: remove unused typedef
+
+ In file included from json_spirit/json_spirit_writer.cpp:7:0:
+ json_spirit/json_spirit_writer_template.h: In function 'String_type json_spirit::non_printable_to_string(unsigned int)':
+ json_spirit/json_spirit_writer_template.h:37:50: warning: typedef 'Char_type' locally defined but not used [-Wunused-local-typedefs]
+ typedef typename String_type::value_type Char_type;
+
+ (Also, ha ha, this file uses \r\n.)
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 6abae35a3952e5b513895267711fea63ff3bad09)
+
+commit ae42619ca710d737bf4d8c63f39d1102326c903c
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Aug 9 12:31:41 2013 -0700
+
+ gtest: add build-aux/test-driver to .gitignore
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c9cdd19d1cd88b84e8a867f5ab85cb51fdc6f8e4)
+
+commit 2c122be08db2f233d66214eb804734ae45646084
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Wed Aug 21 14:28:49 2013 -0700
+
+ objecter: resend unfinished lingers when osdmap is no longer paused
+
+ Plain Ops that haven't finished yet need to be resent if the osdmap
+ transitions from full or paused to unpaused. If these Ops are
+ triggered by LingerOps, they will be cancelled instead (since
+ should_resend = false), but the LingerOps that triggered them will not
+ be resent.
+
+ Fix this by checking the registered flag for all linger ops, and
+ resending any of them that aren't paused anymore.
+
+ Fixes: #6070
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ Reviewed-by: Sage Weil <sage.weil@inktank.com>
+ (cherry picked from commit 38a0ca66a79af4b541e6322467ae3a8a4483cc72)
+
+commit f6fe74ff51f679e7245b02462822d9ef1e15d28c
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 20 11:23:46 2013 -0700
+
+ pybind: fix Rados.conf_parse_env test
+
+ This happens after we connect, which means we get ENOSYS always.
+ Instead, parse_env inside the normal setup method, which had the added
+ benefit of being able to debug these tests.
+
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 6ef1970340c57d6e02f947348fb38882b51d131c)
+
+commit 47c89497b7f69cbf1557cd05b89837c388e2ba2f
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 13 13:14:59 2013 -0700
+
+ librados: fix MWatchNotify leak
+
+ Do not leak the message if the watcher is not registered. Also, simplify
+ this block.
+
+ Fixes (part of): #5949
+ Backport: dumpling, cuttlefish
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 6f5d8036f3e70c5e30edf7e36fb8ff9a56197f60)
+
+commit b3a9a8c4e5edff5431d8da71033047eced6bf985
+Author: Samuel Just <sam.just@inktank.com>
+Date: Mon Aug 19 17:23:44 2013 -0700
+
+ PG: remove old log when we upgrade log version
+
+ Otherwise the log_oid will be non-empty and the next
+ boot will cause us to try to upgrade again.
+
+ Fixes: #6057
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 1f851cb2489a95526de932ec6734ebf413e43490)
+
+commit c6005ccbaa482c62d7a6cbb387bdcf17f0e308d5
+Author: Samuel Just <sam.just@inktank.com>
+Date: Mon Aug 19 00:02:24 2013 -0700
+
+ PGLog: add a config to disable PGLog::check()
+
+ This is a debug check which may be causing excessive
+ cpu usage.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 00080d785f6695b800f71317a3048a21064e61cb)
+
+commit 96d719eeecceaa06078a29c2f868e50e6bc9ab31
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Aug 19 12:48:50 2013 -0700
+
+ ceph: parse CEPH_ARGS environment variable
+
+ Fixes: #6052
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 67a95b9880c9bc6e858150352318d68d64ed74ad)
+
+commit d348cf5d135d099fe0490c1519196cd83a04831e
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Aug 19 12:48:40 2013 -0700
+
+ rados pybind: add conf_parse_env()
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit eef7cacdb19313907a9367187b742db5382ee584)
+
+commit 290bcd8a718887eb0e28aa2d97bceeee79068ea9
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 13 13:16:07 2013 -0700
+
+ rgw: drain requests before exiting
+
+ Fixes: #5953
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 3cbf6a7b031c2ce8072733c5c0b7ceb53fdcb090)
+
+commit 863df08a43dff99797453040eb1ef6071b0432f9
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Aug 13 11:16:17 2013 -0700
+
+ rgw: do not leak handler in get_handler() error path
+
+ If we fail to initialize, delete the handler.
+
+ Fixes (part of): #5949
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 810c52de36719c3ee6cf2bdf59d5cde8840bbe55)
+
+commit 9ac003f793b6cc72059110aac44014ddf2372bee
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Jul 26 23:20:54 2013 -0700
+
+ rgw: fix leak of RGWDataChangesLog::renew_thread
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 25948319c4d256c4aeb0137eb88947e54d14cc79)
+
+commit 89cd9dc403e97b4bd08920fbb5d6e2b8b9b7dac2
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Jul 26 23:17:10 2013 -0700
+
+ rgw: free resolver on shutdown
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit a31356338b8ae55df59d829d9080ffad70b97d10)
+
+commit 5b26ca7fa5beb87cbbe6bbb26d70789ff2aa7661
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Jul 26 23:22:20 2013 -0700
+
+ rgw: fix up signal handling
+
+ OMG libfcgi is annoying with shutdown and signals. You need to close
+ the fd *and* resend a signal to ensure that you kick the accept loop
+ hard enough to make it shut down.
+
+ Document this, and switch to the async signal handlers. Put them
+ tightly around the runtime loop as we do with other daemons.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 59b13cebee600dad2551d2c7dc3482b05eaf8b22)
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index bfc425251a8..cc1efe4b4bf 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -27,7 +27,8 @@ Glossary
Example:
::
- OSD 40 OSD 33
+
+ OSD 40 OSD 33
+-------------------------+ +-------------------------+
| shard 0 - PG 10 | | shard 1 - PG 10 |
|+------ object O -------+| |+------ object O -------+|
@@ -54,4 +55,4 @@ Table of content
High level design document <erasure_coding/pgbackend>
Developer notes <erasure_coding/developer_notes>
-
+ Draft PGBackend.h header <erasure_coding/PGBackend-h>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
index 7e1998382a0..b39cdb0e88e 100644
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
@@ -1,5 +1,10 @@
+===========
PGBackend.h
+===========
+
+Work in progress:
::
+
/**
* PGBackend
*
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index 496a4a99f76..d542fdb86e2 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -31,6 +31,7 @@ is stored as an attribute of the object. The chunk *1* contains *ABC*
and is stored on *OSD5*, the chunk *4* contains *XYY* and is stored on
*OSD3*.
::
+
+-------------------+
name | NYAN |
+-------------------+
@@ -82,6 +83,7 @@ could not be read because the *OSD4* is *out*. The decoding function
is called as soon as three chunks are read : *OSD2* was the slowest
and its chunk was not taken into account.
::
+
+-------------------+
name | NYAN |
+-------------------+
@@ -132,6 +134,7 @@ the payload into M+K chunks and send them to the OSDs in the acting
set. It is also responsible for maintaining an authoritative version
of the placement group logs.
::
+
primary
+---OSD 1---+
| log |
@@ -155,6 +158,7 @@ of the placement group logs.
An erasure coded placement group has been created with M = 2 + K = 1 and is supported by three OSDs, two for M and one for K. The acting set of the placement group is made of *OSD 1* *OSD 2* and *OSD 3*. An object has been encoded and stored in the OSDs : the chunk D1v1 (i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2* and P1v1 (i.e. Parity chunk number 1 version 1) on *OSD 3*. The placement group logs on each OSD are in synch at epoch 1 version 1 (i.e. 1,1).
::
+
primary
+---OSD 1---+
|+----+ log |
@@ -180,6 +184,7 @@ An erasure coded placement group has been created with M = 2 + K = 1 and is supp
*OSD 1* is the primary and receives a WRITE FULL from a client, meaning the payload is to replace the content of the object entirely, it is not a partial write that would only overwrite part of it. The version two of the object is created to override the version one. *OSD 1* encodes the payload into three chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on *OSD 2* and P1v2 (i.e. Parity chunk number 1 version 2) on *OSD 3*. Each chunk is sent to the target OSD, including the primary OSD which is responsible for storing chunks in addition to handling write operations and maintaining an authoritative version of the placement group logs. When an OSD receives the message instructing it to write the chunk, it also creates a new entry in the placement group logs to reflect the change. For instance, as soon as *OSD 3* stores *P1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because the OSDs work asynchronously, some chunks may still be in flight ( such as *D2v2* ) while others are acknowledged and on disk ( such as *P1v1* and *D1v1* ).
::
+
primary
+---OSD 1---+
|+----+ log |
@@ -208,6 +213,7 @@ An erasure coded placement group has been created with M = 2 + K = 1 and is supp
If all goes well, the chunks are acknowledged on each OSD in the acting set and the *last_complete* pointer of the logs can move from *1,1* to *1,2* and the files used to store the chunks of the previous version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on *OSD 2* and *P1v1* on *OSD 3*.
::
+
+---OSD 1---+
| |
| DOWN |
@@ -234,6 +240,7 @@ If all goes well, the chunks are acknowledged on each OSD in the acting set and
But accidents happen. If *OSD 1* goes down while *D2v2* is still in flight, the version 2 of the object is partially written : *OSD 3* has one chunk but does not have enough to recover. It lost two chunks : *D1v2* and *D2v2* but the erasure coding parameters M = 2 + K = 1 requires that at least two chunks are available to rebuild the third. *OSD 4* becomes the new primary and finds that the *last_complete* log entry ( i.e. all objects before this entry were known to be available on all OSDs in the previous acting set ) is *1,1* and will be the head of the new authoritative log.
::
+
+---OSD 2---+
|+----+ log |
||D2v1| 1,1 |
@@ -252,6 +259,7 @@ But accidents happen. If *OSD 1* goes down while *D2v2* is still in flight, the
The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log provided by *OSD 4* : it is discarded and the file containing the *P1v2* chunk is removed.
::
+
+---OSD 2---+
|+----+ log |
||D2v1| 1,1 |
@@ -275,6 +283,7 @@ Interrupted append
An object is coded in stripes as described above. In the case of a full write, and assuming the object size is not too large to encode it in memory, there is a single stripe. When appending to an existing object, the stripe size is retrieved from the attributes of the object and if the total size of the object is a multiple of the stripe size and the payload of the append message is lower or equal to the strip size, the following applies. It applies, for instance, when *rgw* writes an object with sequence of append instead of a single write.
::
+
primary
+---OSD 1---+
|+-s1-+ log |
@@ -298,6 +307,7 @@ An object is coded in stripes as described above. In the case of a full write, a
*OSD 1* is the primary and receives an APPEND from a client, meaning the payload is to be appended at the end of the object. *OSD 1* encodes the payload into three chunks : S2D1 (i.e. Stripe two data chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on *OSD 2* and S2P1 (i.e. Stripe two parity chunk number 1 ) in s3 on *OSD 3*. Each chunk is sent to the target OSD, including the primary OSD which is responsible for storing chunks in addition to handling write operations and maintaining an authoritative version of the placement group logs. When an OSD receives the message instructing it to write the chunk, it also creates a new entry in the placement group logs to reflect the change. For instance, as soon as *OSD 3* stores *S2P1*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. The log entry also carries the nature of the operation: in this case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work asynchronously, some chunks may still be in flight ( such as *S2D2* ) while others are acknowledged and on disk ( such as *S2D1* and *S2P1* ).
::
+
+---OSD 1---+
| |
| DOWN |
@@ -323,6 +333,7 @@ An object is coded in stripes as described above. In the case of a full write, a
If *OSD 1* goes down while *S2D2* is still in flight, the payload is partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does not have enough to recover because s1 and s2 don't have it. It lost two chunks : *S2D1* and *S2D2* but the erasure coding parameters M = 2 + K = 1 requires that at least two chunks are available to rebuild the third. *OSD 4* becomes the new primary and finds that the *last_complete* log entry ( i.e. all objects before this entry were known to be available on all OSDs in the previous acting set ) is *1,1* and will be the head of the new authoritative log.
::
+
+---OSD 2---+
|+-s2-+ log |
||S1D2| 1,1 |
@@ -341,9 +352,11 @@ If *OSD 1* goes down while *S2D2* is still in flight, the payload is partially a
The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log provided by *OSD 4* : it is discarded and the file containing the *S2P1* chunk is truncated to the nearest multiple of the stripe size.
-`Erasure code library <http://tracker.ceph.com/issues/5878>`_
+Erasure code library
--------------------
+See also `the corresponding tracker issue <http://tracker.ceph.com/issues/5878>`_
+
Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
with parameters M+K object O is encoded by dividing it into chunks O1,
O2, ... OM and computing parity chunks P1, P2, ... PK. Any M chunks
@@ -377,6 +390,7 @@ Although Reed-Solomon is provided as a default, Ceph uses it via an
abstract API designed to allow each pool to choose the plugin that
implements it.
::
+
ceph osd pool create <pool> \
erasure-code-directory=<dir> \
erasure-code-plugin=<plugin>
@@ -387,12 +401,14 @@ The *<plugin>* is dynamically loaded from *<dir>* (defaults to
which is responsible for registering an object derived from
*ErasureCodePlugin* in the registry singleton :
::
+
registry.plugins[plugin_name] = new ErasureCodePluginExample();
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
object can be generated:
::
+
virtual int factory(ErasureCodeInterfaceRef *erasure_code,
const map<std::string,std::string> &parameters) {
*erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample(parameters));
@@ -402,6 +418,7 @@ object can be generated:
The *parameters* is the list of *key=value* pairs that were set when the pool
was created. Each *key* must be prefixed with erasure-code to avoid name collisions
::
+
ceph osd pool create <pool> \
erasure-code-directory=<dir> \ # mandatory
erasure-code-plugin=jerasure \ # mandatory
@@ -419,6 +436,7 @@ Erasure code jerasure plugin
The parameters interpreted by the jerasure plugin are:
::
+
ceph osd pool create <pool> \
erasure-code-directory=<dir> \ # plugin directory absolute path
erasure-code-plugin=jerasure \ # plugin name (only jerasure)
@@ -507,6 +525,7 @@ require to encode the first object and not all of them.
Objects can be further divided into stripes to reduce the overhead of
partial writes. For instance:
::
+
+-----------------------+
|+---------------------+|
|| stripe 0 ||
diff --git a/doc/install/upgrading-ceph.rst b/doc/install/upgrading-ceph.rst
index 839f315bcd4..6020cb6a237 100644
--- a/doc/install/upgrading-ceph.rst
+++ b/doc/install/upgrading-ceph.rst
@@ -202,19 +202,13 @@ Multi-MDS configurations with identical names must be adjusted accordingly to
give daemons unique names. If you run your cluster with one metadata server,
you can disregard this notice for now.
+
ceph-deploy
-----------
-The ceph-deploy tool is now the preferred method of provisioning new
-clusters. For existing clusters created via mkcephfs that would like
-to transition to the new tool, there is a migration path, documented
-at `Transitioning to ceph-deploy`_. Note that transitioning to
-ceph-deploy is not required; it is entirely acceptable to continue
-provisioning new OSDs and monitors using the previous methods.
-However, ceph-deploy streamlines these processes significantly.
-
-.. _Transitioning to ceph-deploy: ../../rados/deployment/ceph-deploy-transition
-
+The ``ceph-deploy`` tool is now the preferred method of provisioning new clusters.
+For existing clusters created via ``mkcephfs`` that would like to transition to the
+new tool, there is a migration path, documented at `Transitioning to ceph-deploy`_.
Cuttlefish to Dumpling
======================
@@ -248,7 +242,7 @@ Then add a new ``ceph.repo`` repository entry with the following contents.
gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
-.. important:: Ensure you use the correct URL for your distribution. Check the
+.. note:: Ensure you use the correct URL for your distribution. Check the
http://ceph.com/rpm directory for your distribution.
.. note:: Since you can upgrade using ``ceph-deploy`` you will only need to add
@@ -387,8 +381,8 @@ To upgrade a Ceph OSD Daemon, perform the following steps:
sudo restart ceph-osd id=N
For multiple OSDs on a host, you may restart all of them with Upstart. ::
-
- sudo restart ceph-osd-all
+
+ sudo restart ceph-osd-all
For CentOS/Red Hat distributions, use::
@@ -462,6 +456,43 @@ cluster, we recommend upgrading ``ceph-common`` and client libraries
If you do not have the latest version, you may need to uninstall, auto remove
dependencies and reinstall.
+
+Transitioning to ceph-deploy
+============================
+
+If you have an existing cluster that you deployed with ``mkcephfs`` (usually
+Argonaut or Bobtail releases), you will need to make a few changes to your
+configuration to ensure that your cluster will work with ``ceph-deploy``.
+
+
+Monitor Keyring
+---------------
+
+You will need to add ``caps mon = "allow *"`` to your monitor keyring if it is
+not already in the keyring. By default, the monitor keyring is located under
+``/var/lib/ceph/mon/ceph-$id/keyring``. When you have added the ``caps``
+setting, your monitor keyring should look something like this::
+
+ [mon.]
+ key = AQBJIHhRuHCwDRAAZjBTSJcIBIoGpdOR9ToiyQ==
+ caps mon = "allow *"
+
+Adding ``caps mon = "allow *"`` will ease the transition from ``mkcephfs`` to
+``ceph-deploy`` by allowing ``ceph-create-keys`` to use the ``mon.`` keyring
+file in ``$mon_data`` and get the caps it needs.
+
+
+Use Default Paths
+-----------------
+
+Under the ``/var/lib/ceph`` directory, the ``mon`` and ``osd`` directories need
+to use the default paths.
+
+- **OSDs**: The path should be ``/var/lib/ceph/osd/ceph-$id``
+- **MON**: The path should be ``/var/lib/ceph/mon/ceph-$id``
+
+Under those directories, the keyring should be in a file named ``keyring``.
+
.. _Monitor Config Reference: ../../rados/configuration/mon-config-ref
.. _Joao's blog post: http://ceph.com/dev-notes/cephs-new-monitor-changes
.. _Ceph Authentication: ../../rados/operations/authentication/
diff --git a/doc/man/8/monmaptool.rst b/doc/man/8/monmaptool.rst
index ffc17c4c5f1..8415ba4136a 100644
--- a/doc/man/8/monmaptool.rst
+++ b/doc/man/8/monmaptool.rst
@@ -83,7 +83,7 @@ To create a new map with three monitors (for a fresh Ceph file system)::
To display the contents of the map::
- monmaptool --print onmap
+ monmaptool --print monmap
To replace one monitor::
diff --git a/doc/rados/deployment/ceph-deploy-mon.rst b/doc/rados/deployment/ceph-deploy-mon.rst
index fcd25756e64..a6123e596ca 100644
--- a/doc/rados/deployment/ceph-deploy-mon.rst
+++ b/doc/rados/deployment/ceph-deploy-mon.rst
@@ -13,10 +13,9 @@ only install one monitor per host.**
For high availability, you should run a production Ceph cluster with **AT
LEAST** three monitors. Ceph uses the Paxos algorithm, which requires a
-consensus among the majority of monitors in a quorum. You can establish a
-monitor quorum with only one monitor; however, you can not determine a majority
-with two monitors. A majority of monitors must be counted as such: 1:1, 2:3,
-3:4, 3:5, 4:6, etc.
+consensus among the majority of monitors in a quorum. With Paxos, the monitors
+cannot determine a majority for establishing a quorum with only two monitors. A
+majority of monitors must be counted as such: 1:1, 2:3, 3:4, 3:5, 4:6, etc.
See `Monitor Config Reference`_ for details on configuring monitors.
diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst
index 591704217d0..9942ea3cabf 100644
--- a/doc/rados/operations/operating.rst
+++ b/doc/rados/operations/operating.rst
@@ -8,28 +8,31 @@ Running Ceph with Upstart
=========================
When deploying Ceph Cuttlefish and beyond with ``ceph-deploy``, you may start
-and stop Ceph daemons or the entire cluster using the event-based `Upstart`_.
+and stop Ceph daemons on a :term:`Ceph Node` using the event-based `Upstart`_.
Upstart does not require you to define daemon instances in the Ceph configuration
file (although, they are still required for ``sysvinit`` should you choose to
use it).
-To list the Ceph Upstart jobs and instances, execute::
+To list the Ceph Upstart jobs and instances on a node, execute::
sudo initctl list | grep ceph
See `initctl`_ for additional details.
-Starting a Cluster
-------------------
+Starting all Daemons
+--------------------
-To start the cluster, execute the following::
+To start all daemons on a Ceph Node (irrespective of type), execute the
+following::
sudo start ceph-all
-Stopping a Cluster
-------------------
-To stop the cluster, execute the following::
+Stopping all Daemons
+--------------------
+
+To stop all daemons on a Ceph Node (irrespective of type), execute the
+following::
sudo stop ceph-all
@@ -37,7 +40,8 @@ To stop the cluster, execute the following::
Starting all Daemons by Type
----------------------------
-To start all daemons of a particular type, execute one of the following::
+To start all daemons of a particular type on a Ceph Node, execute one of the
+following::
sudo start ceph-osd-all
sudo start ceph-mon-all
@@ -47,7 +51,8 @@ To start all daemons of a particular type, execute one of the following::
Stopping all Daemons by Type
----------------------------
-To stop all daemons of a particular type, execute one of the following::
+To stop all daemons of a particular type on a Ceph Node, execute one of the
+following::
sudo stop ceph-osd-all
sudo stop ceph-mon-all
@@ -57,7 +62,8 @@ To stop all daemons of a particular type, execute one of the following::
Starting a Daemon
-----------------
-To start a specific daemon instance, execute one of the following::
+To start a specific daemon instance on a Ceph Node, execute one of the
+following::
sudo start ceph-osd id={id}
sudo start ceph-mon id={hostname}
@@ -73,7 +79,8 @@ For example::
Stopping a Daemon
-----------------
-To stop a specific daemon instance, execute one of the following::
+To stop a specific daemon instance on a Ceph Node, execute one of the
+following::
sudo stop ceph-osd id={id}
sudo stop ceph-mon id={hostname}
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index 2baeb8ff023..bc043fd037a 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,30 @@
Release Notes
===============
+v0.67.2 "Dumpling"
+------------------
+
+This is an imporant point release for Dumpling. Most notably, it
+fixes a problem when upgrading directly from v0.56.x Bobtail to
+v0.67.x Dumpling (without stopping at v0.61.x Cuttlefish along the
+way). It also fixes a problem with the CLI parsing of the CEPH_ARGS
+environment variable, high CPU utilization by the ceph-osd daemons,
+and cleans up the radosgw shutdown sequence.
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* objecter: resend linger requests when cluster goes from full to non-full
+* ceph: parse CEPH_ARGS environment variable
+* librados: fix small memory leak
+* osd: remove old log objects on upgrade (fixes bobtail -> dumpling jump)
+* osd: disable PGLog::check() via config option (fixes CPU burn)
+* rgw: drain requests on shutdown
+* rgw: misc memory leaks on shutdown
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.2.txt>`.
+
+
v0.67.1 "Dumpling"
------------------
diff --git a/qa/workunits/suites/fsstress.sh b/qa/workunits/suites/fsstress.sh
index d511e375f6f..7f945172687 100755
--- a/qa/workunits/suites/fsstress.sh
+++ b/qa/workunits/suites/fsstress.sh
@@ -1,5 +1,20 @@
#!/bin/bash
+if [ ! -f /usr/lib/ltp/testcases/bin/fsstress ]
+then
+ mkdir -p /tmp/fsstress
+ cd /tmp/fsstress
+ wget -q -O /tmp/fsstress/ltp-full.tgz http://ceph.com/qa/ltp-full-20091231.tgz
+ tar xzf /tmp/fsstress/ltp-full.tgz
+ rm /tmp/fsstress/ltp-full.tgz
+ cd /tmp/fsstress/ltp-full-20091231/testcases/kernel/fs/fsstress
+ make
+ sudo mkdir -p /usr/lib/ltp/testcases/bin
+ sudo cp -avf /tmp/fsstress/ltp-full-20091231/testcases/kernel/fs/fsstress/fsstress /usr/lib/ltp/testcases/bin/fsstress
+ sudo chmod 755 /usr/lib/ltp/testcases/bin/fsstress
+ rm -Rf /tmp/fsstress
+fi
+
command="/usr/lib/ltp/testcases/bin/fsstress -d fsstress-`hostname`$$ -l 1 -n 1000 -p 10 -v"
echo "Starting fsstress $command"
diff --git a/src/Makefile.am b/src/Makefile.am
index 5a0f3472080..4b09c23e872 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -20,7 +20,8 @@ EXTRA_DIST = \
libs3/mswin \
libs3/src \
libs3/test \
- unittest_bufferlist.sh
+ unittest_bufferlist.sh \
+ yasm-wrapper
CLEANFILES =
bin_PROGRAMS =
@@ -1318,7 +1319,6 @@ AM_COMMON_FLAGS = \
-D_THREAD_SAFE \
-D__STDC_FORMAT_MACROS \
-D_GNU_SOURCE \
- -rdynamic \
-Wall \
${WARN_TYPE_LIMITS} \
${WARN_IGNORED_QUALIFIERS} \
@@ -1327,6 +1327,9 @@ AM_COMMON_FLAGS = \
-Werror=format-security \
-fno-strict-aliasing \
-fsigned-char
+if !CLANG
+ AM_COMMON_FLAGS += -rdynamic
+endif
AM_CFLAGS = $(AM_COMMON_FLAGS)
AM_CXXFLAGS = \
@@ -1334,8 +1337,11 @@ AM_CXXFLAGS = \
$(AM_COMMON_FLAGS) \
-DCEPH_LIBDIR=\"${libdir}\" \
-Wnon-virtual-dtor \
- -Wno-invalid-offsetof \
- -Wstrict-null-sentinel
+ -Wno-invalid-offsetof
+
+if !CLANG
+ AM_CXXFLAGS += -Wstrict-null-sentinel
+endif
# note: this is position dependant, it affects the -l options that
# come after it on the command line. when you use ${AM_LDFLAGS} in
# later rules, take care where you place it. for more information, see
@@ -1475,22 +1481,31 @@ clean-local:
# libs
+CCAS = ${srcdir}/yasm-wrapper
+AM_CCASFLAGS = -f elf64
+
+# crc
+libcrc_la_SOURCES = \
+ common/sctp_crc32.c \
+ common/crc32c.cc \
+ common/crc32c_intel_baseline.c \
+ common/crc32c_intel_fast.c
+
+if WITH_GOOD_YASM_ELF64
+libcrc_la_SOURCES += common/crc32c_intel_fast_asm.S
+libcrc_la_LIBTOOLFLAGS = --tag=CC
+endif
+
+noinst_LTLIBRARIES += libcrc.la
+
+# common
libcommon_la_SOURCES = $(libcommon_files)
libcommon_la_CFLAGS= ${CRYPTO_CFLAGS} ${AM_CFLAGS}
libcommon_la_CXXFLAGS= ${AM_CXXFLAGS}
libcommon_la_LDFLAGS = -lrt
+libcommon_la_LIBADD = libcrc.la
noinst_LTLIBRARIES += libcommon.la
-libglobal_la_SOURCES = \
- global/global_context.cc \
- global/global_init.cc \
- global/pidfile.cc \
- global/signal_handler.cc
-libglobal_la_CFLAGS= ${CRYPTO_CFLAGS} ${AM_CFLAGS}
-libglobal_la_CXXFLAGS= ${AM_CXXFLAGS}
-libglobal_la_LIBADD= libcommon.la
-noinst_LTLIBRARIES += libglobal.la
-
crush_files = \
crush/builder.c \
crush/mapper.c \
@@ -1503,6 +1518,8 @@ crush_files = \
# this list ommits the ceph_ver.c file
libcommon_files = \
./ceph_ver.c \
+ arch/probe.cc \
+ arch/intel.c \
auth/AuthAuthorizeHandler.cc \
auth/AuthClientHandler.cc \
auth/AuthSessionHandler.cc \
@@ -1533,8 +1550,6 @@ libcommon_files = \
common/Timer.cc \
common/Finisher.cc \
common/environment.cc\
- common/sctp_crc32.c\
- common/crc32c-intel.c\
common/assert.cc \
common/run_cmd.cc \
common/WorkQueue.cc \
@@ -1609,6 +1624,16 @@ else
libcommon_files += perfglue/disabled_stubs.cc
endif
+# global
+libglobal_la_SOURCES = \
+ global/global_context.cc \
+ global/global_init.cc \
+ global/pidfile.cc \
+ global/signal_handler.cc
+libglobal_la_CFLAGS= ${CRYPTO_CFLAGS} ${AM_CFLAGS}
+libglobal_la_CXXFLAGS= ${AM_CXXFLAGS}
+libglobal_la_LIBADD= libcommon.la
+noinst_LTLIBRARIES += libglobal.la
libmon_a_SOURCES = \
@@ -1744,6 +1769,8 @@ python_PYTHON = pybind/rados.py \
# that autotools doesn't magically identify.
noinst_HEADERS = \
rados_sync.h \
+ arch/probe.h \
+ arch/intel.h \
auth/cephx/CephxAuthorizeHandler.h\
auth/cephx/CephxKeyServer.h\
auth/cephx/CephxProtocol.h\
@@ -1881,10 +1908,13 @@ noinst_HEADERS = \
common/ceph_crypto.h\
common/ceph_crypto_cms.h\
common/ceph_json.h\
+ common/crc32c_intel_baseline.h\
+ common/crc32c_intel_fast.h\
common/lru_map.h\
common/utf8.h\
common/mime.h\
common/pick_address.h\
+ common/sctp_crc32.h\
common/secret.h\
common/strtol.h\
common/static_assert.h\
diff --git a/src/arch/intel.c b/src/arch/intel.c
new file mode 100644
index 00000000000..0513da53c23
--- /dev/null
+++ b/src/arch/intel.c
@@ -0,0 +1,46 @@
+#include "arch/probe.h"
+
+/* flags we export */
+int ceph_arch_intel_sse42 = 0;
+
+
+/* this probably isn't specific enough for x86_64? fix me someday */
+#ifdef __LP64__
+
+/* intel cpu? */
+static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
+ unsigned int *edx)
+{
+ int id = *eax;
+
+ asm("movl %4, %%eax;"
+ "cpuid;"
+ "movl %%eax, %0;"
+ "movl %%ebx, %1;"
+ "movl %%ecx, %2;"
+ "movl %%edx, %3;"
+ : "=r" (*eax), "=r" (*ebx), "=r" (*ecx), "=r" (*edx)
+ : "r" (id)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+int ceph_arch_intel_probe(void)
+{
+ /* i know how to check this on x86_64... */
+ unsigned int eax = 1, ebx, ecx, edx;
+ do_cpuid(&eax, &ebx, &ecx, &edx);
+ if ((ecx & (1 << 20)) != 0) {
+ ceph_arch_intel_sse42 = 1;
+ }
+ return 0;
+}
+
+#else // __LP64__
+
+int ceph_arch_intel_probe(void)
+{
+ /* no features */
+ return 0;
+}
+
+#endif // __LP64__
diff --git a/src/arch/intel.h b/src/arch/intel.h
new file mode 100644
index 00000000000..aefb64eaa7f
--- /dev/null
+++ b/src/arch/intel.h
@@ -0,0 +1,16 @@
+#ifndef CEPH_ARCH_INTEL_H
+#define CEPH_ARCH_INTEL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int ceph_arch_intel_sse42; /* true if we have sse 4.2 features */
+
+extern int ceph_arch_intel_probe(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/arch/probe.cc b/src/arch/probe.cc
new file mode 100644
index 00000000000..9f8bc9d2d0f
--- /dev/null
+++ b/src/arch/probe.cc
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "arch/probe.h"
+
+#include "arch/intel.h"
+
+int ceph_arch_probe(void)
+{
+ if (ceph_arch_probed)
+ return 1;
+
+ ceph_arch_intel_probe();
+
+ ceph_arch_probed = 1;
+ return 1;
+}
+
+// do this once using the magic of c++.
+int ceph_arch_probed = ceph_arch_probe();
diff --git a/src/arch/probe.h b/src/arch/probe.h
new file mode 100644
index 00000000000..a789c4e864c
--- /dev/null
+++ b/src/arch/probe.h
@@ -0,0 +1,16 @@
+#ifndef CEPH_ARCH_PROBE_H
+#define CEPH_ARCH_PROBE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int ceph_arch_probed; /* non-zero if we've probed features */
+
+extern int ceph_arch_probe(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/auth/cephx/CephxKeyServer.cc b/src/auth/cephx/CephxKeyServer.cc
index e0c8174a2a1..e57b5575142 100644
--- a/src/auth/cephx/CephxKeyServer.cc
+++ b/src/auth/cephx/CephxKeyServer.cc
@@ -163,7 +163,7 @@ bool KeyServer::_check_rotating_secrets()
ldout(cct, 10) << __func__ << " added " << added << dendl;
data.rotating_ver++;
//data.next_rotating_time = ceph_clock_now(cct);
- //data.next_rotating_time += MIN(g_conf->auth_mon_ticket_ttl, g_conf->auth_service_ticket_ttl);
+ //data.next_rotating_time += MIN(cct->_conf->auth_mon_ticket_ttl, cct->_conf->auth_service_ticket_ttl);
_dump_rotating_secrets();
return true;
}
@@ -191,7 +191,7 @@ int KeyServer::_rotate_secret(uint32_t service_id)
RotatingSecrets& r = data.rotating_secrets[service_id];
int added = 0;
utime_t now = ceph_clock_now(cct);
- double ttl = service_id == CEPH_ENTITY_TYPE_AUTH ? g_conf->auth_mon_ticket_ttl : g_conf->auth_service_ticket_ttl;
+ double ttl = service_id == CEPH_ENTITY_TYPE_AUTH ? cct->_conf->auth_mon_ticket_ttl : cct->_conf->auth_service_ticket_ttl;
while (r.need_new_secrets(now)) {
ExpiringCryptoKey ek;
@@ -424,7 +424,7 @@ int KeyServer::_build_session_auth_info(uint32_t service_id, CephXServiceTicketI
{
info.service_id = service_id;
info.ticket = auth_ticket_info.ticket;
- info.ticket.init_timestamps(ceph_clock_now(cct), g_conf->auth_service_ticket_ttl);
+ info.ticket.init_timestamps(ceph_clock_now(cct), cct->_conf->auth_service_ticket_ttl);
generate_secret(info.session_key);
diff --git a/src/ceph-disk b/src/ceph-disk
index 0ecfdf02073..3d09bdf7418 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -671,6 +671,7 @@ def mount(
subprocess.check_call(
args=[
'mount',
+ '-t', fstype,
'-o', options,
'--',
dev,
@@ -850,9 +851,21 @@ def prepare_journal_dev(
journal,
],
)
+
+ # try to make sure the kernel refreshes the table. note
+ # that if this gets ebusy, we are probably racing with
+ # udev because it already updated it.. ignore failure here.
+ LOG.debug('Calling partprobe on prepared device %s', journal)
+ subprocess.call(
+ args=[
+ 'partprobe',
+ journal,
+ ],
+ )
+
+ # wait for udev event queue to clear
subprocess.call(
args=[
- # wait for udev event queue to clear
'udevadm',
'settle',
],
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index d8590bff817..dc6f435bdcf 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -465,6 +465,8 @@ int main(int argc, const char **argv)
register_async_signal_handler_oneshot(SIGINT, handle_osd_signal);
register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal);
+ osd->final_init();
+
if (g_conf->inject_early_sigterm)
kill(getpid(), SIGTERM);
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 58093d2de98..0c78557f041 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -553,9 +553,9 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
{
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
fuse_ino_t fino = cfuse->make_fake_ino(vino.ino, vino.snapid);
-#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
fuse_lowlevel_notify_inval_inode(cfuse->ch, fino, off, len);
#endif
}
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index b021651bd4d..f526f80c929 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -203,6 +203,7 @@ OPTION(mon_leveldb_max_open_files, OPT_INT, 0) // monitor's leveldb max open fil
OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compression
OPTION(mon_leveldb_paranoid, OPT_BOOL, false) // monitor's leveldb paranoid flag
OPTION(mon_leveldb_log, OPT_STR, "")
+OPTION(mon_leveldb_size_warn, OPT_U64, 40*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes)
OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state
OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long before proposing a map update
@@ -243,6 +244,7 @@ OPTION(client_readahead_max_periods, OPT_LONGLONG, 4) // as multiple of file la
OPTION(client_snapdir, OPT_STR, ".snap")
OPTION(client_mountpoint, OPT_STR, "/")
OPTION(client_notify_timeout, OPT_INT, 10) // in seconds
+OPTION(osd_client_watch_timeout, OPT_INT, 30) // in seconds
OPTION(client_caps_release_delay, OPT_INT, 5) // in seconds
OPTION(client_oc, OPT_BOOL, true)
OPTION(client_oc_size, OPT_INT, 1024*1024* 200) // MB * n
@@ -425,6 +427,11 @@ OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t())
OPTION(osd_heartbeat_interval, OPT_INT, 6) // (seconds) how often we ping peers
OPTION(osd_heartbeat_grace, OPT_INT, 20) // (seconds) how long before we decide a peer has failed
OPTION(osd_heartbeat_min_peers, OPT_INT, 10) // minimum number of peers
+
+// minimum number of peers tha tmust be reachable to mark ourselves
+// back up after being wrongly marked down.
+OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT, .33)
+
OPTION(osd_mon_heartbeat_interval, OPT_INT, 30) // (seconds) how often to ping monitor if no peers
OPTION(osd_mon_report_interval_max, OPT_INT, 120)
OPTION(osd_mon_report_interval_min, OPT_INT, 5) // pg stats, failures, up_thru, boot.
@@ -518,6 +525,7 @@ OPTION(osd_max_attr_size, OPT_U64, 65536)
OPTION(filestore, OPT_BOOL, false)
/// filestore wb throttle limits
+OPTION(filestore_wbthrottle_enable, OPT_BOOL, true)
OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64, 41943040)
OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64, 419430400)
OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64, 500)
diff --git a/src/common/crc32c-intel.c b/src/common/crc32c-intel.c
deleted file mode 100644
index 1c4689e3b76..00000000000
--- a/src/common/crc32c-intel.c
+++ /dev/null
@@ -1,113 +0,0 @@
-#include <inttypes.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-
-
-/* this probably isn't specific enough for x86_64? fix me someday */
-#ifdef __LP64__
-
-/*
- * * Based on a posting to lkml by Austin Zhang <austin.zhang@intel.com>
- * * Further based on the fio crc32c-intel.c implementation by Jens Axboe.
- * *
- * * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
- * * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
- * * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
- * * http://www.intel.com/products/processor/manuals/
- * * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * * Volume 2A: Instruction Set Reference, A-M
- * */
-
-#if BITS_PER_LONG == 64
-#define REX_PRE "0x48, "
-#define SCALE_F 8
-#else
-#define REX_PRE
-#define SCALE_F 4
-#endif
-
-static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data,
- unsigned length)
-{
- while (length--) {
- __asm__ __volatile__(
- ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
- :"=S"(crc)
- :"0"(crc), "c"(*data)
- );
- data++;
- }
-
- return crc;
-}
-
-uint32_t ceph_crc32c_le_intel(uint32_t crc, unsigned char const *data, unsigned length)
-{
- unsigned int iquotient = length / SCALE_F;
- unsigned int iremainder = length % SCALE_F;
-#if BITS_PER_LONG == 64
- uint64_t *ptmp = (uint64_t *) data;
-#else
- uint32_t *ptmp = (uint32_t *) data;
-#endif
-
- while (iquotient--) {
- __asm__ __volatile__(
- ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
- :"=S"(crc)
- :"0"(crc), "c"(*ptmp)
- );
- ptmp++;
- }
-
- if (iremainder)
- crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
- iremainder);
-
- return crc;
-}
-
-
-static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
- unsigned int *edx)
-{
- int id = *eax;
-
- asm("movl %4, %%eax;"
- "cpuid;"
- "movl %%eax, %0;"
- "movl %%ebx, %1;"
- "movl %%ecx, %2;"
- "movl %%edx, %3;"
- : "=r" (*eax), "=r" (*ebx), "=r" (*ecx), "=r" (*edx)
- : "r" (id)
- : "eax", "ebx", "ecx", "edx");
-}
-
-int ceph_have_crc32c_intel(void)
-{
- /* i know how to check this on x86_64... */
- unsigned int eax = 1, ebx, ecx, edx;
- do_cpuid(&eax, &ebx, &ecx, &edx);
- if ((ecx & (1 << 20)) != 0)
- return 1;
- return 0;
-}
-
-#else /* __LP64__ */
-
-uint32_t ceph_crc32c_le_intel(uint32_t crc, unsigned char const *data, unsigned length)
-{
- return 0; /* this shouldn't get called! */
-}
-
-int ceph_have_crc32c_intel(void)
-{
- return 0; /* clearly not x86_64 */
-}
-
-#endif
diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
new file mode 100644
index 00000000000..e2e81a42f45
--- /dev/null
+++ b/src/common/crc32c.cc
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/crc32c.h"
+
+#include "arch/probe.h"
+#include "arch/intel.h"
+#include "common/sctp_crc32.h"
+#include "common/crc32c_intel_baseline.h"
+#include "common/crc32c_intel_fast.h"
+
+/*
+ * choose best implementation based on the CPU architecture.
+ */
+ceph_crc32c_func_t ceph_choose_crc32(void)
+{
+ // make sure we've probed cpu features; this might depend on the
+ // link order of this file relative to arch/probe.cc.
+ ceph_arch_probe();
+
+ // if the CPU supports it, *and* the fast version is compiled in,
+ // use that.
+ if (ceph_arch_intel_sse42 && ceph_crc32c_intel_fast_exists()) {
+ return ceph_crc32c_intel_fast;
+ }
+
+ // default
+ return ceph_crc32c_sctp;
+}
+
+/*
+ * static global
+ *
+ * This is a bit of a no-no for shared libraries, but we don't care.
+ * It is effectively constant for the executing process as the value
+ * depends on the CPU architecture.
+ *
+ * We initialize it during program init using the magic of C++.
+ */
+ceph_crc32c_func_t ceph_crc32c_func = ceph_choose_crc32();
+
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c
new file mode 100644
index 00000000000..cfcfec624ae
--- /dev/null
+++ b/src/common/crc32c_intel_baseline.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2012-2013 Intel Corporation All Rights Reserved.
+ * All rights reserved.
+ *
+ * http://opensource.org/licenses/BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <stdlib.h>
+
+#define MAX_ITER 8
+
+unsigned long crc32_table_iscsi_base[256] = {
+ 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+ 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+ 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+ 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+ 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+ 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+ 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+ 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+ 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+ 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+ 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+ 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+ 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+ 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+ 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+ 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+ 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+ 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+ 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+ 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+ 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+ 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+ 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+ 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+ 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+ 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+ 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+ 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+ 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+ 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+ 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+ 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+ 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+ 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+ 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+ 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+ 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+ 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+ 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+ 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+ 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+ 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+ 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+ 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+ 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+ 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+ 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+ 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+ 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+ 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+ 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+ 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+ 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+ 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+ 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+ 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+ 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+ 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+ 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+ 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+ 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+ 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+ 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+ 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
+};
+
+
+// iSCSI CRC baseline function
+uint32_t ceph_crc32c_intel_baseline(uint32_t crc_init2, unsigned char const *buffer, unsigned len)
+{
+ unsigned int crc_init = crc_init2;
+ unsigned int crc;
+ unsigned char* p_buf;
+
+ p_buf = (unsigned char*)buffer;
+ unsigned char const * p_end = buffer + len;
+
+ crc = crc_init;
+
+ while(p_buf < (unsigned char *) p_end ){
+ crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF) ^ *p_buf++] ;
+ }
+ return crc;
+}
diff --git a/src/common/crc32c_intel_baseline.h b/src/common/crc32c_intel_baseline.h
new file mode 100644
index 00000000000..5b14ddfc07e
--- /dev/null
+++ b/src/common/crc32c_intel_baseline.h
@@ -0,0 +1,14 @@
+#ifndef CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+#define CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t ceph_crc32c_intel_baseline(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c
new file mode 100644
index 00000000000..10b3c1c5c27
--- /dev/null
+++ b/src/common/crc32c_intel_fast.c
@@ -0,0 +1,30 @@
+#include <inttypes.h>
+#include "acconfig.h"
+
+extern unsigned int crc32_iscsi_00(unsigned char const *buffer, int len, unsigned int crc);
+
+#ifdef WITH_GOOD_YASM_ELF64
+
+uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+ return crc32_iscsi_00(buffer, len, crc);
+}
+
+int ceph_crc32c_intel_fast_exists(void)
+{
+ return 1;
+}
+
+#else
+
+int ceph_crc32c_intel_fast_exists(void)
+{
+ return 0;
+}
+
+uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+ return 0;
+}
+
+#endif
diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h
new file mode 100644
index 00000000000..7a394a0b82c
--- /dev/null
+++ b/src/common/crc32c_intel_fast.h
@@ -0,0 +1,28 @@
+#ifndef CEPH_COMMON_CRC32C_INTEL_FAST_H
+#define CEPH_COMMON_CRC32C_INTEL_FAST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* is the fast version compiled in */
+extern int ceph_crc32c_intel_fast_exists(void);
+
+#ifdef __LP64__
+
+extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#else
+
+static inline uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+ return 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_fast_asm.S b/src/common/crc32c_intel_fast_asm.S
new file mode 100644
index 00000000000..4ca5d65032e
--- /dev/null
+++ b/src/common/crc32c_intel_fast_asm.S
@@ -0,0 +1,664 @@
+;
+; Copyright 2012-2013 Intel Corporation All Rights Reserved.
+; All rights reserved.
+;
+; http://opensource.org/licenses/BSD-3-Clause
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following
+; conditions are met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+; COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+; OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+; Function to compute iscsi CRC32 with table-based recombination
+; crc done "by 3" with block sizes 1920, 960, 480, 240
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks
+%macro crcB3 3
+%define %%bSize %1 ; 1/3 of buffer size
+%define %%td2 %2 ; table offset for crc0 (2/3 of buffer)
+%define %%td1 %3 ; table offset for crc1 (1/3 of buffer)
+
+%IF %%bSize=640
+ sub len, %%bSize*3
+ js %%crcB3_end ;; jump to next level if 3*blockSize > len
+%ELSE
+ cmp len, %%bSize*3
+ jnae %%crcB3_end ;; jump to next level if 3*blockSize > len
+%ENDIF
+ ;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
+%%crcB3_loop:
+ ;; rax = crc0 = initial crc
+ xor rbx, rbx ;; rbx = crc1 = 0;
+ xor r10, r10 ;; r10 = crc2 = 0;
+
+ %assign i 0
+ %rep %%bSize/8 - 1
+ crc32 rax, [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, [bufptmp+i + 1*%%bSize] ;; update crc1
+ crc32 r10, [bufptmp+i + 2*%%bSize] ;; update crc2
+ %assign i (i+8)
+ %endrep
+ crc32 rax, [bufptmp+i + 0*%%bSize] ;; update crc0
+ crc32 rbx, [bufptmp+i + 1*%%bSize] ;; update crc1
+; SKIP ;crc32 r10, [bufptmp+i + 2*%%bSize] ;; update crc2
+
+ ; merge in crc0
+ movzx bufp_dw, al
+ mov r9d, [crc_init + bufp*4 + %%td2]
+ movzx bufp_dw, ah
+ shr eax, 16
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ shl r11, 8
+ xor r9, r11
+
+ movzx bufp_dw, al
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ movzx bufp_dw, ah
+ shl r11, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td2]
+ shl r11, 24
+ xor r9, r11
+
+ ; merge in crc1
+
+ movzx bufp_dw, bl
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ movzx bufp_dw, bh
+ shr ebx, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ shl r11, 8
+ xor r9, r11
+
+ movzx bufp_dw, bl
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ movzx bufp_dw, bh
+ shl r11, 16
+ xor r9, r11
+ mov r11d, [crc_init + bufp*4 + %%td1]
+ shl r11, 24
+ xor r9, r11
+
+ xor r9, [bufptmp+i + 2*%%bSize]
+ crc32 r10, r9
+ mov rax, r10
+
+ add bufptmp, %%bSize*3 ;; move to next block
+ sub len, %%bSize*3
+%IF %%bSize=640
+ jns %%crcB3_loop
+%ENDIF
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%crcB3_end:
+%IF %%bSize=640
+ add len, %%bSize*3
+%ENDIF
+ je do_return ;; return if remaining data is zero
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ISCSI CRC 32 Implementation with crc32 Instruction
+
+;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;; *buf = rcx
+;;; len = rdx
+;;; crc_init = r8
+;;;
+
+global crc32_iscsi_00:function
+crc32_iscsi_00:
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define bufp rdi
+%define bufp_dw edi
+%define bufp_w di
+%define bufp_b dil
+%define bufptmp rcx
+%define block_0 rcx
+%define block_1 r8
+%define block_2 r11
+%define len rsi
+%define len_dw esi
+%define len_w si
+%define len_b sil
+%define crc_init rdx
+%define crc_init_dw edx
+%else
+%define bufp rcx
+%define bufp_dw ecx
+%define bufp_w cx
+%define bufp_b cl
+%define bufptmp rdi
+%define block_0 rdi
+%define block_1 rsi
+%define block_2 r11
+%define len rdx
+%define len_dw edx
+%define len_w dx
+%define len_b dl
+%define crc_init r8
+%define crc_init_dw r8d
+%endif
+
+
+ push rdi
+ push rbx
+
+ mov rax, crc_init ;; rax = crc_init;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov bufptmp, bufp ;; rdi = *buf
+ neg bufp
+ and bufp, 7 ;; calculate the unalignment
+ ;; amount of the address
+ je proc_block ;; Skip if aligned
+
+ cmp len, 8
+ jb less_than_8
+
+ ;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;;
+ mov rbx, [bufptmp] ;; load a quadword from the buffer
+ add bufptmp, bufp ;; align buffer pointer for
+ ;; quadword processing
+ sub len, bufp ;; update buffer length
+align_loop:
+ crc32 eax, bl ;; compute crc32 of 1-byte
+ shr rbx, 8 ;; get next byte
+ dec bufp
+ jne align_loop
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+proc_block:
+ cmp len, 240
+ jb bit8
+
+ lea crc_init, [mul_table_72 wrt rip] ;; load table base address
+
+ crcB3 640, 0x1000, 0x0c00 ; 640*3 = 1920 (Tables 1280, 640)
+ crcB3 320, 0x0c00, 0x0800 ; 320*3 = 960 (Tables 640, 320)
+ crcB3 160, 0x0800, 0x0400 ; 160*3 = 480 (Tables 320, 160)
+ crcB3 80, 0x0400, 0x0000 ; 80*3 = 240 (Tables 160, 80)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of rdx are full)
+
+bit8:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit7 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 16
+ crc32 rax, [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 128 ;; buf +=64; (next 64 bytes)
+
+bit7:
+ shl len_b, 1 ;; shift-out MSB (bit-7)
+ jnc bit6 ;; jump to bit-6 if bit-7 == 0
+ %assign i 0
+ %rep 8
+ crc32 rax, [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 64 ;; buf +=64; (next 64 bytes)
+bit6:
+ shl len_b, 1 ;; shift-out MSB (bit-6)
+ jnc bit5 ;; jump to bit-5 if bit-6 == 0
+ %assign i 0
+ %rep 4
+ crc32 rax, [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 32 ;; buf +=32; (next 32 bytes)
+bit5:
+ shl len_b, 1 ;; shift-out MSB (bit-5)
+ jnc bit4 ;; jump to bit-4 if bit-5 == 0
+ %assign i 0
+ %rep 2
+ crc32 rax, [bufptmp+i] ;; compute crc32 of 8-byte data
+ %assign i (i+8)
+ %endrep
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 16 ;; buf +=16; (next 16 bytes)
+bit4:
+ shl len_b, 1 ;; shift-out MSB (bit-4)
+ jnc bit3 ;; jump to bit-3 if bit-4 == 0
+ crc32 rax, [bufptmp] ;; compute crc32 of 8-byte data
+ je do_return ;; return if remaining data is zero
+ add bufptmp, 8 ;; buf +=8; (next 8 bytes)
+bit3:
+ mov rbx, [bufptmp] ;; load a 8-bytes from the buffer:
+ shl len_b, 1 ;; shift-out MSB (bit-3)
+ jnc bit2 ;; jump to bit-2 if bit-3 == 0
+ crc32 eax, ebx ;; compute crc32 of 4-byte data
+ je do_return ;; return if remaining data is zero
+ shr rbx, 32 ;; get next 3 bytes
+bit2:
+ shl len_b, 1 ;; shift-out MSB (bit-2)
+ jnc bit1 ;; jump to bit-1 if bit-2 == 0
+ crc32 eax, bx ;; compute crc32 of 2-byte data
+ je do_return ;; return if remaining data is zero
+ shr rbx, 16 ;; next byte
+bit1:
+ test len_b,len_b
+ je do_return
+ crc32 eax, bl ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return:
+
+ pop rbx
+ pop rdi
+ ret
+
+less_than_8:
+ test len,4
+ jz less_than_4
+ crc32 eax, dword[bufptmp]
+ add bufptmp,4
+less_than_4:
+ test len,2
+ jz less_than_2
+ crc32 eax, word[bufptmp]
+ add bufptmp,2
+less_than_2:
+ test len,1
+ jz do_return
+ crc32 rax, byte[bufptmp]
+ pop rbx
+ pop bufptmp
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
+
+section .data
+align 8
+mul_table_72:
+DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
+DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
+DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
+DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
+DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
+DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
+DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
+DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
+DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
+DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
+DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
+DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
+DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
+DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
+DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
+DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
+DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
+DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
+DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
+DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
+DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
+DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
+DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
+DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
+DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
+DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
+DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
+DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
+DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
+DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
+DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
+DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
+DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
+DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
+DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
+DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
+DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
+DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
+DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
+DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
+DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
+DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
+DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
+DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
+DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
+DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
+DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
+DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
+DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
+DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
+DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
+DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
+DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
+DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
+DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
+DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
+DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
+DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
+DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
+DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
+DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
+DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
+DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
+DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
+
+mul_table_152:
+DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
+DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
+DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
+DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
+DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
+DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
+DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
+DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
+DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
+DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
+DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
+DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
+DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
+DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
+DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
+DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
+DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
+DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
+DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
+DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
+DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
+DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
+DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
+DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
+DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
+DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
+DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
+DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
+DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
+DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
+DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
+DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
+DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
+DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
+DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
+DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
+DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
+DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
+DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
+DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
+DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
+DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
+DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
+DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
+DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
+DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
+DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
+DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
+DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
+DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
+DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
+DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
+DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
+DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
+DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
+DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
+DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
+DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
+DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
+DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
+DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
+DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
+DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
+DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
+
+mul_table_312:
+DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
+DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
+DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
+DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
+DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
+DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
+DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
+DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
+DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
+DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
+DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
+DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
+DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
+DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
+DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
+DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
+DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
+DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
+DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
+DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
+DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
+DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
+DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
+DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
+DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
+DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
+DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
+DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
+DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
+DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
+DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
+DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
+DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
+DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
+DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
+DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
+DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
+DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
+DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
+DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
+DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
+DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
+DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
+DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
+DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
+DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
+DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
+DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
+DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
+DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
+DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
+DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
+DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
+DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
+DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
+DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
+DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
+DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
+DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
+DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
+DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
+DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
+DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
+DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
+
+mul_table_632:
+DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
+DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
+DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
+DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
+DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
+DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
+DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
+DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
+DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
+DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
+DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
+DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
+DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
+DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
+DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
+DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
+DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
+DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
+DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
+DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
+DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
+DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
+DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
+DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
+DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
+DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
+DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
+DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
+DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
+DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
+DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
+DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
+DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
+DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
+DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
+DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
+DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
+DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
+DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
+DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
+DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
+DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
+DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
+DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
+DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
+DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
+DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
+DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
+DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
+DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
+DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
+DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
+DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
+DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
+DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
+DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
+DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
+DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
+DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
+DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
+DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
+DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
+DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
+DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
+
+mul_table_1272:
+DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
+DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
+DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
+DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
+DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
+DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
+DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
+DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
+DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
+DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
+DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
+DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
+DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
+DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
+DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
+DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
+DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
+DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
+DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
+DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
+DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
+DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
+DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
+DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
+DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
+DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
+DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
+DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
+DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
+DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
+DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
+DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
+DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
+DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
+DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
+DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
+DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
+DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
+DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
+DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
+DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
+DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
+DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
+DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
+DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
+DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
+DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
+DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
+DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
+DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
+DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
+DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
+DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
+DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
+DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
+DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
+DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
+DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
+DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
+DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
+DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
+DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
+DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
+DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion crc32_iscsi_00, 00, 02, 0014
diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c
index b11bb48dd87..7e2678a2b7c 100644
--- a/src/common/sctp_crc32.c
+++ b/src/common/sctp_crc32.c
@@ -728,7 +728,7 @@ sctp_csum_finalize(uint32_t crc32c)
}
#endif
-uint32_t ceph_crc32c_le_generic(uint32_t crc, unsigned char const *data, unsigned length)
+uint32_t ceph_crc32c_sctp(uint32_t crc, unsigned char const *data, unsigned length)
{
return update_crc32(crc, data, length);
}
diff --git a/src/common/sctp_crc32.h b/src/common/sctp_crc32.h
new file mode 100644
index 00000000000..92d20bcb7cc
--- /dev/null
+++ b/src/common/sctp_crc32.h
@@ -0,0 +1,14 @@
+#ifndef CEPH_COMMON_SCTP_CRC32_H
+#define CEPH_COMMON_SCTP_CRC32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t ceph_crc32c_sctp(uint32_t crc, unsigned char const *data, unsigned length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/sharedptr_registry.hpp b/src/common/sharedptr_registry.hpp
index a62aa0d9ce3..90043001ee7 100644
--- a/src/common/sharedptr_registry.hpp
+++ b/src/common/sharedptr_registry.hpp
@@ -58,6 +58,31 @@ public:
lock("SharedPtrRegistry::lock")
{}
+ bool empty() {
+ Mutex::Locker l(lock);
+ return contents.empty();
+ }
+
+ bool get_next(const K &key, pair<K, VPtr> *next) {
+ pair<K, VPtr> r;
+ {
+ Mutex::Locker l(lock);
+ VPtr next_val;
+ typename map<K, WeakVPtr>::iterator i = contents.upper_bound(key);
+ while (i != contents.end() &&
+ !(next_val = i->second.lock()))
+ ++i;
+ if (i == contents.end())
+ return false;
+ if (next)
+ r = make_pair(i->first, next_val);
+ }
+ if (next)
+ *next = r;
+ return true;
+ }
+
+
bool get_next(const K &key, pair<K, V> *next) {
VPtr next_val;
Mutex::Locker l(lock);
diff --git a/src/gtest/.gitignore b/src/gtest/.gitignore
index 5dc4299f8fe..251055769fc 100644
--- a/src/gtest/.gitignore
+++ b/src/gtest/.gitignore
@@ -2,4 +2,5 @@ fused-src
/scripts/gtest-config
/build-aux/config.h.in
/build-aux/config.h
+/build-aux/test-driver
/lib/
diff --git a/src/include/buffer.h b/src/include/buffer.h
index 8c4dfb56e17..8e637d658c5 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -425,7 +425,7 @@ public:
it != _buffers.end();
++it)
if (it->length())
- crc = ceph_crc32c_le(crc, (unsigned char*)it->c_str(), it->length());
+ crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
return crc;
}
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index 3fd209efb02..d5f7388be56 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -1,25 +1,25 @@
#ifndef CEPH_CRC32C_H
#define CEPH_CRC32C_H
-#ifdef __cplusplus
-extern "C" {
-#endif
-
+#include "include/inttypes.h"
#include <string.h>
-extern int ceph_have_crc32c_intel(void);
-extern uint32_t ceph_crc32c_le_generic(uint32_t crc, unsigned char const *data, unsigned length);
-extern uint32_t ceph_crc32c_le_intel(uint32_t crc, unsigned char const *data, unsigned length);
+typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
-static inline uint32_t ceph_crc32c_le(uint32_t crc, unsigned char const *data, unsigned length) {
- if (ceph_have_crc32c_intel()) //__builtin_cpu_supports("sse4.2"))
- return ceph_crc32c_le_intel(crc, data, length);
- else
- return ceph_crc32c_le_generic(crc, data, length);
-}
+/*
+ * this is a static global with the chosen crc32c implementation for
+ * the given architecture.
+ */
+extern ceph_crc32c_func_t ceph_crc32c_func;
-#ifdef __cplusplus
+extern ceph_crc32c_func_t ceph_choose_crc32(void);
+
+/*
+ * common entry point; use this!
+ */
+static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
+{
+ return ceph_crc32c_func(crc, data, length);
}
-#endif
#endif
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 7d003e6370c..3a404a46c6f 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -34,6 +34,10 @@ usage_exit() {
exit
}
+# behave if we are not completely installed (e.g., Debian "removed,
+# config remains" state)
+test -f $LIBDIR/ceph_common.sh || exit 0
+
. $LIBDIR/ceph_common.sh
EXIT_STATUS=0
diff --git a/src/init-rbdmap b/src/init-rbdmap
index 54554ae40d7..e04424fcd78 100755
--- a/src/init-rbdmap
+++ b/src/init-rbdmap
@@ -68,7 +68,7 @@ do_unmap() {
umount $MNT
done
# Unmap all rbd device
- if [ -b /dev/rbd[0-9]* ]; then
+ if ls /dev/rbd[0-9]* >/dev/null 2>&1; then
for DEV in /dev/rbd[0-9]*; do
log_progress_msg $DEV
rbd unmap $DEV
diff --git a/src/json_spirit/json_spirit_writer_template.h b/src/json_spirit/json_spirit_writer_template.h
index 61a0e18c2b4..c66037e1f9f 100644
--- a/src/json_spirit/json_spirit_writer_template.h
+++ b/src/json_spirit/json_spirit_writer_template.h
@@ -34,8 +34,6 @@ namespace json_spirit
template< class String_type >
String_type non_printable_to_string( unsigned int c )
{
- typedef typename String_type::value_type Char_type;
-
String_type result( 6, '\\' );
result[1] = 'u';
diff --git a/src/librados-config.cc b/src/librados-config.cc
index a0a064fd1d7..ffe758129b8 100644
--- a/src/librados-config.cc
+++ b/src/librados-config.cc
@@ -42,7 +42,8 @@ int main(int argc, const char **argv)
bool opt_version = false;
bool opt_vernum = false;
- global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+ global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
common_init_finish(g_ceph_context);
for (std::vector<const char*>::iterator i = args.begin();
diff --git a/src/librados/AioCompletionImpl.h b/src/librados/AioCompletionImpl.h
index 4243bcd2298..63a56db8aa8 100644
--- a/src/librados/AioCompletionImpl.h
+++ b/src/librados/AioCompletionImpl.h
@@ -35,7 +35,7 @@ struct librados::AioCompletionImpl {
version_t objver;
rados_callback_t callback_complete, callback_safe;
- void *callback_arg;
+ void *callback_complete_arg, *callback_safe_arg;
// for read
bool is_read;
@@ -50,21 +50,24 @@ struct librados::AioCompletionImpl {
AioCompletionImpl() : lock("AioCompletionImpl lock", false, false),
ref(1), rval(0), released(false), ack(false), safe(false),
objver(0),
- callback_complete(0), callback_safe(0), callback_arg(0),
+ callback_complete(0),
+ callback_safe(0),
+ callback_complete_arg(0),
+ callback_safe_arg(0),
is_read(false), pbl(0), buf(0), maxlen(0),
io(NULL), aio_write_seq(0), aio_write_list_item(this) { }
int set_complete_callback(void *cb_arg, rados_callback_t cb) {
lock.Lock();
callback_complete = cb;
- callback_arg = cb_arg;
+ callback_complete_arg = cb_arg;
lock.Unlock();
return 0;
}
int set_safe_callback(void *cb_arg, rados_callback_t cb) {
lock.Lock();
callback_safe = cb;
- callback_arg = cb_arg;
+ callback_safe_arg = cb_arg;
lock.Unlock();
return 0;
}
@@ -172,7 +175,7 @@ struct C_AioComplete : public Context {
void finish(int r) {
rados_callback_t cb = c->callback_complete;
- void *cb_arg = c->callback_arg;
+ void *cb_arg = c->callback_complete_arg;
cb(c, cb_arg);
c->lock.Lock();
@@ -191,7 +194,7 @@ struct C_AioSafe : public Context {
void finish(int r) {
rados_callback_t cb = c->callback_safe;
- void *cb_arg = c->callback_arg;
+ void *cb_arg = c->callback_safe_arg;
cb(c, cb_arg);
c->lock.Lock();
@@ -223,13 +226,14 @@ struct C_AioCompleteAndSafe : public Context {
c->safe = true;
c->lock.Unlock();
rados_callback_t cb_complete = c->callback_complete;
- void *cb_arg = c->callback_arg;
+ void *cb_complete_arg = c->callback_complete_arg;
if (cb_complete)
- cb_complete(c, cb_arg);
+ cb_complete(c, cb_complete_arg);
rados_callback_t cb_safe = c->callback_safe;
+ void *cb_safe_arg = c->callback_safe_arg;
if (cb_safe)
- cb_safe(c, cb_arg);
+ cb_safe(c, cb_safe_arg);
c->lock.Lock();
c->callback_complete = NULL;
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 9c9ae16dfa4..abc6ff92a28 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -2845,7 +2845,7 @@ reprotect_and_return_err:
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "aio_write " << ictx << " off = " << off << " len = "
- << len << " buf = " << &buf << dendl;
+ << len << " buf = " << (void*)buf << dendl;
if (!len)
return 0;
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 898dcd39f48..86b380f2827 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -7935,7 +7935,7 @@ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err
inode_backtrace_t backtrace;
if (err == 0) {
::decode(backtrace, bl);
- if (backtrace.pool != info.pool) {
+ if (backtrace.pool != info.pool && backtrace.pool != -1) {
dout(10) << " old object in pool " << info.pool
<< ", retrying pool " << backtrace.pool << dendl;
info.pool = backtrace.pool;
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
index e83c5ee23a0..5e329afafb7 100644
--- a/src/mds/flock.cc
+++ b/src/mds/flock.cc
@@ -75,12 +75,14 @@ bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
} else {
//yay, we can insert a shared lock
dout(15) << "inserting shared lock" << dendl;
+ remove_waiting(new_lock);
adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
held_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
ret = true;
}
}
} else { //no overlapping locks except our own
+ remove_waiting(new_lock);
adjust_locks(self_overlapping_locks, new_lock, neighbor_locks);
dout(15) << "no conflicts, inserting " << new_lock << dendl;
held_locks.insert(pair<uint64_t, ceph_filelock>
@@ -89,7 +91,6 @@ bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
}
if (ret) {
++client_held_lock_counts[(client_t)new_lock.client];
- remove_waiting(new_lock);
}
else if (wait_on_fail && !replay)
++client_waiting_lock_counts[(client_t)new_lock.client];
@@ -306,7 +307,7 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
old_lock = &(*iter)->second;
old_lock_client = old_lock->client;
dout(15) << "lock to coalesce: " << *old_lock << dendl;
- /* because if it's a neibhoring lock there can't be any self-overlapping
+ /* because if it's a neighboring lock there can't be any self-overlapping
locks that covered it */
if (old_lock->type == new_lock.type) { //merge them
if (0 == new_lock.length) {
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index 6e8aa313a36..5fc745ce11d 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -81,6 +81,18 @@ health_status_t DataHealthService::get_health(
health_detail = "low disk space!";
}
+ if (stats.store_stats.bytes_total >= g_conf->mon_leveldb_size_warn) {
+ if (health_status > HEALTH_WARN)
+ health_status = HEALTH_WARN;
+ if (!health_detail.empty())
+ health_detail.append("; ");
+ stringstream ss;
+ ss << "store is getting too big! "
+ << prettybyte_t(stats.store_stats.bytes_total)
+ << " >= " << prettybyte_t(g_conf->mon_leveldb_size_warn);
+ health_detail.append(ss.str());
+ }
+
if (overall_status > health_status)
overall_status = health_status;
@@ -95,18 +107,15 @@ health_status_t DataHealthService::get_health(
if (f) {
f->open_object_section("mon");
f->dump_string("name", mon_name.c_str());
- f->dump_int("kb_total", stats.kb_total);
- f->dump_int("kb_used", stats.kb_used);
- f->dump_int("kb_avail", stats.kb_avail);
- f->dump_int("avail_percent", stats.latest_avail_percent);
- f->dump_stream("last_updated") << stats.last_update;
+ // leave this unenclosed by an object section to avoid breaking backward-compatibility
+ stats.dump(f);
f->dump_stream("health") << health_status;
if (health_status != HEALTH_OK)
- f->dump_string("health_detail", health_detail);
+ f->dump_string("health_detail", health_detail);
f->close_section();
}
}
-
+
if (f) {
f->close_section(); // mons
f->close_section(); // data_health
@@ -115,6 +124,22 @@ health_status_t DataHealthService::get_health(
return overall_status;
}
+int DataHealthService::update_store_stats(DataStats &ours)
+{
+ map<string,uint64_t> extra;
+ uint64_t store_size = mon->store->get_estimated_size(extra);
+ assert(store_size > 0);
+
+ ours.store_stats.bytes_total = store_size;
+ ours.store_stats.bytes_sst = extra["sst"];
+ ours.store_stats.bytes_log = extra["log"];
+ ours.store_stats.bytes_misc = extra["misc"];
+ ours.last_update = ceph_clock_now(g_ceph_context);
+
+ return 0;
+}
+
+
int DataHealthService::update_stats()
{
struct statfs stbuf;
@@ -135,7 +160,8 @@ int DataHealthService::update_stats()
<< " total " << ours.kb_total << " used " << ours.kb_used << " avail " << ours.kb_avail
<< dendl;
ours.last_update = ceph_clock_now(g_ceph_context);
- return 0;
+
+ return update_store_stats(ours);
}
void DataHealthService::share_stats()
diff --git a/src/mon/DataHealthService.h b/src/mon/DataHealthService.h
index 337e7a450f7..750c58e5f80 100644
--- a/src/mon/DataHealthService.h
+++ b/src/mon/DataHealthService.h
@@ -34,6 +34,7 @@ class DataHealthService :
int last_warned_percent;
void handle_tell(MMonHealth *m);
+ int update_store_stats(DataStats &ours);
int update_stats();
void share_stats();
diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc
index d8bccce9bc2..644d614bdf9 100644
--- a/src/mon/MonCap.cc
+++ b/src/mon/MonCap.cc
@@ -346,7 +346,7 @@ struct MonCapParser : qi::grammar<Iterator, MonCap()>
quoted_string %=
lexeme['"' >> +(char_ - '"') >> '"'] |
lexeme['\'' >> +(char_ - '\'') >> '\''];
- unquoted_word %= +char_("a-zA-Z0-9_-");
+ unquoted_word %= +char_("a-zA-Z0-9_.-");
str %= quoted_string | unquoted_word;
spaces = +lit(' ');
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 8d85e03ed99..ec1ee71c9e1 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -111,7 +111,7 @@ COMMAND("pg getmap", "get binary pg map to -o/stdout", "pg", "r", "cli,rest")
COMMAND("pg send_pg_creates", "trigger pg creates to be issued",\
"pg", "rw", "cli,rest")
COMMAND("pg dump " \
- "name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs|pgs_brief,n=N,req=false", \
+ "name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \
"show human-readable versions of pg map", "pg", "r", "cli,rest")
COMMAND("pg dump_json " \
"name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 61e2a2aa57a..45ca02027fc 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -1878,7 +1878,6 @@ bool Monitor::_allowed_command(MonSession *s, string &module, string &prefix,
MonCommand *this_cmd = NULL;
for (MonCommand *cp = mon_commands;
cp < &mon_commands[ARRAY_SIZE(mon_commands)]; cp++) {
- dout(0) << __func__ << " CAPSBAR >> matching against " << cp->cmdstring << dendl;
if (cp->cmdstring.find(prefix) != string::npos) {
this_cmd = cp;
break;
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 276620f7516..85f6c895145 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -509,6 +509,10 @@ class MonitorDBStore
db->compact_prefix(prefix);
}
+ uint64_t get_estimated_size(map<string, uint64_t> &extras) {
+ return db->get_estimated_size(extras);
+ }
+
MonitorDBStore(const string& path) :
db(0), do_dump(false), dump_fd(-1) {
string::const_reverse_iterator rit;
@@ -523,8 +527,8 @@ class MonitorDBStore
LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, full_path);
if (!db_ptr) {
- std::cout << __func__ << " error initializing level db back storage in "
- << full_path << std::endl;
+ derr << __func__ << " error initializing level db back storage in "
+ << full_path << dendl;
assert(0 != "MonitorDBStore: error initializing level db back storage");
}
db.reset(db_ptr);
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 40d84e6a5a5..e9a35c6b8ab 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -513,13 +513,18 @@ void PGMap::dump_basic(Formatter *f) const
pg_sum.dump(f);
f->close_section();
- f->open_object_section("pg_stats_delta");
- pg_sum_delta.dump(f);
- f->close_section();
-
f->open_object_section("osd_stats_sum");
osd_sum.dump(f);
f->close_section();
+
+ dump_delta(f);
+}
+
+void PGMap::dump_delta(Formatter *f) const
+{
+ f->open_object_section("pg_stats_delta");
+ pg_sum_delta.dump(f);
+ f->close_section();
}
void PGMap::dump_pg_stats(Formatter *f, bool brief) const
@@ -849,9 +854,9 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
f->dump_unsigned("version", version);
f->dump_unsigned("num_pgs", pg_stat.size());
f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
- f->dump_unsigned("bytes_used", osd_sum.kb_used * 4096ull);
- f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 4096ull);
- f->dump_unsigned("bytes_total", osd_sum.kb * 4096ull);
+ f->dump_unsigned("bytes_used", osd_sum.kb_used * 1024ull);
+ f->dump_unsigned("bytes_avail", osd_sum.kb_avail * 1024ull);
+ f->dump_unsigned("bytes_total", osd_sum.kb * 1024ull);
} else {
*out << " pgmap v" << version << ": "
<< pg_stat.size() << " pgs, " << pg_pool_sum.size() << " pools, "
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 00aa01ed07b..84d89f87517 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -158,6 +158,7 @@ public:
void dump_pg_stats(Formatter *f, bool brief) const;
void dump_pool_stats(Formatter *f) const;
void dump_osd_stats(Formatter *f) const;
+ void dump_delta(Formatter *f) const;
void dump_pg_stats_plain(ostream& ss,
const hash_map<pg_t, pg_stat_t>& pg_stats) const;
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index bb5f447a4e3..2a677be61d9 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1406,6 +1406,11 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
if (what.count("pgs_brief")) {
pg_map.dump_pg_stats(f.get(), true);
}
+ if (what.count("delta")) {
+ f->open_object_section("delta");
+ pg_map.dump_delta(f.get());
+ f->close_section();
+ }
}
f->flush(ds);
} else {
@@ -1423,7 +1428,6 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
int64_t(g_conf->mon_pg_stuck_threshold));
- boost::scoped_ptr<Formatter> f(new_formatter("json"));
r = dump_stuck_pg_stats(ds, f.get(), (int)threshold, stuckop_vec);
ss << "ok";
r = 0;
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 445413da13b..495268ff9ee 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -282,10 +282,11 @@ void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed,
* be. All all this is done tightly wrapped in a transaction to ensure we
* enjoy the atomicity guarantees given by our awesome k/v store.
*/
-void Paxos::store_state(MMonPaxos *m)
+bool Paxos::store_state(MMonPaxos *m)
{
MonitorDBStore::Transaction t;
map<version_t,bufferlist>::iterator start = m->values.begin();
+ bool changed = false;
// build map of values to store
// we want to write the range [last_committed, m->last_committed] only.
@@ -327,6 +328,15 @@ void Paxos::store_state(MMonPaxos *m)
// apply.
decode_append_transaction(t, it->second);
}
+
+ // discard obsolete uncommitted value?
+ if (uncommitted_v && uncommitted_v <= last_committed) {
+ dout(10) << " forgetting obsolete uncommitted value " << uncommitted_v
+ << " pn " << uncommitted_pn << dendl;
+ uncommitted_v = 0;
+ uncommitted_pn = 0;
+ uncommitted_value.clear();
+ }
}
if (!t.empty()) {
dout(30) << __func__ << " transaction dump:\n";
@@ -341,9 +351,12 @@ void Paxos::store_state(MMonPaxos *m)
first_committed = get_store()->get(get_name(), "first_committed");
_sanity_check_store();
+ changed = true;
}
remove_legacy_versions();
+
+ return changed;
}
void Paxos::remove_legacy_versions()
@@ -371,6 +384,8 @@ void Paxos::_sanity_check_store()
// leader
void Paxos::handle_last(MMonPaxos *last)
{
+ bool need_refresh = false;
+
dout(10) << "handle_last " << *last << dendl;
if (!mon->is_leader()) {
@@ -397,7 +412,7 @@ void Paxos::handle_last(MMonPaxos *last)
assert(g_conf->paxos_kill_at != 1);
// store any committed values if any are specified in the message
- store_state(last);
+ need_refresh = store_state(last);
assert(g_conf->paxos_kill_at != 2);
@@ -419,7 +434,7 @@ void Paxos::handle_last(MMonPaxos *last)
// did this person send back an accepted but uncommitted value?
if (last->uncommitted_pn) {
- if (last->uncommitted_pn > uncommitted_pn &&
+ if (last->uncommitted_pn >= uncommitted_pn &&
last->last_committed >= last_committed &&
last->last_committed + 1 >= uncommitted_v) {
uncommitted_v = last->last_committed+1;
@@ -473,6 +488,7 @@ void Paxos::handle_last(MMonPaxos *last)
dout(10) << "that's everyone. active!" << dendl;
extend_lease();
+ need_refresh = false;
if (do_refresh()) {
finish_round();
@@ -487,6 +503,9 @@ void Paxos::handle_last(MMonPaxos *last)
dout(10) << "old pn, ignoring" << dendl;
}
+ if (need_refresh)
+ (void)do_refresh();
+
last->put();
}
@@ -793,17 +812,11 @@ void Paxos::handle_commit(MMonPaxos *commit)
store_state(commit);
- commit->put();
-
- bool need_bootstrap = false;
- mon->refresh_from_paxos(&need_bootstrap);
- if (need_bootstrap) {
- dout(10) << " doing requested bootstrap" << dendl;
- mon->bootstrap();
- return;
+ if (do_refresh()) {
+ finish_contexts(g_ceph_context, waiting_for_commit);
}
- finish_contexts(g_ceph_context, waiting_for_commit);
+ commit->put();
}
void Paxos::extend_lease()
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 69419e64ab9..1bd3a3c1f51 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -1095,8 +1095,9 @@ public:
* enjoy the atomicity guarantees given by our awesome k/v store.
*
* @param m A message
+ * @returns true if we stored something new; false otherwise
*/
- void store_state(MMonPaxos *m);
+ bool store_state(MMonPaxos *m);
void _sanity_check_store();
/**
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index 0eae3b172bf..0ae1aaf8d5e 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -40,6 +40,52 @@ inline const char *get_paxos_name(int p) {
#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012"
+/**
+ * leveldb store stats
+ *
+ * If we ever decide to support multiple backends for the monitor store,
+ * we should then create an abstract class 'MonitorStoreStats' of sorts
+ * and inherit it on LevelDBStoreStats. I'm sure you'll figure something
+ * out.
+ */
+struct LevelDBStoreStats {
+ uint64_t bytes_total;
+ uint64_t bytes_sst;
+ uint64_t bytes_log;
+ uint64_t bytes_misc;
+ utime_t last_update;
+
+ void dump(Formatter *f) const {
+ assert(f != NULL);
+ f->dump_int("bytes_total", bytes_total);
+ f->dump_int("bytes_sst", bytes_sst);
+ f->dump_int("bytes_log", bytes_log);
+ f->dump_int("bytes_misc", bytes_misc);
+ f->dump_stream("last_updated") << last_update;
+ }
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(bytes_total, bl);
+ ::encode(bytes_sst, bl);
+ ::encode(bytes_log, bl);
+ ::encode(bytes_misc, bl);
+ ::encode(last_update, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &p) {
+ DECODE_START(1, p);
+ ::decode(bytes_total, p);
+ ::decode(bytes_sst, p);
+ ::decode(bytes_log, p);
+ ::decode(bytes_misc, p);
+ ::decode(last_update, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(LevelDBStoreStats);
+
// data stats
struct DataStats {
@@ -50,13 +96,29 @@ struct DataStats {
int latest_avail_percent;
utime_t last_update;
+ LevelDBStoreStats store_stats;
+
+ void dump(Formatter *f) const {
+ assert(f != NULL);
+ f->dump_int("kb_total", kb_total);
+ f->dump_int("kb_used", kb_used);
+ f->dump_int("kb_avail", kb_avail);
+ f->dump_int("avail_percent", latest_avail_percent);
+ f->dump_stream("last_updated") << last_update;
+
+ f->open_object_section("store_stats");
+ store_stats.dump(f);
+ f->close_section();
+ }
+
void encode(bufferlist &bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(kb_total, bl);
::encode(kb_used, bl);
::encode(kb_avail, bl);
::encode(latest_avail_percent, bl);
::encode(last_update, bl);
+ ::encode(store_stats, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &p) {
@@ -66,10 +128,12 @@ struct DataStats {
::decode(kb_avail, p);
::decode(latest_avail_percent, p);
::decode(last_update, p);
+ if (struct_v > 1)
+ ::decode(store_stats, p);
+
DECODE_FINISH(p);
}
};
-
WRITE_CLASS_ENCODER(DataStats);
struct ScrubResult {
diff --git a/src/msg/Message.h b/src/msg/Message.h
index 3ed8ee667d2..f345e7adaab 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -449,8 +449,8 @@ public:
const utime_t& get_recv_complete_stamp() const { return recv_complete_stamp; }
void calc_header_crc() {
- header.crc = ceph_crc32c_le(0, (unsigned char*)&header,
- sizeof(header) - sizeof(header.crc));
+ header.crc = ceph_crc32c(0, (unsigned char*)&header,
+ sizeof(header) - sizeof(header.crc));
}
void calc_front_crc() {
footer.front_crc = payload.crc32c(0);
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index 6f271c812f3..50656fee53b 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -1684,7 +1684,7 @@ int Pipe::read_message(Message **pm)
if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) {
if (tcp_read((char*)&header, sizeof(header)) < 0)
return -1;
- header_crc = ceph_crc32c_le(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
+ header_crc = ceph_crc32c(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
} else {
ceph_msg_header_old oldheader;
if (tcp_read((char*)&oldheader, sizeof(oldheader)) < 0)
@@ -1694,7 +1694,7 @@ int Pipe::read_message(Message **pm)
header.src = oldheader.src.name;
header.reserved = oldheader.reserved;
header.crc = oldheader.crc;
- header_crc = ceph_crc32c_le(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
+ header_crc = ceph_crc32c(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
}
ldout(msgr->cct,20) << "reader got envelope type=" << header.type
@@ -2028,8 +2028,8 @@ int Pipe::write_message(ceph_msg_header& header, ceph_msg_footer& footer, buffer
oldheader.src.addr = connection_state->get_peer_addr();
oldheader.orig_src = oldheader.src;
oldheader.reserved = header.reserved;
- oldheader.crc = ceph_crc32c_le(0, (unsigned char*)&oldheader,
- sizeof(oldheader) - sizeof(oldheader.crc));
+ oldheader.crc = ceph_crc32c(0, (unsigned char*)&oldheader,
+ sizeof(oldheader) - sizeof(oldheader.crc));
msgvec[msg.msg_iovlen].iov_base = (char*)&oldheader;
msgvec[msg.msg_iovlen].iov_len = sizeof(oldheader);
msglen += sizeof(oldheader);
diff --git a/src/os/BtrfsFileStoreBackend.cc b/src/os/BtrfsFileStoreBackend.cc
index 580a4b5bebf..ac7d1014ac7 100644
--- a/src/os/BtrfsFileStoreBackend.cc
+++ b/src/os/BtrfsFileStoreBackend.cc
@@ -51,7 +51,8 @@
#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
- GenericFileStoreBackend(fs), has_clone_range(false), has_snap_create(false),
+ GenericFileStoreBackend(fs), has_clone_range(false),
+ has_snap_create(false), has_snap_destroy(false),
has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
m_filestore_btrfs_clone_range(g_conf->filestore_btrfs_clone_range),
m_filestore_btrfs_snap (g_conf->filestore_btrfs_snap) { }
@@ -298,8 +299,10 @@ int BtrfsFileStoreBackend::create_current()
int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
{
+ int ret, err = 0;
+
struct stat basest;
- int ret = ::fstat(get_basedir_fd(), &basest);
+ ret = ::fstat(get_basedir_fd(), &basest);
if (ret < 0) {
ret = -errno;
dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
@@ -317,8 +320,9 @@ int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
list<string> snaps;
char path[PATH_MAX];
- struct dirent buf, *de;
- while (::readdir_r(dir, &buf, &de) == 0) {
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ struct dirent *de;
+ while (::readdir_r(dir, (struct dirent *)&buf, &de) == 0) {
if (!de)
break;
@@ -327,10 +331,10 @@ int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
struct stat st;
ret = ::stat(path, &st);
if (ret < 0) {
- ret = -errno;
+ err = -errno;
dout(0) << "list_checkpoints: stat '" << path << "' failed: "
- << cpp_strerror(ret) << dendl;
- return ret;
+ << cpp_strerror(err) << dendl;
+ break;
}
if (!S_ISDIR(st.st_mode))
@@ -339,10 +343,10 @@ int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
struct statfs fs;
ret = ::statfs(path, &fs);
if (ret < 0) {
- ret = -errno;
+ err = -errno;
dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
- << cpp_strerror(ret) << dendl;
- return ret;
+ << cpp_strerror(err) << dendl;
+ break;
}
if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
@@ -352,9 +356,13 @@ int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
if (::closedir(dir) < 0) {
ret = -errno;
dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
- return ret;
+ if (!err)
+ err = ret;
}
+ if (err)
+ return err;
+
ls.swap(snaps);
return 0;
}
@@ -367,7 +375,7 @@ int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *trans
memset(&async_args, 0, sizeof(async_args));
async_args.fd = get_current_fd();
async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
- strcpy(async_args.name, name.c_str());
+ strncpy(async_args.name, name.c_str(), sizeof(async_args.name));
int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
if (r < 0) {
@@ -455,7 +463,7 @@ int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
btrfs_ioctl_vol_args vol_args;
memset(&vol_args, 0, sizeof(vol_args));
vol_args.fd = 0;
- strcpy(vol_args.name, name.c_str());
+ strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name));
int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
if (ret) {
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 80561056daa..d4d540df876 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -2600,7 +2600,8 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
r = bl.length();
// flush?
- if (!replaying)
+ if (!replaying &&
+ g_conf->filestore_wbthrottle_enable)
wbthrottle.queue_wb(fd, oid, offset, len, replica);
lfn_close(fd);
@@ -3787,8 +3788,9 @@ int FileStore::list_collections(vector<coll_t>& ls)
return r;
}
- struct dirent sde, *de;
- while ((r = ::readdir_r(dir, &sde, &de)) == 0) {
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ struct dirent *de;
+ while ((r = ::readdir_r(dir, (struct dirent *)&buf, &de)) == 0) {
if (!de)
break;
if (de->d_type == DT_UNKNOWN) {
diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc
index f4a5ce3ab7d..db46750e411 100644
--- a/src/os/FlatIndex.cc
+++ b/src/os/FlatIndex.cc
@@ -387,7 +387,8 @@ int FlatIndex::collection_list_partial(const hobject_t &start,
}
int FlatIndex::collection_list(vector<hobject_t> *ls) {
- char dir_name[PATH_MAX], buf[PATH_MAX], new_name[PATH_MAX];
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
+ char dir_name[PATH_MAX], new_name[PATH_MAX];
strncpy(dir_name, base_path.c_str(), sizeof(dir_name));
dir_name[sizeof(dir_name)-1]='\0';
@@ -399,7 +400,7 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
vector< pair<ino_t,hobject_t> > inolist;
struct dirent *de;
- while (::readdir_r(dir, (struct dirent*)buf, &de) == 0) {
+ while (::readdir_r(dir, (struct dirent *)buf, &de) == 0) {
if (!de)
break;
// parse
diff --git a/src/os/KeyValueDB.h b/src/os/KeyValueDB.h
index f62bca996a5..e98463aa763 100644
--- a/src/os/KeyValueDB.h
+++ b/src/os/KeyValueDB.h
@@ -165,6 +165,8 @@ public:
);
}
+ virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) = 0;
+
virtual ~KeyValueDB() {}
protected:
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 09d0f02267f..029e8ad8197 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -378,7 +378,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
long *handle, map<string, hobject_t> *out) {
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
- char buf[PATH_MAX];
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
int r;
if (!dir) {
return -errno;
@@ -438,7 +438,7 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
set<string> *out) {
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
- char buf[PATH_MAX];
+ char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
if (!dir)
return -errno;
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
index f3809cf3496..356ee59aa27 100644
--- a/src/os/LevelDBStore.h
+++ b/src/os/LevelDBStore.h
@@ -20,6 +20,12 @@
#include "leveldb/filter_policy.h"
#endif
+#include <errno.h>
+#include "common/errno.h"
+#include "common/dout.h"
+#include "include/assert.h"
+#include "common/Formatter.h"
+
#include "common/ceph_context.h"
class PerfCounters;
@@ -300,6 +306,64 @@ public:
return limit;
}
+ virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
+ DIR *store_dir = opendir(path.c_str());
+ if (!store_dir) {
+ lderr(cct) << __func__ << " something happened opening the store: "
+ << cpp_strerror(errno) << dendl;
+ return 0;
+ }
+
+ uint64_t total_size = 0;
+ uint64_t sst_size = 0;
+ uint64_t log_size = 0;
+ uint64_t misc_size = 0;
+
+ struct dirent *entry = NULL;
+ while ((entry = readdir(store_dir)) != NULL) {
+ string n(entry->d_name);
+
+ if (n == "." || n == "..")
+ continue;
+
+ string fpath = path + '/' + n;
+ struct stat s;
+ int err = stat(fpath.c_str(), &s);
+ if (err < 0) {
+ lderr(cct) << __func__ << " error obtaining stats for " << fpath
+ << ": " << cpp_strerror(errno) << dendl;
+ goto err;
+ }
+
+ size_t pos = n.find_last_of('.');
+ if (pos == string::npos) {
+ misc_size += s.st_size;
+ continue;
+ }
+
+ string ext = n.substr(pos+1);
+ if (ext == "sst") {
+ sst_size += s.st_size;
+ } else if (ext == "log") {
+ log_size += s.st_size;
+ } else {
+ misc_size += s.st_size;
+ }
+ }
+
+ total_size = sst_size + log_size + misc_size;
+
+ extra["sst"] = sst_size;
+ extra["log"] = log_size;
+ extra["misc"] = misc_size;
+ extra["total"] = total_size;
+
+err:
+ closedir(store_dir);
+ return total_size;
+ }
+
+
protected:
WholeSpaceIterator _get_iterator() {
return std::tr1::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 23e24765cc2..8479b3c878d 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -145,7 +145,7 @@ void *WBThrottle::entry()
while (get_next_should_flush(&wb)) {
clearing = wb.get<0>();
lock.Unlock();
- ::fsync(**wb.get<1>());
+ ::fdatasync(**wb.get<1>());
if (wb.get<2>().nocache)
posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
lock.Lock();
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 52c35bd247c..66022a3898a 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1231,6 +1231,44 @@ int OSD::init()
// tick
tick_timer.add_event_after(g_conf->osd_heartbeat_interval, new C_Tick(this));
+ service.init();
+ service.publish_map(osdmap);
+ service.publish_superblock(superblock);
+
+ osd_lock.Unlock();
+
+ r = monc->authenticate();
+ if (r < 0) {
+ monc->shutdown();
+ store->umount();
+ osd_lock.Lock(); // locker is going to unlock this on function exit
+ if (is_stopping())
+ return 0;
+ return r;
+ }
+
+ while (monc->wait_auth_rotating(30.0) < 0) {
+ derr << "unable to obtain rotating service keys; retrying" << dendl;
+ }
+
+ osd_lock.Lock();
+ if (is_stopping())
+ return 0;
+
+ dout(10) << "ensuring pgs have consumed prior maps" << dendl;
+ consume_map();
+ peering_wq.drain();
+
+ dout(10) << "done with init, starting boot process" << dendl;
+ state = STATE_BOOTING;
+ start_boot();
+
+ return 0;
+}
+
+void OSD::final_init()
+{
+ int r;
AdminSocket *admin_socket = cct->get_admin_socket();
asok_hook = new OSDSocketHook(this);
r = admin_socket->register_command("dump_ops_in_flight",
@@ -1323,40 +1361,6 @@ int OSD::init()
test_ops_hook,
"inject metadata error");
assert(r == 0);
-
- service.init();
- service.publish_map(osdmap);
- service.publish_superblock(superblock);
-
- osd_lock.Unlock();
-
- r = monc->authenticate();
- if (r < 0) {
- monc->shutdown();
- store->umount();
- osd_lock.Lock(); // locker is going to unlock this on function exit
- if (is_stopping())
- return 0;
- return r;
- }
-
- while (monc->wait_auth_rotating(30.0) < 0) {
- derr << "unable to obtain rotating service keys; retrying" << dendl;
- }
-
- osd_lock.Lock();
- if (is_stopping())
- return 0;
-
- dout(10) << "ensuring pgs have consumed prior maps" << dendl;
- consume_map();
- peering_wq.drain();
-
- dout(10) << "done with init, starting boot process" << dendl;
- state = STATE_BOOTING;
- start_boot();
-
- return 0;
}
void OSD::create_logger()
@@ -1522,7 +1526,6 @@ int OSD::shutdown()
dout(20) << " kicking pg " << p->first << dendl;
p->second->lock();
p->second->on_shutdown();
- p->second->kick();
p->second->unlock();
p->second->osr->flush();
}
@@ -3513,7 +3516,7 @@ bool OSD::_is_healthy()
++up;
++num;
}
- if (up < num / 3) {
+ if ((float)up < (float)num * g_conf->osd_heartbeat_min_healthy_ratio) {
dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than 1/3)" << dendl;
return false;
}
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index d259ceae545..e23c19b8f93 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1719,6 +1719,7 @@ public:
// startup/shutdown
int pre_init();
int init();
+ void final_init();
void suicide(int exitcode);
int shutdown();
diff --git a/src/osd/OSDCap.cc b/src/osd/OSDCap.cc
index ee77f0ea43d..e315835f4ba 100644
--- a/src/osd/OSDCap.cc
+++ b/src/osd/OSDCap.cc
@@ -178,7 +178,7 @@ struct OSDCapParser : qi::grammar<Iterator, OSDCap()>
equoted_string %=
lexeme['"' >> *(char_ - '"') >> '"'] |
lexeme['\'' >> *(char_ - '\'') >> '\''];
- unquoted_word %= +char_("a-zA-Z0-9_-");
+ unquoted_word %= +char_("a-zA-Z0-9_.-");
str %= quoted_string | unquoted_word;
estr %= equoted_string | unquoted_word;
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index d8ada534c30..ef64fe37919 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4530,9 +4530,6 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
{
const OSDMapRef osdmap = get_osdmap();
- // -- there was a change! --
- kick();
-
set_last_peering_reset();
vector<int> oldacting, oldup;
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 14ac7c9fac5..720ce67bca3 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -232,7 +232,6 @@ protected:
* put_unlock() when done with the current pointer (_most common_).
*/
Mutex _lock;
- Cond _cond;
atomic_t ref;
#ifdef PG_DEBUG_REFS
@@ -261,14 +260,6 @@ public:
bool is_locked() const {
return _lock.is_locked();
}
- void wait() {
- assert(_lock.is_locked());
- _cond.Wait(_lock);
- }
- void kick() {
- assert(_lock.is_locked());
- _cond.Signal();
- }
#ifdef PG_DEBUG_REFS
uint64_t get_with_id();
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index cfa1dce1942..a04ab485e7e 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -4,6 +4,9 @@
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -329,7 +332,7 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
if (!all_unfound_are_queried_or_lost(get_osdmap())) {
ss << "pg has " << unfound
- << " objects but we haven't probed all sources, not marking lost";
+ << " unfound objects but we haven't probed all sources, not marking lost";
return -EINVAL;
}
@@ -613,7 +616,9 @@ void ReplicatedPG::calc_trim_to()
ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool, pg_t p, const hobject_t& oid,
const hobject_t& ioid) :
- PG(o, curmap, _pool, p, oid, ioid), temp_created(false),
+ PG(o, curmap, _pool, p, oid, ioid),
+ snapset_contexts_lock("ReplicatedPG::snapset_contexts"),
+ temp_created(false),
temp_coll(coll_t::make_temp_coll(p)), snap_trimmer_machine(this)
{
snap_trimmer_machine.initiate();
@@ -693,7 +698,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
return;
}
- ObjectContext *obc;
+ ObjectContextRef obc;
bool can_create = op->may_write();
snapid_t snapid;
int r = find_object_context(
@@ -747,7 +752,6 @@ void ReplicatedPG::do_op(OpRequestRef op)
if (!op->may_write() && !obc->obs.exists) {
osd->reply_op_error(op, -ENOENT);
- put_object_context(obc);
return;
}
@@ -756,7 +760,6 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(10) << "do_op writes for " << obc->obs.oi.soid << " blocked by "
<< obc->blocked_by->obs.oi.soid << dendl;
wait_for_degraded_object(obc->blocked_by->obs.oi.soid, op);
- put_object_context(obc);
return;
}
@@ -773,7 +776,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// src_oids
- map<hobject_t,ObjectContext*> src_obc;
+ map<hobject_t,ObjectContextRef> src_obc;
for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
OSDOp& osd_op = *p;
@@ -781,7 +784,6 @@ void ReplicatedPG::do_op(OpRequestRef op)
if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
m->get_snapid() != CEPH_SNAPDIR) {
dout(10) << "LIST_SNAPS with incorrect context" << dendl;
- put_object_context(obc);
osd->reply_op_error(op, -EINVAL);
return;
}
@@ -794,7 +796,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
hobject_t src_oid(osd_op.soid, src_oloc.key, m->get_pg().ps(),
info.pgid.pool(), m->get_object_locator().nspace);
if (!src_obc.count(src_oid)) {
- ObjectContext *sobc;
+ ObjectContextRef sobc;
snapid_t ssnapid;
int r = find_object_context(src_oid, &sobc, false, &ssnapid);
@@ -816,10 +818,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
(before_backfill && sobc->obs.oi.soid > backfill_target_info->last_backfill)) {
wait_for_degraded_object(sobc->obs.oi.soid, op);
dout(10) << " writes for " << obc->obs.oi.soid << " now blocked by "
- << sobc->obs.oi.soid << dendl;
- obc->get();
+ << sobc->obs.oi.soid << dendl;
obc->blocked_by = sobc;
- sobc->get();
sobc->blocking.insert(obc);
} else {
dout(10) << " src_oid " << src_oid << " obc " << src_obc << dendl;
@@ -836,8 +836,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(10) << "no src oid specified for multi op " << osd_op << dendl;
osd->reply_op_error(op, -EINVAL);
}
- put_object_contexts(src_obc);
- put_object_context(obc);
+ src_obc.clear();
return;
}
@@ -852,7 +851,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
hobject_t clone_oid = obc->obs.oi.soid;
clone_oid.snap = *p;
if (!src_obc.count(clone_oid)) {
- ObjectContext *sobc;
+ ObjectContextRef sobc;
snapid_t ssnapid;
int r = find_object_context(clone_oid, &sobc, false, &ssnapid);
@@ -868,8 +867,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
src_obc[clone_oid] = sobc;
continue;
}
- put_object_contexts(src_obc);
- put_object_context(obc);
+ src_obc.clear();
return;
} else {
continue;
@@ -902,8 +900,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
<< " < snapset seq " << obc->ssc->snapset.seq
<< " on " << soid << dendl;
delete ctx;
- put_object_context(obc);
- put_object_contexts(src_obc);
+ src_obc.clear();
osd->reply_op_error(op, -EOLDSNAPC);
return;
}
@@ -913,8 +910,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
const eversion_t& oldv = entry->version;
dout(3) << "do_op dup " << ctx->reqid << " was " << oldv << dendl;
delete ctx;
- put_object_context(obc);
- put_object_contexts(src_obc);
+ src_obc.clear();
if (already_complete(oldv)) {
osd->reply_op_error(op, 0, oldv, entry->user_version);
} else {
@@ -973,7 +969,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(10) << " taking ondisk_read_lock" << dendl;
obc->ondisk_read_lock();
}
- for (map<hobject_t,ObjectContext*>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
+ for (map<hobject_t,ObjectContextRef>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
dout(10) << " taking ondisk_read_lock for src " << p->first << dendl;
p->second->ondisk_read_lock();
}
@@ -984,7 +980,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(10) << " dropping ondisk_read_lock" << dendl;
obc->ondisk_read_unlock();
}
- for (map<hobject_t,ObjectContext*>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
+ for (map<hobject_t,ObjectContextRef>::iterator p = src_obc.begin(); p != src_obc.end(); ++p) {
dout(10) << " dropping ondisk_read_lock for src " << p->first << dendl;
p->second->ondisk_read_unlock();
}
@@ -992,8 +988,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
if (result == -EAGAIN) {
// clean up after the ctx
delete ctx;
- put_object_context(obc);
- put_object_contexts(src_obc);
+ src_obc.clear();
return;
}
@@ -1001,8 +996,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
if (ctx->delta_stats.num_bytes > 0 &&
pool.info.get_flags() & pg_pool_t::FLAG_FULL) {
delete ctx;
- put_object_context(obc);
- put_object_contexts(src_obc);
+ src_obc.clear();
osd->reply_op_error(op, -ENOSPC);
return;
}
@@ -1048,8 +1042,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
osd->send_message_osd_client(reply, m->get_connection());
delete ctx;
- put_object_context(obc);
- put_object_contexts(src_obc);
+ src_obc.clear();
return;
}
@@ -1060,9 +1053,6 @@ void ReplicatedPG::do_op(OpRequestRef op)
append_log(ctx->log, pg_trim_to, ctx->local_t);
- // continuing on to write path, make sure object context is registered
- assert(obc->registered);
-
// verify that we are doing this in order?
if (g_conf->osd_debug_op_order && m->get_source().is_client()) {
map<client_t,tid_t>& cm = debug_op_order[obc->obs.oi.soid];
@@ -1470,14 +1460,13 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
{
// load clone info
bufferlist bl;
- ObjectContext *obc = 0;
+ ObjectContextRef obc;
int r = find_object_context(coid, &obc, false, NULL);
if (r == -ENOENT || coid.snap != obc->obs.oi.soid.snap) {
derr << __func__ << "could not find coid " << coid << dendl;
assert(0);
}
assert(r == 0);
- assert(obc->registered);
object_info_t &coi = obc->obs.oi;
set<snapid_t> old_snaps(coi.snaps.begin(), coi.snaps.end());
@@ -1621,7 +1610,6 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.hash,
info.pgid.pool(), coid.get_namespace());
ctx->snapset_obc = get_object_context(snapoid, false);
- assert(ctx->snapset_obc->registered);
if (snapset.clones.empty() && !snapset.head_exists) {
dout(10) << coid << " removing " << snapoid << dendl;
@@ -2074,7 +2062,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ctx->user_modify = true;
}
- ObjectContext *src_obc = 0;
+ ObjectContextRef src_obc;
if (ceph_osd_op_type_multi(op.op)) {
MOSDOp *m = static_cast<MOSDOp *>(ctx->op->request);
object_locator_t src_oloc;
@@ -2492,7 +2480,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
hobject_t clone_oid = soid;
clone_oid.snap = *clone_iter;
- ObjectContext *clone_obc = ctx->src_obc[clone_oid];
+ ObjectContextRef clone_obc = ctx->src_obc[clone_oid];
assert(clone_obc);
for (vector<snapid_t>::reverse_iterator p = clone_obc->obs.oi.snaps.rbegin();
p != clone_obc->obs.oi.snaps.rend();
@@ -2614,6 +2602,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_WRITE:
++ctx->num_write;
{ // write
+ if (op.extent.length != osd_op.indata.length()) {
+ result = -EINVAL;
+ break;
+ }
__u32 seq = oi.truncate_seq;
if (seq && (seq > op.extent.truncate_seq) &&
(op.extent.offset + op.extent.length > oi.size)) {
@@ -2645,9 +2637,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = check_offset_and_length(op.extent.offset, op.extent.length);
if (result < 0)
break;
- bufferlist nbl;
- bp.copy(op.extent.length, nbl);
- t.write(coll, soid, op.extent.offset, op.extent.length, nbl);
+ t.write(coll, soid, op.extent.offset, op.extent.length, osd_op.indata);
write_update_size_and_usage(ctx->delta_stats, oi, ssc->snapset, ctx->modified_ranges,
op.extent.offset, op.extent.length, true);
if (!obs.exists) {
@@ -2660,18 +2650,20 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_WRITEFULL:
++ctx->num_write;
{ // write full object
+ if (op.extent.length != osd_op.indata.length()) {
+ result = -EINVAL;
+ break;
+ }
result = check_offset_and_length(op.extent.offset, op.extent.length);
if (result < 0)
break;
- bufferlist nbl;
- bp.copy(op.extent.length, nbl);
if (obs.exists) {
t.truncate(coll, soid, 0);
} else {
ctx->delta_stats.num_objects++;
obs.exists = true;
}
- t.write(coll, soid, op.extent.offset, op.extent.length, nbl);
+ t.write(coll, soid, op.extent.offset, op.extent.length, osd_op.indata);
interval_set<uint64_t> ch;
if (oi.size > 0)
ch.insert(0, oi.size);
@@ -2833,16 +2825,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
uint64_t cookie = op.watch.cookie;
bool do_watch = op.watch.flag & 1;
entity_name_t entity = ctx->reqid.name;
- ObjectContext *obc = ctx->obc;
+ ObjectContextRef obc = ctx->obc;
- dout(10) << "watch: ctx->obc=" << (void *)obc << " cookie=" << cookie
+ dout(10) << "watch: ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
<< " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
dout(10) << "watch: peer_addr="
<< ctx->op->request->get_connection()->get_peer_addr() << dendl;
- // FIXME: where does the timeout come from?
- watch_info_t w(cookie, 30,
+ watch_info_t w(cookie, g_conf->osd_client_watch_timeout,
ctx->op->request->get_connection()->get_peer_addr());
if (do_watch) {
if (oi.watchers.count(make_pair(cookie, entity))) {
@@ -2853,7 +2844,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
t.nop(); // make sure update the object_info on disk!
}
ctx->watch_connects.push_back(w);
- assert(obc->registered);
} else {
map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
oi.watchers.find(make_pair(cookie, entity));
@@ -3436,7 +3426,7 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
- ObjectContext *rollback_to;
+ ObjectContextRef rollback_to;
int ret = find_object_context(
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
@@ -3519,7 +3509,6 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
obs.oi.size = rollback_to->obs.oi.size;
snapset.head_exists = true;
}
- put_object_context(rollback_to);
}
return ret;
}
@@ -3575,9 +3564,10 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
object_info_t static_snap_oi(coid);
object_info_t *snap_oi;
if (is_primary()) {
- ctx->clone_obc = new ObjectContext(static_snap_oi, true, NULL);
- ctx->clone_obc->get();
- register_object_context(ctx->clone_obc);
+ ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
+ ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
+ ctx->clone_obc->obs.oi = static_snap_oi;
+ ctx->clone_obc->obs.exists = true;
snap_oi = &ctx->clone_obc->obs.oi;
} else {
snap_oi = &static_snap_oi;
@@ -3827,7 +3817,6 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
ctx->at_version.version++;
ctx->snapset_obc->obs.exists = false;
- assert(ctx->snapset_obc->registered);
}
}
} else if (ctx->new_snapset.clones.size()) {
@@ -3844,7 +3833,6 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
ctx->snapset_obc->obs.oi.version = ctx->at_version;
ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
- assert(ctx->snapset_obc->registered);
bufferlist bv(sizeof(ctx->new_obs.oi));
::encode(ctx->snapset_obc->obs.oi, bv);
@@ -3991,17 +3979,14 @@ void ReplicatedPG::op_applied(RepGather *repop)
int whoami = osd->get_nodeid();
if (repop->ctx->clone_obc) {
- put_object_context(repop->ctx->clone_obc);
- repop->ctx->clone_obc = 0;
+ repop->ctx->clone_obc = ObjectContextRef();
}
if (repop->ctx->snapset_obc) {
- put_object_context(repop->ctx->snapset_obc);
- repop->ctx->snapset_obc = 0;
+ repop->ctx->snapset_obc = ObjectContextRef();
}
- put_object_context(repop->obc);
- put_object_contexts(repop->src_obc);
- repop->obc = 0;
+ repop->src_obc.clear();
+ repop->obc = ObjectContextRef();
if (!repop->aborted) {
assert(repop->waitfor_ack.count(whoami) ||
@@ -4285,7 +4270,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now,
}
}
-ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContext *obc,
+ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRef obc,
tid_t rep_tid)
{
if (ctx->op)
@@ -4370,16 +4355,14 @@ void ReplicatedPG::repop_ack(RepGather *repop, int result, int ack_type,
void ReplicatedPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
{
- for (map<hobject_t, ObjectContext*>::iterator i = object_contexts.begin();
- i != object_contexts.end();
- ++i) {
- i->second->get();
- get_obc_watchers(i->second, pg_watchers);
- put_object_context(i->second);
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ ObjectContextRef obc(i.second);
+ get_obc_watchers(obc, pg_watchers);
}
}
-void ReplicatedPG::get_obc_watchers(ObjectContext *obc, list<obj_watch_item_t> &pg_watchers)
+void ReplicatedPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
{
for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
obc->watchers.begin();
@@ -4403,16 +4386,12 @@ void ReplicatedPG::get_obc_watchers(ObjectContext *obc, list<obj_watch_item_t> &
void ReplicatedPG::check_blacklisted_watchers()
{
dout(20) << "ReplicatedPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
- for (map<hobject_t, ObjectContext*>::iterator i = object_contexts.begin();
- i != object_contexts.end();
- ++i) {
- i->second->get();
- check_blacklisted_obc_watchers(i->second);
- put_object_context(i->second);
- }
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i))
+ check_blacklisted_obc_watchers(i.second);
}
-void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContext *obc)
+void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
{
dout(20) << "ReplicatedPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
@@ -4432,7 +4411,7 @@ void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContext *obc)
}
}
-void ReplicatedPG::populate_obc_watchers(ObjectContext *obc)
+void ReplicatedPG::populate_obc_watchers(ObjectContextRef obc)
{
assert(is_active());
assert(!is_missing_object(obc->obs.oi.soid) ||
@@ -4468,7 +4447,7 @@ void ReplicatedPG::populate_obc_watchers(ObjectContext *obc)
void ReplicatedPG::handle_watch_timeout(WatchRef watch)
{
- ObjectContext *obc = watch->get_obc(); // handle_watch_timeout owns this ref
+ ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
dout(10) << "handle_watch_timeout obc " << obc << dendl;
if (is_degraded_object(obc->obs.oi.soid)) {
@@ -4478,7 +4457,6 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
dout(10) << "handle_watch_timeout waiting for degraded on obj "
<< obc->obs.oi.soid
<< dendl;
- put_object_context(obc); // callback got its own ref
return;
}
@@ -4489,7 +4467,6 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
scrubber.add_callback(
watch->get_delayed_cb() // This callback!
);
- put_object_context(obc);
return;
}
@@ -4538,41 +4515,35 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
eval_repop(repop);
}
-ObjectContext *ReplicatedPG::_lookup_object_context(const hobject_t& oid)
-{
- map<hobject_t, ObjectContext*>::iterator p = object_contexts.find(oid);
- if (p != object_contexts.end())
- return p->second;
- return NULL;
-}
-
-ObjectContext *ReplicatedPG::create_object_context(const object_info_t& oi,
- SnapSetContext *ssc)
+ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
+ SnapSetContext *ssc)
{
- ObjectContext *obc = new ObjectContext(oi, false, ssc);
- dout(10) << "create_object_context " << obc << " " << oi.soid << " " << obc->ref << dendl;
- register_object_context(obc);
+ ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
+ assert(obc->destructor_callback == NULL);
+ obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
+ obc->obs.oi = oi;
+ obc->obs.exists = false;
+ obc->ssc = ssc;
+ if (ssc)
+ register_snapset_context(ssc);
+ dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
populate_obc_watchers(obc);
- obc->ref++;
return obc;
}
-ObjectContext *ReplicatedPG::get_object_context(const hobject_t& soid,
- bool can_create)
+ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
+ bool can_create)
{
- map<hobject_t, ObjectContext*>::iterator p = object_contexts.find(soid);
- ObjectContext *obc;
- if (p != object_contexts.end()) {
- obc = p->second;
- dout(10) << "get_object_context " << obc << " " << soid << " " << obc->ref
- << " -> " << (obc->ref+1) << dendl;
+ ObjectContextRef obc = object_contexts.lookup(soid);
+ if (obc) {
+ dout(10) << "get_object_context " << obc << " " << soid << dendl;
} else {
// check disk
bufferlist bv;
int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
if (r < 0) {
if (!can_create)
- return NULL; // -ENOENT!
+ return ObjectContextRef(); // -ENOENT!
// new object.
object_info_t oi(soid);
@@ -4584,49 +4555,41 @@ ObjectContext *ReplicatedPG::get_object_context(const hobject_t& soid,
assert(oi.soid.pool == (int64_t)info.pgid.pool());
- SnapSetContext *ssc = NULL;
- if (can_create)
- ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- obc = new ObjectContext(oi, true, ssc);
+ obc = object_contexts.lookup_or_create(oi.soid);
+ obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
+ obc->obs.oi = oi;
obc->obs.exists = true;
- register_object_context(obc);
-
- if (can_create && !obc->ssc)
+ if (can_create) {
obc->ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
+ register_snapset_context(obc->ssc);
+ }
populate_obc_watchers(obc);
dout(10) << "get_object_context " << obc << " " << soid << " 0 -> 1 read " << obc->obs.oi << dendl;
}
- obc->ref++;
return obc;
}
void ReplicatedPG::context_registry_on_change()
{
- list<ObjectContext *> contexts;
- for (map<hobject_t, ObjectContext*>::iterator i = object_contexts.begin();
- i != object_contexts.end();
- ++i) {
- i->second->get();
- contexts.push_back(i->second);
- for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
- i->second->watchers.begin();
- j != i->second->watchers.end();
- i->second->watchers.erase(j++)) {
- j->second->discard();
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ ObjectContextRef obc(i.second);
+ if (obc) {
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
+ obc->watchers.begin();
+ j != obc->watchers.end();
+ obc->watchers.erase(j++)) {
+ j->second->discard();
+ }
}
}
- for (list<ObjectContext *>::iterator i = contexts.begin();
- i != contexts.end();
- contexts.erase(i++)) {
- put_object_context(*i);
- }
}
int ReplicatedPG::find_object_context(const hobject_t& oid,
- ObjectContext **pobc,
+ ObjectContextRef *pobc,
bool can_create,
snapid_t *psnapid)
{
@@ -4638,11 +4601,10 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
// want the snapdir?
if (oid.snap == CEPH_SNAPDIR) {
// return head or snapdir, whichever exists.
- ObjectContext *obc = get_object_context(head, can_create);
+ ObjectContextRef obc = get_object_context(head, can_create);
if (obc && !obc->obs.exists) {
// ignore it if the obc exists but the object doesn't
- put_object_context(obc);
- obc = NULL;
+ obc = ObjectContextRef();
}
if (!obc) {
obc = get_object_context(snapdir, can_create);
@@ -4660,7 +4622,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
// want the head?
if (oid.snap == CEPH_NOSNAP) {
- ObjectContext *obc = get_object_context(head, can_create);
+ ObjectContextRef obc = get_object_context(head, can_create);
if (!obc)
return -ENOENT;
dout(10) << "find_object_context " << oid << " @" << oid.snap << dendl;
@@ -4683,7 +4645,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
// head?
if (oid.snap > ssc->snapset.seq) {
if (ssc->snapset.head_exists) {
- ObjectContext *obc = get_object_context(head, false);
+ ObjectContextRef obc = get_object_context(head, false);
dout(10) << "find_object_context " << head
<< " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
<< " -- HIT " << obc->obs
@@ -4728,7 +4690,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
return -EAGAIN;
}
- ObjectContext *obc = get_object_context(soid, false);
+ ObjectContextRef obc = get_object_context(soid, false);
assert(obc);
// clone
@@ -4743,41 +4705,20 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
} else {
dout(20) << "find_object_context " << soid << " [" << first << "," << last
<< "] does not contain " << oid.snap << " -- DNE" << dendl;
- put_object_context(obc);
return -ENOENT;
}
}
-void ReplicatedPG::put_object_context(ObjectContext *obc)
+void ReplicatedPG::object_context_destructor_callback(ObjectContext *obc)
{
- dout(10) << "put_object_context " << obc << " " << obc->obs.oi.soid << " "
- << obc->ref << " -> " << (obc->ref-1) << dendl;
-
- --obc->ref;
- if (obc->ref == 0) {
- if (obc->ssc)
- put_snapset_context(obc->ssc);
-
- if (obc->registered)
- object_contexts.erase(obc->obs.oi.soid);
- delete obc;
-
- if (object_contexts.empty())
- kick();
- }
-}
+ dout(10) << "object_context_destructor_callback " << obc << " "
+ << obc->obs.oi.soid << dendl;
-void ReplicatedPG::put_object_contexts(map<hobject_t,ObjectContext*>& obcv)
-{
- if (obcv.empty())
- return;
- dout(10) << "put_object_contexts " << obcv << dendl;
- for (map<hobject_t,ObjectContext*>::iterator p = obcv.begin(); p != obcv.end(); ++p)
- put_object_context(p->second);
- obcv.clear();
+ if (obc->ssc)
+ put_snapset_context(obc->ssc);
}
-void ReplicatedPG::add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t *pgstat)
+void ReplicatedPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
{
object_info_t& oi = obc->obs.oi;
@@ -4819,9 +4760,10 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t *
SnapSetContext *ReplicatedPG::create_snapset_context(const object_t& oid)
{
+ Mutex::Locker l(snapset_contexts_lock);
SnapSetContext *ssc = new SnapSetContext(oid);
dout(10) << "create_snapset_context " << ssc << " " << ssc->oid << dendl;
- register_snapset_context(ssc);
+ _register_snapset_context(ssc);
ssc->ref++;
return ssc;
}
@@ -4832,6 +4774,7 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
bool can_create,
const string& nspace)
{
+ Mutex::Locker l(snapset_contexts_lock);
SnapSetContext *ssc;
map<object_t, SnapSetContext*>::iterator p = snapset_contexts.find(oid);
if (p != snapset_contexts.end()) {
@@ -4850,7 +4793,7 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
return NULL;
}
ssc = new SnapSetContext(oid);
- register_snapset_context(ssc);
+ _register_snapset_context(ssc);
if (r >= 0) {
bufferlist::iterator bvp = bv.begin();
ssc->snapset.decode(bvp);
@@ -4865,9 +4808,9 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
void ReplicatedPG::put_snapset_context(SnapSetContext *ssc)
{
+ Mutex::Locker l(snapset_contexts_lock);
dout(10) << "put_snapset_context " << ssc->oid << " "
<< ssc->ref << " -> " << (ssc->ref-1) << dendl;
-
--ssc->ref;
if (ssc->ref == 0) {
if (ssc->registered)
@@ -5090,7 +5033,7 @@ void ReplicatedPG::sub_op_modify_reply(OpRequestRef op)
// ===========================================================
-void ReplicatedPG::calc_head_subsets(ObjectContext *obc, SnapSet& snapset, const hobject_t& head,
+void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
pg_missing_t& missing,
const hobject_t &last_backfill,
interval_set<uint64_t>& data_subset,
@@ -5375,7 +5318,7 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
* clones/heads and dup data ranges where possible.
*/
void ReplicatedPG::prep_push_to_replica(
- ObjectContext *obc, const hobject_t& soid, int peer,
+ ObjectContextRef obc, const hobject_t& soid, int peer,
int prio,
PushOp *pop)
{
@@ -5429,7 +5372,7 @@ void ReplicatedPG::prep_push_to_replica(
}
void ReplicatedPG::prep_push(int prio,
- ObjectContext *obc,
+ ObjectContextRef obc,
const hobject_t& soid, int peer,
PushOp *pop)
{
@@ -5445,7 +5388,7 @@ void ReplicatedPG::prep_push(int prio,
void ReplicatedPG::prep_push(
int prio,
- ObjectContext *obc,
+ ObjectContextRef obc,
const hobject_t& soid, int peer,
eversion_t version,
interval_set<uint64_t> &data_subset,
@@ -5736,7 +5679,7 @@ bool ReplicatedPG::handle_pull_response(
hoid.get_namespace());
assert(ssc);
}
- ObjectContext *obc = create_object_context(pi.recovery_info.oi, ssc);
+ ObjectContextRef obc = create_object_context(pi.recovery_info.oi, ssc);
obc->obs.exists = true;
obc->ondisk_write_lock();
@@ -6134,18 +6077,14 @@ bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
{
dout(10) << "finish_degraded_object " << oid << dendl;
- map<hobject_t, ObjectContext *>::iterator i = object_contexts.find(oid);
- if (i != object_contexts.end()) {
- i->second->get();
- for (set<ObjectContext*>::iterator j = i->second->blocking.begin();
- j != i->second->blocking.end();
- i->second->blocking.erase(j++)) {
+ ObjectContextRef obc(object_contexts.lookup(oid));
+ if (obc) {
+ for (set<ObjectContextRef>::iterator j = obc->blocking.begin();
+ j != obc->blocking.end();
+ obc->blocking.erase(j++)) {
dout(10) << " no longer blocking writes for " << (*j)->obs.oi.soid << dendl;
- (*j)->blocked_by = NULL;
- put_object_context(*j);
- put_object_context(i->second);
+ (*j)->blocked_by = ObjectContextRef();
}
- put_object_context(i->second);
}
if (callbacks_for_degraded_object.count(oid)) {
list<Context*> contexts;
@@ -6250,11 +6189,10 @@ void ReplicatedPG::_committed_pushed_object(
unlock();
}
-void ReplicatedPG::_applied_recovered_object(ObjectContext *obc)
+void ReplicatedPG::_applied_recovered_object(ObjectContextRef obc)
{
lock();
dout(10) << "_applied_recovered_object " << *obc << dendl;
- put_object_context(obc);
assert(active_pushes >= 1);
--active_pushes;
@@ -6460,7 +6398,7 @@ eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid)
/* Mark an object as lost
*/
-ObjectContext *ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
+ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
const hobject_t &oid, eversion_t version,
utime_t mtime, int what)
{
@@ -6477,7 +6415,7 @@ ObjectContext *ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
pg_log_entry_t e(what, oid, info.last_update, version, info.last_user_version, osd_reqid_t(), mtime);
pg_log.add(e);
- ObjectContext *obc = get_object_context(oid, true);
+ ObjectContextRef obc = get_object_context(oid, true);
obc->ondisk_write_lock();
@@ -6494,7 +6432,7 @@ ObjectContext *ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
struct C_PG_MarkUnfoundLost : public Context {
ReplicatedPGRef pg;
- list<ObjectContext*> obcs;
+ list<ObjectContextRef> obcs;
C_PG_MarkUnfoundLost(ReplicatedPG *p) : pg(p) {}
void finish(int r) {
pg->_finish_mark_all_unfound_lost(obcs);
@@ -6527,7 +6465,7 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
continue;
}
- ObjectContext *obc = NULL;
+ ObjectContextRef obc;
eversion_t prev;
switch (what) {
@@ -6586,13 +6524,16 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
pg_log.get_log().print(*_dout);
*_dout << dendl;
+ info.stats.stats_invalid = true;
+
if (missing.num_missing() == 0) {
// advance last_complete since nothing else is missing!
info.last_complete = info.last_update;
- dirty_info = true;
- write_if_dirty(*t);
}
+ dirty_info = true;
+ write_if_dirty(*t);
+
osd->store->queue_transaction(osr.get(), t, c, NULL, new C_OSD_OndiskWriteUnlockList(&c->obcs));
// Send out the PG log to all replicas
@@ -6603,7 +6544,7 @@ void ReplicatedPG::mark_all_unfound_lost(int what)
osd->queue_for_recovery(this);
}
-void ReplicatedPG::_finish_mark_all_unfound_lost(list<ObjectContext*>& obcs)
+void ReplicatedPG::_finish_mark_all_unfound_lost(list<ObjectContextRef>& obcs)
{
lock();
dout(10) << "_finish_mark_all_unfound_lost " << dendl;
@@ -6612,11 +6553,7 @@ void ReplicatedPG::_finish_mark_all_unfound_lost(list<ObjectContext*>& obcs)
requeue_ops(waiting_for_all_missing);
waiting_for_all_missing.clear();
- while (!obcs.empty()) {
- ObjectContext *obc = obcs.front();
- put_object_context(obc);
- obcs.pop_front();
- }
+ obcs.clear();
unlock();
}
@@ -7077,7 +7014,7 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
case pg_log_entry_t::LOST_REVERT:
{
if (item.have == latest->reverting_to) {
- ObjectContext *obc = get_object_context(soid, true);
+ ObjectContextRef obc = get_object_context(soid, true);
if (obc->obs.oi.version == latest->version) {
// I'm already reverting
@@ -7177,7 +7114,7 @@ int ReplicatedPG::prep_object_replica_pushes(
dout(10) << __func__ << ": on " << soid << dendl;
// NOTE: we know we will get a valid oloc off of disk here.
- ObjectContext *obc = get_object_context(soid, false);
+ ObjectContextRef obc = get_object_context(soid, false);
if (!obc) {
pg_log.missing_add(soid, v, eversion_t());
bool uhoh = true;
@@ -7221,7 +7158,6 @@ int ReplicatedPG::prep_object_replica_pushes(
dout(10) << " ondisk_read_unlock on " << soid << dendl;
obc->ondisk_read_unlock();
- put_object_context(obc);
return 1;
}
@@ -7424,11 +7360,10 @@ int ReplicatedPG::recover_backfill(
for (set<hobject_t>::iterator i = add_to_stat.begin();
i != add_to_stat.end();
++i) {
- ObjectContext *obc = get_object_context(*i, false);
+ ObjectContextRef obc = get_object_context(*i, false);
pg_stat_t stat;
add_object_context_to_pg_stat(obc, &stat);
pending_backfill_updates[*i] = stat;
- put_object_context(obc);
}
for (map<hobject_t, eversion_t>::iterator i = to_remove.begin();
i != to_remove.end();
@@ -7499,13 +7434,12 @@ void ReplicatedPG::prep_backfill_object_push(
if (!pushing.count(oid))
start_recovery_op(oid);
- ObjectContext *obc = get_object_context(oid, false);
+ ObjectContextRef obc = get_object_context(oid, false);
obc->ondisk_read_lock();
(*pushes)[peer].push_back(PushOp());
prep_push_to_replica(obc, oid, peer, g_conf->osd_recovery_op_priority,
&((*pushes)[peer].back()));
obc->ondisk_read_unlock();
- put_object_context(obc);
}
void ReplicatedPG::scan_range(
@@ -7527,9 +7461,9 @@ void ReplicatedPG::scan_range(
for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
handle.reset_tp_timeout();
- ObjectContext *obc = NULL;
+ ObjectContextRef obc;
if (is_primary())
- obc = _lookup_object_context(*p);
+ obc = object_contexts.lookup(*p);
if (obc) {
bi->objects[*p] = obc->obs.oi.version;
dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 36296c96ce1..bce141834ca 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -3,6 +3,9 @@
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -27,6 +30,9 @@
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDSubOp.h"
+
+#include "common/sharedptr_registry.hpp"
+
class MOSDSubOpReply;
class ReplicatedPG;
@@ -124,10 +130,10 @@ public:
vector<pg_log_entry_t> log;
interval_set<uint64_t> modified_ranges;
- ObjectContext *obc; // For ref counting purposes
- map<hobject_t,ObjectContext*> src_obc;
- ObjectContext *clone_obc; // if we created a clone
- ObjectContext *snapset_obc; // if we created/deleted a snapdir
+ ObjectContextRef obc;
+ map<hobject_t,ObjectContextRef> src_obc;
+ ObjectContextRef clone_obc; // if we created a clone
+ ObjectContextRef snapset_obc; // if we created/deleted a snapdir
int data_off; // FIXME: we may want to kill this msgr hint off at some point!
@@ -150,7 +156,7 @@ public:
modify(false), user_modify(false),
bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
- obc(0), clone_obc(0), snapset_obc(0), data_off(0), reply(NULL), pg(_pg),
+ data_off(0), reply(NULL), pg(_pg),
num_read(0),
num_write(0) {
if (_ssc) {
@@ -176,8 +182,8 @@ public:
eversion_t v;
OpContext *ctx;
- ObjectContext *obc;
- map<hobject_t,ObjectContext*> src_obc;
+ ObjectContextRef obc;
+ map<hobject_t,ObjectContextRef> src_obc;
tid_t rep_tid;
@@ -197,7 +203,7 @@ public:
list<ObjectStore::Transaction*> tls;
bool queue_snap_trimmer;
- RepGather(OpContext *c, ObjectContext *pi, tid_t rt,
+ RepGather(OpContext *c, ObjectContextRef pi, tid_t rt,
eversion_t lc) :
queue_item(this),
nref(1),
@@ -240,7 +246,7 @@ protected:
void eval_repop(RepGather*);
void issue_repop(RepGather *repop, utime_t now,
eversion_t old_last_update, bool old_exists, uint64_t old_size, eversion_t old_version);
- RepGather *new_repop(OpContext *ctx, ObjectContext *obc, tid_t rep_tid);
+ RepGather *new_repop(OpContext *ctx, ObjectContextRef obc, tid_t rep_tid);
void remove_repop(RepGather *repop);
void repop_ack(RepGather *repop,
int result, int ack_type,
@@ -276,50 +282,42 @@ protected:
friend struct C_OnPushCommit;
// projected object info
- map<hobject_t, ObjectContext*> object_contexts;
+ SharedPtrRegistry<hobject_t, ObjectContext> object_contexts;
map<object_t, SnapSetContext*> snapset_contexts;
+ Mutex snapset_contexts_lock;
// debug order that client ops are applied
map<hobject_t, map<client_t, tid_t> > debug_op_order;
- void populate_obc_watchers(ObjectContext *obc);
- void check_blacklisted_obc_watchers(ObjectContext *);
+ void populate_obc_watchers(ObjectContextRef obc);
+ void check_blacklisted_obc_watchers(ObjectContextRef obc);
void check_blacklisted_watchers();
void get_watchers(list<obj_watch_item_t> &pg_watchers);
- void get_obc_watchers(ObjectContext *obc, list<obj_watch_item_t> &pg_watchers);
+ void get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers);
public:
void handle_watch_timeout(WatchRef watch);
protected:
- ObjectContext *lookup_object_context(const hobject_t& soid) {
- if (object_contexts.count(soid)) {
- ObjectContext *obc = object_contexts[soid];
- obc->ref++;
- return obc;
- }
- return NULL;
- }
- ObjectContext *_lookup_object_context(const hobject_t& oid);
- ObjectContext *create_object_context(const object_info_t& oi, SnapSetContext *ssc);
- ObjectContext *get_object_context(const hobject_t& soid, bool can_create);
- void register_object_context(ObjectContext *obc) {
- if (!obc->registered) {
- assert(object_contexts.count(obc->obs.oi.soid) == 0);
- obc->registered = true;
- object_contexts[obc->obs.oi.soid] = obc;
- }
- if (obc->ssc)
- register_snapset_context(obc->ssc);
- }
+ ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
+ ObjectContextRef get_object_context(const hobject_t& soid, bool can_create);
void context_registry_on_change();
- void put_object_context(ObjectContext *obc);
- void put_object_contexts(map<hobject_t,ObjectContext*>& obcv);
+ void object_context_destructor_callback(ObjectContext *obc);
+ struct C_PG_ObjectContext : public Context {
+ ReplicatedPGRef pg;
+ ObjectContext *obc;
+ C_PG_ObjectContext(ReplicatedPG *p, ObjectContext *o) :
+ pg(p), obc(o) {}
+ void finish(int r) {
+ pg->object_context_destructor_callback(obc);
+ }
+ };
+
int find_object_context(const hobject_t& oid,
- ObjectContext **pobc,
+ ObjectContextRef *pobc,
bool can_create, snapid_t *psnapid=NULL);
- void add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t *stat);
+ void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat);
void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
@@ -327,6 +325,11 @@ protected:
SnapSetContext *get_snapset_context(const object_t& oid, const string &key,
ps_t seed, bool can_create, const string &nspace);
void register_snapset_context(SnapSetContext *ssc) {
+ Mutex::Locker l(snapset_contexts_lock);
+ _register_snapset_context(ssc);
+ }
+ void _register_snapset_context(SnapSetContext *ssc) {
+ assert(snapset_contexts_lock.is_locked());
if (!ssc->registered) {
assert(snapset_contexts.count(ssc->oid) == 0);
ssc->registered = true;
@@ -525,7 +528,7 @@ protected:
int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
int priority,
map<int, vector<PushOp> > *pushes);
- void calc_head_subsets(ObjectContext *obc, SnapSet& snapset, const hobject_t& head,
+ void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
pg_missing_t& missing,
const hobject_t &last_backfill,
interval_set<uint64_t>& data_subset,
@@ -535,17 +538,17 @@ protected:
interval_set<uint64_t>& data_subset,
map<hobject_t, interval_set<uint64_t> >& clone_subsets);
void prep_push_to_replica(
- ObjectContext *obc,
+ ObjectContextRef obc,
const hobject_t& oid,
int dest,
int priority,
PushOp *push_op);
void prep_push(int priority,
- ObjectContext *obc,
+ ObjectContextRef obc,
const hobject_t& oid, int dest,
PushOp *op);
void prep_push(int priority,
- ObjectContext *obc,
+ ObjectContextRef obc,
const hobject_t& soid, int peer,
eversion_t version,
interval_set<uint64_t> &data_subset,
@@ -648,8 +651,8 @@ protected:
}
};
struct C_OSD_OndiskWriteUnlock : public Context {
- ObjectContext *obc, *obc2;
- C_OSD_OndiskWriteUnlock(ObjectContext *o, ObjectContext *o2=0) : obc(o), obc2(o2) {}
+ ObjectContextRef obc, obc2;
+ C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {}
void finish(int r) {
obc->ondisk_write_unlock();
if (obc2)
@@ -657,17 +660,17 @@ protected:
}
};
struct C_OSD_OndiskWriteUnlockList : public Context {
- list<ObjectContext*> *pls;
- C_OSD_OndiskWriteUnlockList(list<ObjectContext*> *l) : pls(l) {}
+ list<ObjectContextRef> *pls;
+ C_OSD_OndiskWriteUnlockList(list<ObjectContextRef> *l) : pls(l) {}
void finish(int r) {
- for (list<ObjectContext*>::iterator p = pls->begin(); p != pls->end(); ++p)
+ for (list<ObjectContextRef>::iterator p = pls->begin(); p != pls->end(); ++p)
(*p)->ondisk_write_unlock();
}
};
struct C_OSD_AppliedRecoveredObject : public Context {
ReplicatedPGRef pg;
- ObjectContext *obc;
- C_OSD_AppliedRecoveredObject(ReplicatedPG *p, ObjectContext *o) :
+ ObjectContextRef obc;
+ C_OSD_AppliedRecoveredObject(ReplicatedPG *p, ObjectContextRef o) :
pg(p), obc(o) {}
void finish(int r) {
pg->_applied_recovered_object(obc);
@@ -730,7 +733,7 @@ protected:
void sub_op_modify_commit(RepModify *rm);
void sub_op_modify_reply(OpRequestRef op);
- void _applied_recovered_object(ObjectContext *obc);
+ void _applied_recovered_object(ObjectContextRef obc);
void _applied_recovered_object_replica();
void _committed_pushed_object(epoch_t epoch, eversion_t lc);
void recover_got(hobject_t oid, eversion_t v);
@@ -879,10 +882,10 @@ public:
void mark_all_unfound_lost(int what);
eversion_t pick_newest_available(const hobject_t& oid);
- ObjectContext *mark_object_lost(ObjectStore::Transaction *t,
+ ObjectContextRef mark_object_lost(ObjectStore::Transaction *t,
const hobject_t& oid, eversion_t version,
utime_t mtime, int what);
- void _finish_mark_all_unfound_lost(list<ObjectContext*>& obcs);
+ void _finish_mark_all_unfound_lost(list<ObjectContextRef>& obcs);
void on_role_change();
void on_change(ObjectStore::Transaction *t);
diff --git a/src/osd/Watch.cc b/src/osd/Watch.cc
index 8a084ca9aa1..ffa3adced24 100644
--- a/src/osd/Watch.cc
+++ b/src/osd/Watch.cc
@@ -250,15 +250,14 @@ public:
string Watch::gen_dbg_prefix() {
stringstream ss;
ss << pg->gen_prefix() << " -- Watch("
- << make_pair(cookie, entity)
- << ", obc->ref=" << (obc ? obc->ref : -1) << ") ";
+ << make_pair(cookie, entity) << ") ";
return ss.str();
}
Watch::Watch(
ReplicatedPG *pg,
OSDService *osd,
- ObjectContext *obc,
+ ObjectContextRef obc,
uint32_t timeout,
uint64_t cookie,
entity_name_t entity,
@@ -272,7 +271,6 @@ Watch::Watch(
addr(addr),
entity(entity),
discarded(false) {
- obc->get();
dout(10) << "Watch()" << dendl;
}
@@ -292,13 +290,6 @@ Context *Watch::get_delayed_cb()
return cb;
}
-ObjectContext *Watch::get_obc()
-{
- assert(obc);
- obc->get();
- return obc;
-}
-
void Watch::register_cb()
{
Mutex::Locker l(osd->watch_lock);
@@ -370,8 +361,7 @@ void Watch::discard_state()
sessionref->put();
conn = ConnectionRef();
}
- pg->put_object_context(obc);
- obc = NULL;
+ obc = ObjectContextRef();
}
bool Watch::is_discarded()
@@ -428,7 +418,7 @@ void Watch::notify_ack(uint64_t notify_id)
WatchRef Watch::makeWatchRef(
ReplicatedPG *pg, OSDService *osd,
- ObjectContext *obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, entity_addr_t addr)
+ ObjectContextRef obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, entity_addr_t addr)
{
WatchRef ret(new Watch(pg, osd, obc, timeout, cookie, entity, addr));
ret->set_self(ret);
diff --git a/src/osd/Watch.h b/src/osd/Watch.h
index 1c9fa28cb65..ecb61ad8b72 100644
--- a/src/osd/Watch.h
+++ b/src/osd/Watch.h
@@ -151,7 +151,7 @@ class Watch {
OSDService *osd;
boost::intrusive_ptr<ReplicatedPG> pg;
- ObjectContext *obc;
+ std::tr1::shared_ptr<ObjectContext> obc;
std::map<uint64_t, NotifyRef> in_progress_notifies;
@@ -165,7 +165,7 @@ class Watch {
Watch(
ReplicatedPG *pg, OSDService *osd,
- ObjectContext *obc, uint32_t timeout,
+ std::tr1::shared_ptr<ObjectContext> obc, uint32_t timeout,
uint64_t cookie, entity_name_t entity,
entity_addr_t addr);
@@ -187,7 +187,7 @@ public:
string gen_dbg_prefix();
static WatchRef makeWatchRef(
ReplicatedPG *pg, OSDService *osd,
- ObjectContext *obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, entity_addr_t addr);
+ std::tr1::shared_ptr<ObjectContext> obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, entity_addr_t addr);
void set_self(WatchRef _self) {
self = _self;
}
@@ -195,8 +195,8 @@ public:
/// Does not grant a ref count!
boost::intrusive_ptr<ReplicatedPG> get_pg() { return pg; }
- /// Grants a ref count!
- ObjectContext *get_obc();
+ std::tr1::shared_ptr<ObjectContext> get_obc() { return obc; }
+
uint64_t get_cookie() const { return cookie; }
entity_name_t get_entity() const { return entity; }
entity_addr_t get_peer_addr() const { return addr; }
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index fc976ceaf1a..9b2beb7e8a5 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -2008,6 +2008,8 @@ struct ObjectState {
object_info_t oi;
bool exists;
+ ObjectState() : exists(false) {}
+
ObjectState(const object_info_t &oi_, bool exists_)
: oi(oi_), exists(exists_) {}
};
@@ -2029,13 +2031,18 @@ struct SnapSetContext {
* etc., because we don't send writes down to disk until after
* replicas ack.
*/
+
+struct ObjectContext;
+
+typedef std::tr1::shared_ptr<ObjectContext> ObjectContextRef;
+
struct ObjectContext {
- int ref;
- bool registered;
ObjectState obs;
SnapSetContext *ssc; // may be null
+ Context *destructor_callback;
+
private:
Mutex lock;
public:
@@ -2043,20 +2050,22 @@ public:
int unstable_writes, readers, writers_waiting, readers_waiting;
// set if writes for this object are blocked on another objects recovery
- ObjectContext *blocked_by; // object blocking our writes
- set<ObjectContext*> blocking; // objects whose writes we block
+ ObjectContextRef blocked_by; // object blocking our writes
+ set<ObjectContextRef> blocking; // objects whose writes we block
// any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers.
map<pair<uint64_t, entity_name_t>, WatchRef> watchers;
- ObjectContext(const object_info_t &oi_, bool exists_, SnapSetContext *ssc_)
- : ref(0), registered(false), obs(oi_, exists_), ssc(ssc_),
+ ObjectContext()
+ : ssc(NULL),
+ destructor_callback(0),
lock("ReplicatedPG::ObjectContext::lock"),
- unstable_writes(0), readers(0), writers_waiting(0), readers_waiting(0),
- blocked_by(0) {}
-
- void get() { ++ref; }
+ unstable_writes(0), readers(0), writers_waiting(0), readers_waiting(0) {}
+ ~ObjectContext() {
+ if (destructor_callback)
+ destructor_callback->complete(0);
+ }
// do simple synchronous mutual exclusion, for now. now waitqueues or anything fancy.
void ondisk_write_lock() {
lock.Lock();
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 51fad699555..01eeccc03be 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -30,6 +30,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, loff_t o
// split off right
ObjectCacher::BufferHead *right = new BufferHead(this);
right->last_write_tid = left->last_write_tid;
+ right->last_read_tid = left->last_read_tid;
right->set_state(left->get_state());
right->snapc = left->snapc;
@@ -113,6 +114,10 @@ void ObjectCacher::Object::try_merge_bh(BufferHead *bh)
assert(oc->lock.is_locked());
ldout(oc->cct, 10) << "try_merge_bh " << *bh << dendl;
+ // do not merge rx buffers; last_read_tid may not match
+ if (bh->is_rx())
+ return;
+
// to the left?
map<loff_t,BufferHead*>::iterator p = data.find(bh->start());
assert(p->second == bh);
@@ -500,6 +505,7 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name, WritebackHandler& wb,
max_size(max_bytes), max_objects(max_objects),
block_writes_upfront(block_writes_upfront),
flush_set_callback(flush_callback), flush_set_callback_arg(flush_callback_arg),
+ last_read_tid(0),
flusher_stop(false), flusher_thread(this), finisher(cct),
stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0),
stat_error(0), stat_dirty_waiting(0), reads_outstanding(0)
@@ -603,25 +609,29 @@ void ObjectCacher::bh_read(BufferHead *bh)
<< reads_outstanding << dendl;
mark_rx(bh);
+ bh->last_read_tid = ++last_read_tid;
// finisher
- C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob,
+ C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid,
bh->start(), bh->length());
// go
writeback_handler.read(bh->ob->get_oid(), bh->ob->get_oloc(),
bh->start(), bh->length(), bh->ob->get_snap(),
&onfinish->bl, bh->ob->truncate_size, bh->ob->truncate_seq,
onfinish);
+
++reads_outstanding;
}
-void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
- uint64_t length, bufferlist &bl, int r,
+void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, tid_t tid,
+ loff_t start, uint64_t length,
+ bufferlist &bl, int r,
bool trust_enoent)
{
assert(lock.is_locked());
ldout(cct, 7) << "bh_read_finish "
<< oid
+ << " tid " << tid
<< " " << start << "~" << length
<< " (bl is " << bl.length() << ")"
<< " returned " << r
@@ -711,7 +721,7 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
BufferHead *bh = p->second;
ldout(cct, 20) << "checking bh " << *bh << dendl;
-
+
// finishers?
for (map<loff_t, list<Context*> >::iterator it = bh->waitfor_read.begin();
it != bh->waitfor_read.end();
@@ -720,9 +730,9 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
bh->waitfor_read.clear();
if (bh->start() > opos) {
- ldout(cct, 1) << "weirdness: gap when applying read results, "
- << opos << "~" << bh->start() - opos
- << dendl;
+ ldout(cct, 1) << "bh_read_finish skipping gap "
+ << opos << "~" << bh->start() - opos
+ << dendl;
opos = bh->start();
continue;
}
@@ -733,6 +743,13 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start,
continue;
}
+ if (bh->last_read_tid != tid) {
+ ldout(cct, 10) << "bh_read_finish bh->last_read_tid " << bh->last_read_tid
+ << " != tid " << tid << ", skipping" << dendl;
+ opos = bh->end();
+ continue;
+ }
+
assert(opos >= bh->start());
assert(bh->start() == opos); // we don't merge rx bh's... yet!
assert(bh->length() <= start+(loff_t)length-opos);
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index 7d5ce6fad04..a62a41fd8ce 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -104,6 +104,7 @@ class ObjectCacher {
Object *ob;
bufferlist bl;
tid_t last_write_tid; // version of bh (if non-zero)
+ tid_t last_read_tid; // tid of last read op (if any)
utime_t last_write;
SnapContext snapc;
int error; // holds return value for failed reads
@@ -116,6 +117,7 @@ class ObjectCacher {
ref(0),
ob(o),
last_write_tid(0),
+ last_read_tid(0),
error(0) {
ex.start = ex.length = 0;
}
@@ -339,6 +341,8 @@ class ObjectCacher {
vector<hash_map<sobject_t, Object*> > objects; // indexed by pool_id
+ tid_t last_read_tid;
+
set<BufferHead*> dirty_bh;
LRU bh_lru_dirty, bh_lru_rest;
LRU ob_lru;
@@ -455,8 +459,9 @@ class ObjectCacher {
bool external_call);
public:
- void bh_read_finish(int64_t poolid, sobject_t oid, loff_t offset,
- uint64_t length, bufferlist &bl, int r,
+ void bh_read_finish(int64_t poolid, sobject_t oid, tid_t tid,
+ loff_t offset, uint64_t length,
+ bufferlist &bl, int r,
bool trust_enoent);
void bh_write_commit(int64_t poolid, sobject_t oid, loff_t offset,
uint64_t length, tid_t t, int r);
@@ -469,17 +474,20 @@ class ObjectCacher {
uint64_t length;
xlist<C_ReadFinish*>::item set_item;
bool trust_enoent;
+ tid_t tid;
public:
bufferlist bl;
- C_ReadFinish(ObjectCacher *c, Object *ob, loff_t s, uint64_t l) :
+ C_ReadFinish(ObjectCacher *c, Object *ob, tid_t t, loff_t s, uint64_t l) :
oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l),
- set_item(this), trust_enoent(true) {
+ set_item(this), trust_enoent(true),
+ tid(t) {
ob->reads.push_back(&set_item);
}
void finish(int r) {
- oc->bh_read_finish(poolid, oid, start, length, bl, r, trust_enoent);
+ oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent);
+
// object destructor clears the list
if (set_item.is_on_list())
set_item.remove_myself();
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 8782fd9e8a5..9fb0bfa446d 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -625,10 +625,10 @@ void Objecter::handle_osd_map(MOSDMap *m)
// was/is paused?
if (was_pauserd || was_pausewr || pauserd || pausewr)
maybe_request_map();
-
+
// unpause requests?
if ((was_pauserd && !pauserd) ||
- (was_pausewr && !pausewr))
+ (was_pausewr && !pausewr)) {
for (map<tid_t,Op*>::iterator p = ops.begin();
p != ops.end();
++p) {
@@ -638,6 +638,16 @@ void Objecter::handle_osd_map(MOSDMap *m)
!((op->flags & CEPH_OSD_FLAG_WRITE) && pausewr)) // not still paused as a write
need_resend[op->tid] = op;
}
+ for (map<tid_t, LingerOp*>::iterator lp = linger_ops.begin();
+ lp != linger_ops.end();
+ ++lp) {
+ LingerOp *op = lp->second;
+ if (!op->registered &&
+ !pauserd && // not still paused as a read
+ !((op->flags & CEPH_OSD_FLAG_WRITE) && pausewr)) // not still paused as a write
+ need_resend_linger.push_back(op);
+ }
+ }
// resend requests
for (map<tid_t, Op*>::iterator p = need_resend.begin(); p != need_resend.end(); ++p) {
@@ -2232,8 +2242,7 @@ void Objecter::dump_linger_ops(Formatter *fmt) const
fmt->dump_stream("object_id") << op->oid;
fmt->dump_stream("object_locator") << op->oloc;
fmt->dump_stream("snapid") << op->snap;
- fmt->dump_stream("registering") << op->snap;
- fmt->dump_stream("registered") << op->snap;
+ fmt->dump_stream("registered") << op->registered;
fmt->close_section(); // linger_op object
}
fmt->close_section(); // linger_ops array
diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py
index 421cc59edcc..c53c3d77737 100755
--- a/src/pybind/ceph_rest_api.py
+++ b/src/pybind/ceph_rest_api.py
@@ -5,6 +5,7 @@ import errno
import json
import logging
import logging.handlers
+import os
import rados
import textwrap
import xml.etree.ElementTree
@@ -26,6 +27,7 @@ DEFAULT_ID = 'restapi'
DEFAULT_BASEURL = '/api/v0.1'
DEFAULT_LOG_LEVEL = 'warning'
+DEFAULT_LOGDIR = '/var/log/ceph'
# default client name will be 'client.<DEFAULT_ID>'
# 'app' must be global for decorators, etc.
@@ -117,7 +119,18 @@ def api_setup(app, conf, cluster, clientname, clientid, args):
loglevel = app.ceph_cluster.conf_get('restapi_log_level') \
or DEFAULT_LOG_LEVEL
+ # ceph has a default log file for daemons only; clients (like this)
+ # default to "". Override that for this particular client.
logfile = app.ceph_cluster.conf_get('log_file')
+ if not logfile:
+ logfile = os.path.join(
+ DEFAULT_LOGDIR,
+ '{cluster}-{clientname}.{pid}.log'.format(
+ cluster=cluster,
+ clientname=clientname,
+ pid=os.getpid()
+ )
+ )
app.logger.addHandler(logging.handlers.WatchedFileHandler(logfile))
app.logger.setLevel(LOGLEVELS[loglevel.lower()])
for h in app.logger.handlers:
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 2b8a716115b..644a1760aaf 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -114,6 +114,7 @@ void _usage()
cerr << " --access=<access> Set access permissions for sub-user, should be one\n";
cerr << " of read, write, readwrite, full\n";
cerr << " --display-name=<name>\n";
+ cerr << " --system set the system flag on the user\n";
cerr << " --bucket=<bucket>\n";
cerr << " --pool=<pool>\n";
cerr << " --object=<object>\n";
@@ -853,6 +854,9 @@ int main(int argc, char **argv)
cerr << "ERROR: invalid replica log type" << std::endl;
return EINVAL;
}
+ } else if (strncmp(*i, "-", 1) == 0) {
+ cerr << "ERROR: invalid flag " << *i << std::endl;
+ return EINVAL;
} else {
++i;
}
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index 1e523f332cf..5356417f09a 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -1451,7 +1451,12 @@ public:
if (ret < 0)
return ret;
- ret = rgw_unlink_bucket(store, be.owner, entry);
+ /*
+ * We're unlinking the bucket but we don't want to update the entrypoint here — we're removing
+ * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
+ * will incorrectly fail.
+ */
+ ret = rgw_unlink_bucket(store, be.owner, entry, false);
if (ret < 0) {
lderr(store->ctx()) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
}
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
index b6c4e15eede..601fcdfc963 100644
--- a/src/rgw/rgw_cache.h
+++ b/src/rgw/rgw_cache.h
@@ -177,14 +177,13 @@ class RGWCache : public T
if (ret < 0)
return ret;
- ret = T::init_watch();
- return ret;
+ return 0;
}
- void finalize() {
- T::finalize_watch();
- T::finalize();
+ bool need_watch_notify() {
+ return true;
}
+
int distribute_cache(const string& normal_name, rgw_obj& obj, ObjectCacheInfo& obj_info, int op);
int watch_cb(int opcode, uint64_t ver, bufferlist& bl);
public:
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 222b79a7d2e..03cc1ebfdb3 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -817,6 +817,9 @@ void RGWRadosCtx::set_prefetch_data(rgw_obj& obj) {
void RGWRados::finalize()
{
+ if (need_watch_notify()) {
+ finalize_watch();
+ }
delete meta_mgr;
delete data_log;
if (use_gc_thread) {
@@ -872,6 +875,14 @@ int RGWRados::init_complete()
{
int ret;
+ if (need_watch_notify()) {
+ ret = init_watch();
+ if (ret < 0) {
+ lderr(cct) << "ERROR: failed to initialize watch" << dendl;
+ return ret;
+ }
+ }
+
ret = region.init(cct, this);
if (ret < 0)
return ret;
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index d01f76ec224..e6ab244afa9 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -1254,6 +1254,7 @@ public:
virtual int update_containers_stats(map<string, RGWBucketEnt>& m);
virtual int append_async(rgw_obj& obj, size_t size, bufferlist& bl);
+ virtual bool need_watch_notify() { return false; }
virtual int init_watch();
virtual void finalize_watch();
virtual int distribute(const string& key, bufferlist& bl);
diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h
index 93d0809d491..5cffce3ef04 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.h
+++ b/src/test/ObjectMap/KeyValueDBMemory.h
@@ -126,6 +126,24 @@ public:
return static_cast<TransactionImpl_*>(trans.get())->complete();
}
+ uint64_t get_estimated_size(map<string,uint64_t> &extras) {
+ uint64_t total_size = 0;
+
+ for (map<pair<string,string>,bufferlist>::iterator p = db.begin();
+ p != db.end(); ++p) {
+ string prefix = p->first.first;
+ bufferlist &bl = p->second;
+
+ uint64_t sz = bl.length();
+ total_size += sz;
+ if (extras.count(prefix) == 0)
+ extras[prefix] = 0;
+ extras[prefix] += sz;
+ }
+
+ return total_size;
+ }
+
private:
bool exists_prefix(const string &prefix) {
std::map<std::pair<string,string>,bufferlist>::iterator it;
diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
index ace91220df6..f81598ccfb8 100644
--- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc
+++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
@@ -90,6 +90,17 @@ class StoreTool
exists = false;
return bufferlist();
}
+
+ uint64_t get_size() {
+ map<string,uint64_t> extras;
+ uint64_t s = db->get_estimated_size(extras);
+ for (map<string,uint64_t>::iterator p = extras.begin();
+ p != extras.end(); ++p) {
+ std::cout << p->first << " - " << p->second << std::endl;
+ }
+ std::cout << "total: " << s << std::endl;
+ return s;
+ }
};
void usage(const char *pname)
@@ -101,6 +112,7 @@ void usage(const char *pname)
<< " exists <prefix> [key]\n"
<< " get <prefix> <key>\n"
<< " verify <store path>\n"
+ << " get-size\n"
<< std::endl;
}
@@ -173,6 +185,8 @@ int main(int argc, const char *argv[])
} else if (cmd == "verify") {
assert(0);
+ } else if (cmd == "get-size") {
+ std::cout << "estimated store size: " << st.get_size() << std::endl;
} else {
std::cerr << "Unrecognized command: " << cmd << std::endl;
return 1;
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 21f51e68c35..90f6beca133 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -75,6 +75,7 @@
--access=<access> Set access permissions for sub-user, should be one
of read, write, readwrite, full
--display-name=<name>
+ --system set the system flag on the user
--bucket=<bucket>
--pool=<pool>
--object=<object>
diff --git a/src/test/common/test_sharedptr_registry.cc b/src/test/common/test_sharedptr_registry.cc
index aec2107c9e5..b1713a9bd9f 100644
--- a/src/test/common/test_sharedptr_registry.cc
+++ b/src/test/common/test_sharedptr_registry.cc
@@ -137,8 +137,8 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
EXPECT_TRUE(registry.lookup_or_create(key + 12345));
registry.remove(key);
ASSERT_TRUE(wait_for(registry, 0));
- EXPECT_TRUE(t.ptr);
t.join();
+ EXPECT_TRUE(t.ptr);
}
{
unsigned int key = 2;
@@ -163,9 +163,9 @@ TEST_F(SharedPtrRegistry_all, wait_lookup_or_create) {
}
registry.remove(key);
ASSERT_TRUE(wait_for(registry, 0));
+ t.join();
EXPECT_TRUE(t.ptr);
EXPECT_EQ(value, *t.ptr);
- t.join();
}
}
@@ -200,8 +200,8 @@ TEST_F(SharedPtrRegistry_all, wait_lookup) {
EXPECT_FALSE(registry.lookup(key + 12345));
registry.remove(key);
ASSERT_TRUE(wait_for(registry, 0));
- EXPECT_FALSE(t.ptr);
t.join();
+ EXPECT_FALSE(t.ptr);
}
TEST_F(SharedPtrRegistry_all, get_next) {
@@ -238,6 +238,24 @@ TEST_F(SharedPtrRegistry_all, get_next) {
EXPECT_FALSE(registry.get_next(i.first, &i));
}
+ {
+ //
+ // http://tracker.ceph.com/issues/6117
+ // reproduce the issue.
+ //
+ SharedPtrRegistryTest registry;
+ const unsigned int key1 = 111;
+ shared_ptr<int> *ptr1 = new shared_ptr<int>(registry.lookup_or_create(key1));
+ const unsigned int key2 = 222;
+ shared_ptr<int> ptr2 = registry.lookup_or_create(key2);
+
+ pair<unsigned int, shared_ptr<int> > i;
+ EXPECT_TRUE(registry.get_next(i.first, &i));
+ EXPECT_EQ(key1, i.first);
+ delete ptr1;
+ EXPECT_TRUE(registry.get_next(i.first, &i));
+ EXPECT_EQ(key2, i.first);
+ }
}
class SharedPtrRegistry_destructor : public ::testing::Test {
diff --git a/src/test/mon/moncap.cc b/src/test/mon/moncap.cc
index 19f82f55ecf..238442b90d8 100644
--- a/src/test/mon/moncap.cc
+++ b/src/test/mon/moncap.cc
@@ -51,6 +51,7 @@ const char *parse_good[] = {
"allow service foo-foo r, allow service bar r",
"allow service \" foo \" w, allow service bar r",
"allow command abc with arg=foo arg2=bar, allow service foo r",
+ "allow command abc.def with arg=foo arg2=bar, allow service foo r",
"allow command \"foo bar\" with arg=\"baz\"",
"allow command \"foo bar\" with arg=\"baz.xx\"",
0
diff --git a/src/test/osd/osdcap.cc b/src/test/osd/osdcap.cc
index 5f7c607deec..8fc3ddd812a 100644
--- a/src/test/osd/osdcap.cc
+++ b/src/test/osd/osdcap.cc
@@ -41,6 +41,7 @@ const char *parse_good[] = {
"allow rwx pool foo; allow r pool bar",
"allow auid 123 rwx",
"allow pool foo rwx, allow pool bar r",
+ "allow pool foo.froo.foo rwx, allow pool bar r",
"allow pool foo rwx ; allow pool bar r",
"allow pool foo rwx ;allow pool bar r",
"allow pool foo rwx; allow pool bar r",
diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py
index a8df299c879..9be4c1eb815 100644
--- a/src/test/pybind/test_rados.py
+++ b/src/test/pybind/test_rados.py
@@ -31,12 +31,6 @@ def test_rados_init():
with Rados(conffile='', name='client.admin'):
pass
-def test_rados_parse_conf():
- with Rados(conffile='', rados_id='admin') as rados:
- rados.parse_env()
- rados.parse_env('FOO_DOES_NOT_EXIST_BLAHBLAH')
- pass
-
def test_ioctx_context_manager():
with Rados(conffile='', rados_id='admin') as conn:
with conn.open_ioctx('data') as ioctx:
@@ -46,6 +40,8 @@ class TestRados(object):
def setUp(self):
self.rados = Rados(conffile='')
+ self.rados.conf_parse_env('FOO_DOES_NOT_EXIST_BLAHBLAH')
+ self.rados.conf_parse_env()
self.rados.connect()
def tearDown(self):
diff --git a/src/test/test_osd_types.cc b/src/test/test_osd_types.cc
index 2a402aa4bf6..34674358285 100644
--- a/src/test/test_osd_types.cc
+++ b/src/test/test_osd_types.cc
@@ -1009,8 +1009,7 @@ protected:
TEST_F(ObjectContextTest, read_write_lock)
{
{
- object_info_t oi;
- ObjectContext obc(oi, false, NULL);
+ ObjectContext obc;
//
// write_lock
@@ -1045,8 +1044,7 @@ TEST_F(ObjectContextTest, read_write_lock)
useconds_t delay = 0;
{
- object_info_t oi;
- ObjectContext obc(oi, false, NULL);
+ ObjectContext obc;
//
// write_lock
@@ -1103,8 +1101,7 @@ TEST_F(ObjectContextTest, read_write_lock)
}
{
- object_info_t oi;
- ObjectContext obc(oi, false, NULL);
+ ObjectContext obc;
//
// read_lock
diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc
index 4ab8fa86465..8f294c4a4e3 100644
--- a/src/tools/ceph-monstore-tool.cc
+++ b/src/tools/ceph-monstore-tool.cc
@@ -179,7 +179,7 @@ int main(int argc, char **argv) {
int fd;
if (vm.count("out")) {
- if ((fd = open(out_path.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666)) == -1) {
+ if ((fd = open(out_path.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
int _err = errno;
if (_err != EISDIR) {
std::cerr << "Couldn't open " << out_path << ": " << cpp_strerror(_err) << std::endl;
@@ -214,6 +214,7 @@ int main(int argc, char **argv) {
} else if (cmd == "compact") {
st.compact();
} else if (cmd == "getmonmap") {
+ assert(fd >= 0);
if (!store_path.size()) {
std::cerr << "need mon store path" << std::endl;
std::cerr << desc << std::endl;
diff --git a/src/vstart.sh b/src/vstart.sh
index 7ce4628d775..1a6f4f957b9 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -17,6 +17,11 @@ set -e
[ -z "$CEPH_NUM_MDS" ] && CEPH_NUM_MDS=3
[ -z "$CEPH_NUM_RGW" ] && CEPH_NUM_RGW=1
+[ -z "$CEPH_DIR" ] && CEPH_DIR="$PWD/"
+[ -z "$CEPH_DEV_DIR" ] && CEPH_DEV_DIR="$CEPH_DIR/dev"
+[ -z "$CEPH_OUT_DIR" ] && CEPH_OUT_DIR="$CEPH_DIR/out"
+[ -z "$CEPH_RGW_PORT" ] && CEPH_RGW_PORT=8000
+
extra_conf=""
new=0
standby=0
@@ -34,9 +39,9 @@ cephx=1 #turn cephx on by default
MON_ADDR=""
-conf="ceph.conf"
+conf="$CEPH_DIR/ceph.conf"
-keyring_fn="$PWD/keyring"
+keyring_fn="$CEPH_DIR/keyring"
osdmap_fn="/tmp/ceph_osdmap.$$"
monmap_fn="/tmp/ceph_monmap.$$"
@@ -223,7 +228,7 @@ fi
# sudo if btrfs
-test -d dev/osd0/. && test -e dev/sudo && SUDO="sudo"
+test -d $CEPH_DEV_DIR/osd0/. && test -e $CEPH_DEV_DIR/sudo && SUDO="sudo"
if [ "$start_all" -eq 1 ]; then
$SUDO $CEPH_BIN/init-ceph stop
@@ -275,11 +280,11 @@ do
done
DAEMONOPTS="
- log file = out/\$name.log
- admin socket = out/\$name.asok
+ log file = $CEPH_OUT_DIR/\$name.log
+ admin socket = $CEPH_OUT_DIR/\$name.asok
chdir = \"\"
- pid file = out/\$name.pid
- heartbeat file = out/\$name.heartbeat
+ pid file = $CEPH_OUT_DIR/\$name.pid
+ heartbeat file = $CEPH_OUT_DIR/\$name.heartbeat
"
@@ -294,7 +299,7 @@ if [ "$start_mon" -eq 1 ]; then
osd pgp bits = 5 ; (invalid, but ceph should cope!)
osd crush chooseleaf type = 0
osd pool default min size = 1
- run dir = out
+ run dir = $CEPH_OUT_DIR
EOF
if [ "$cephx" -eq 1 ] ; then
cat <<EOF >> $conf
@@ -311,7 +316,7 @@ fi
[client]
keyring = $keyring_fn
- log file = out/\$name.\$pid.log
+ log file = $CEPH_OUT_DIR/\$name.\$pid.log
[mds]
$DAEMONOPTS
@@ -319,12 +324,12 @@ $CMDSDEBUG
mds debug frag = true
mds debug auth pins = true
mds debug subtrees = true
- mds data = dev/mds.\$id
+ mds data = $CEPH_DEV_DIR/mds.\$id
$extra_conf
[osd]
$DAEMONOPTS
- osd data = dev/osd\$id
- osd journal = dev/osd\$id.journal
+ osd data = $CEPH_DEV_DIR/osd\$id
+ osd journal = $CEPH_DEV_DIR/osd\$id.journal
osd journal size = 100
osd class tmp = out
osd class dir = .libs
@@ -336,7 +341,7 @@ $extra_conf
$DAEMONOPTS
$CMONDEBUG
$extra_conf
- mon cluster log file = out/cluster.mon.\$id.log
+ mon cluster log file = $CEPH_OUT_DIR/cluster.mon.\$id.log
[global]
$extra_conf
EOF
@@ -368,7 +373,7 @@ EOF
cat <<EOF >> $conf
[mon.$f]
host = $HOSTNAME
- mon data = dev/mon.$f
+ mon data = $CEPH_DEV_DIR/mon.$f
mon addr = $IP:$(($CEPH_PORT+$count))
EOF
fi
@@ -380,10 +385,10 @@ EOF
for f in $MONS
do
- cmd="rm -rf dev/mon.$f"
+ cmd="rm -rf $CEPH_DEV_DIR/mon.$f"
echo $cmd
$cmd
- cmd="mkdir dev/mon.$f"
+ cmd="mkdir $CEPH_DEV_DIR/mon.$f"
echo $cmd
$cmd
cmd="$CEPH_BIN/ceph-mon --mkfs -c $conf -i $f --monmap=$monmap_fn"
@@ -414,9 +419,9 @@ if [ "$start_osd" -eq 1 ]; then
[osd.$osd]
host = $HOSTNAME
EOF
- rm -rf dev/osd$osd || true
- for f in dev/osd$osd/* ; do btrfs sub delete $f || true ; done || true
- mkdir -p dev/osd$osd
+ rm -rf $CEPH_DEV_DIR/osd$osd || true
+ for f in $CEPH_DEV_DIR/osd$osd/* ; do btrfs sub delete $f || true ; done || true
+ mkdir -p $CEPH_DEV_DIR/osd$osd
fi
uuid=`uuidgen`
@@ -425,7 +430,7 @@ EOF
$SUDO $CEPH_ADM osd crush add osd.$osd 1.0 host=localhost rack=localrack root=default
$SUDO $CEPH_BIN/ceph-osd -i $osd $ARGS --mkfs --mkkey --osd-uuid $uuid
- key_fn=dev/osd$osd/keyring
+ key_fn=$CEPH_DEV_DIR/osd$osd/keyring
echo adding osd$osd key to auth repository
$SUDO $CEPH_ADM -i $key_fn auth add osd.$osd osd "allow *" mon "allow profile osd"
fi
@@ -448,15 +453,15 @@ if [ "$start_mds" -eq 1 ]; then
for name in a b c d e f g h i j k l m n o p
do
if [ "$new" -eq 1 ]; then
- mkdir -p dev/mds.$name
- key_fn=dev/mds.$name/keyring
+ mkdir -p $CEPH_DEV_DIR/mds.$name
+ key_fn=$CEPH_DEV_DIR/mds.$name/keyring
if [ $overwrite_conf -eq 1 ]; then
cat <<EOF >> $conf
[mds.$name]
host = $HOSTNAME
EOF
if [ "$standby" -eq 1 ]; then
- mkdir -p dev/mds.${name}s
+ mkdir -p $CEPH_DEV_DIR/mds.${name}s
cat <<EOF >> $conf
mds standby for rank = $mds
[mds.${name}s]
@@ -469,8 +474,8 @@ EOF
$SUDO $CEPH_ADM -i $key_fn auth add mds.$name mon 'allow profile mds' osd 'allow *' mds 'allow'
if [ "$standby" -eq 1 ]; then
$SUDO $CEPH_BIN/ceph-authtool --create-keyring --gen-key --name=mds.${name}s \
- dev/mds.${name}s/keyring
- $SUDO $CEPH_ADM -i dev/mds.${name}s/keyring auth add mds.${name}s \
+ $CEPH_DEV_DIR/mds.${name}s/keyring
+ $SUDO $CEPH_ADM -i $CEPH_DEV_DIR/mds.${name}s/keyring auth add mds.${name}s \
mon 'allow *' osd 'allow *' mds 'allow'
fi
fi
@@ -497,7 +502,7 @@ fi
if [ "$start_rgw" -eq 1 ]; then
for rgw in `seq 0 $((CEPH_NUM_RGW-1))`
do
- rgwport=$(( 8000 + $rgw ))
+ rgwport=$(( $CEPH_RGW_PORT + $rgw ))
if [ "$new" -eq 1 ]; then
if [ $overwrite_conf -eq 1 ]; then
dnsname=`hostname -f`
@@ -505,13 +510,13 @@ if [ "$start_rgw" -eq 1 ]; then
[client.radosgw.rgw$rgw]
host = $HOSTNAME
$DAEMONOPTS
- keyring = out/keyring.client.radosgw.rgw$rgw
- rgw socket path = out/sock.client.radosgw.rgw$rgw
+ keyring = $CEPH_OUT_DIR/keyring.client.radosgw.rgw$rgw
+ rgw socket path = $CEPH_OUT_DIR/sock.client.radosgw.rgw$rgw
rgw dns name = $dnsname
EOF
- mkdir -p out/htdocs
- mkdir -p out/fastcgi_sock
- cat <<EOF > out/apache.conf
+ mkdir -p $CEPH_OUT_DIR/htdocs
+ mkdir -p $CEPH_OUT_DIR/fastcgi_sock
+ cat <<EOF > $CEPH_OUT_DIR/apache.conf
LoadModule env_module /usr/lib/apache2/modules/mod_env.so
LoadModule rewrite_module /usr/lib/apache2/modules/mod_rewrite.so
LoadModule fastcgi_module /usr/lib/apache2/modules/mod_fastcgi.so
@@ -519,14 +524,14 @@ LoadModule fastcgi_module /usr/lib/apache2/modules/mod_fastcgi.so
Listen $rgwport
ServerName rgwtest.example.com
-ServerRoot $PWD/out
-ErrorLog $PWD/out/apache.error.log
+ServerRoot $CEPH_OUT_DIR
+ErrorLog $CEPH_OUT_DIR/apache.error.log
LogFormat "%h l %u %t \"%r\" %>s %b \"{Referer}i\" \"%{User-agent}i\"" combined
-CustomLog $PWD/out/apache.access.log combined
-PidFile $PWD/out/apache.pid
-DocumentRoot $PWD/out/htdocs
-FastCgiIPCDir $PWD/out/fastcgi_sock
-FastCgiExternalServer $PWD/out/htdocs/rgw.fcgi -socket $PWD/out/sock.client.radosgw.rgw$rgw
+CustomLog $CEPH_OUT_DIR/apache.access.log combined
+PidFile $CEPH_OUT_DIR/apache.pid
+DocumentRoot $CEPH_OUT_DIR/htdocs
+FastCgiIPCDir $CEPH_OUT_DIR/fastcgi_sock
+FastCgiExternalServer $CEPH_OUT_DIR/htdocs/rgw.fcgi -socket $CEPH_OUT_DIR/sock.client.radosgw.rgw$rgw
RewriteEngine On
RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /rgw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
@@ -537,7 +542,7 @@ SetEnv RGW_LOG_LEVEL 20
SetEnv RGW_PRINT_CONTINUE yes
SetEnv RGW_SHOULD_LOG yes
-<Directory $PWD/out/htdocs>
+<Directory $CEPH_OUT_DIR/htdocs>
Options +ExecCGI
AllowOverride All
SetHandler fastcgi-script
@@ -546,7 +551,7 @@ SetEnv RGW_SHOULD_LOG yes
AllowEncodedSlashes On
ServerSignature Off
EOF
- $SUDO $CEPH_ADM auth get-or-create client.radosgw.rgw$rgw osd 'allow rwx' mon 'allow r' -o out/keyring.client.radosgw.rgw$rgw
+ $SUDO $CEPH_ADM auth get-or-create client.radosgw.rgw$rgw osd 'allow rwx' mon 'allow r' -o $CEPH_OUT_DIR/keyring.client.radosgw.rgw$rgw
#akey=`echo $$ | md5sum | cut -c 1-20`
#skey=`dd if=/dev/urandom of=/tmp/random.$$ bs=1 count=40 2>/dev/null ; base64 < /tmp/random.$$ ; rm /tmp/random.$$`
@@ -554,12 +559,12 @@ EOF
skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=='
echo access key $akey
echo secret key $skey
- $CEPH_BIN/radosgw-admin user create --uid tester --access-key $akey --secret $skey --display-name 'M. Tester' --email tester@ceph.com
+ $CEPH_BIN/radosgw-admin user create --uid tester --access-key $akey --secret $skey --display-name 'M. Tester' --email tester@ceph.com -c $conf
fi
fi
echo start rgw$rgw on http://localhost:$rgwport
run 'rgw' $SUDO $CEPH_BIN/radosgw -n client.radosgw.rgw$rgw $ARGS
- run 'apache2' $SUDO apache2 -f $PWD/out/apache.conf
+ run 'apache2' $SUDO apache2 -f $CEPH_OUT_DIR/apache.conf
done
fi
diff --git a/src/yasm-wrapper b/src/yasm-wrapper
new file mode 100755
index 00000000000..57d95def46e
--- /dev/null
+++ b/src/yasm-wrapper
@@ -0,0 +1,38 @@
+#!/bin/sh -e
+
+# libtool and yasm do not get along.
+# filter out any crap that libtool feeds us that yasm does not understand.
+new=""
+touch=""
+while [ -n "$*" ]; do
+ case "$1" in
+ -f )
+ shift
+ new="-f $1"
+ shift
+ ;;
+ -g* | -f* | -W* | -MD | -MP | -fPIC | -c | -D* | --param* | -O* | -I* | -m* | -pipe )
+ shift
+ ;;
+ -MT )
+ shift
+ shift
+ ;;
+ -MF )
+ shift
+ touch="$1"
+ shift
+ ;;
+ * )
+ new="$new $1"
+ shift
+ ;;
+ esac
+done
+
+echo $0: yasm $new
+yasm $new
+
+[ -n "$touch" ] && touch $touch
+
+true \ No newline at end of file