diff options
author | Sage Weil <sage@inktank.com> | 2012-11-25 09:18:44 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-11-25 09:18:44 -0800 |
commit | 39d2d08a233c81dc6e376f7016ff92f1fb33fa70 (patch) | |
tree | f87edb3f97cb154ed22b256f0f90a2d0fd774584 | |
parent | 2b002a8de3d64a5c03a4710bb44923afd2feed36 (diff) | |
parent | 7602a055764aff1c50b1e2641a3e703845cbe471 (diff) | |
download | ceph-39d2d08a233c81dc6e376f7016ff92f1fb33fa70.tar.gz |
Merge remote-tracking branch 'gh/next'
46 files changed, 1096 insertions, 246 deletions
diff --git a/doc/man/8/mkcephfs.rst b/doc/man/8/mkcephfs.rst index 5d0c9952356..05dee95e746 100644 --- a/doc/man/8/mkcephfs.rst +++ b/doc/man/8/mkcephfs.rst @@ -7,7 +7,7 @@ Synopsis ======== -| **mkcephfs** -c *ceph.conf* [ --mkbtrfs ] [ -a, --all-hosts [ -k +| **mkcephfs** -c *ceph.conf* [ --mkfs ] [ -a, --all-hosts [ -k */path/to/admin.keyring* ] ] @@ -70,20 +70,15 @@ Options default is ``/etc/ceph/keyring`` (or whatever is specified in the config file). -.. option:: --mkbtrfs +.. option:: --mkfs - Create and mount the any btrfs file systems specified in the - ceph.conf for OSD data storage using mkfs.btrfs. The "btrfs devs" - and (if it differs from "osd data") "btrfs path" options must be - defined. + Create and mount the any file systems specified in the + ceph.conf for OSD data storage using mkfs. The "devs" and (if it + differs from "osd data") "fs path" options must be defined. **NOTE** Btrfs is still considered experimental. This option - can ease some configuration pain, but is the use of btrfs is not - required when ``osd data`` directories are mounted manually by the - adminstrator. - - **NOTE** This option is deprecated and will be removed in a future - release. + can ease some configuration pain, but is not required when + ``osd data`` directories are mounted manually by the adminstrator. .. option:: --no-copy-conf diff --git a/man/mkcephfs.8 b/man/mkcephfs.8 index f0ab80d2ccc..26276b76030 100644 --- a/man/mkcephfs.8 +++ b/man/mkcephfs.8 @@ -32,7 +32,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] . .SH SYNOPSIS .nf -\fBmkcephfs\fP \-c \fIceph.conf\fP [ \-\-mkbtrfs ] [ \-a, \-\-all\-hosts [ \-k +\fBmkcephfs\fP \-c \fIceph.conf\fP [ \-\-mkfs ] [ \-a, \-\-all\-hosts [ \-k \fI/path/to/admin.keyring\fP ] ] .fi .sp @@ -111,19 +111,16 @@ config file). .UNINDENT .INDENT 0.0 .TP -.B \-\-mkbtrfs -Create and mount the any btrfs file systems specified in the -ceph.conf for OSD data storage using mkfs.btrfs. The "btrfs devs" -and (if it differs from "osd data") "btrfs path" options must be +.B \-\-mkfs +Create and mount any file systems specified in the +ceph.conf for OSD data storage using mkfs.*. The "devs" +and (if it differs from "osd data") "fs path" options must be defined. .sp \fBNOTE\fP Btrfs is still considered experimental. This option -can ease some configuration pain, but is the use of btrfs is not +can ease some configuration pain, but the use of this option is not required when \fBosd data\fP directories are mounted manually by the adminstrator. -.sp -\fBNOTE\fP This option is deprecated and will be removed in a future -release. .UNINDENT .INDENT 0.0 .TP diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh index 958691cee09..1eba38a248d 100644 --- a/qa/run_xfstests.sh +++ b/qa/run_xfstests.sh @@ -49,7 +49,7 @@ XFS_MKFS_OPTIONS="-l su=32k" # until we can work through getting them all passing reliably. TESTS="1-9 11-15 17 19-21 26-29 31-34 41 46-48 50-54 56 61 63-67 69-70 74-76" TESTS="${TESTS} 78 79 84-89 91-92 100 103 105 108 110 116-121 124 126" -TESTS="${TESTS} 129-135 137-141 164-167 174 179 181-184 186-190 192 194" +TESTS="${TESTS} 129-135 137-141 164-167 179 182-184 186-190 192 194" TESTS="${TESTS} 196 199 201 203 214-216 220-227 234 236-238 241 243-249" TESTS="${TESTS} 253 257-259 261 262 269 273 275 277 278 280 285 286" # 275 was the highest available test as of 4/10/12. diff --git a/src/Makefile.am b/src/Makefile.am index 3a4437c0e49..eebb6c7cc5c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1197,6 +1197,7 @@ libcommon_files = \ common/BackTrace.cc \ common/perf_counters.cc \ common/Mutex.cc \ + common/OutputDataSocket.cc \ common/admin_socket.cc \ common/admin_socket_client.cc \ common/escape.c \ @@ -1475,6 +1476,7 @@ noinst_HEADERS = \ common/Finisher.h\ common/Formatter.h\ common/perf_counters.h\ + common/OutputDataSocket.h \ common/admin_socket.h \ common/admin_socket_client.h \ common/shared_cache.hpp \ diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc index 77c959fba92..1bfc2814f4f 100644 --- a/src/auth/Crypto.cc +++ b/src/auth/Crypto.cc @@ -191,9 +191,13 @@ static void nss_aes_operation(CK_ATTRIBUTE_TYPE op, const bufferptr& secret, goto err_op; } - PK11_DestroyContext(ctx, PR_TRUE); out_tmp.set_length(written + written2); out.append(out_tmp); + + PK11_DestroyContext(ctx, PR_TRUE); + SECITEM_FreeItem(param, PR_TRUE); + PK11_FreeSymKey(key); + PK11_FreeSlot(slot); return; err_op: diff --git a/src/auth/KeyRing.cc b/src/auth/KeyRing.cc index ad23922d625..96dba74a043 100644 --- a/src/auth/KeyRing.cc +++ b/src/auth/KeyRing.cc @@ -236,7 +236,8 @@ void KeyRing::import(CephContext *cct, KeyRing& other) for (map<EntityName, EntityAuth>::iterator p = other.keys.begin(); p != other.keys.end(); ++p) { - ldout(cct, 10) << " importing " << p->first << " " << p->second << dendl; + ldout(cct, 10) << " importing " << p->first << dendl; + ldout(cct, 30) << " " << p->second << dendl; keys[p->first] = p->second; } } diff --git a/src/auth/cephx/CephxKeyServer.cc b/src/auth/cephx/CephxKeyServer.cc index 1440b2c2b9f..c3e4f9cfdc1 100644 --- a/src/auth/cephx/CephxKeyServer.cc +++ b/src/auth/cephx/CephxKeyServer.cc @@ -46,7 +46,7 @@ bool KeyServerData::get_service_secret(CephContext *cct, uint32_t service_id, secret_id = riter->first; secret = riter->second; - ldout(cct, 10) << "get_service_secret service " << ceph_entity_type_name(service_id) + ldout(cct, 30) << "get_service_secret service " << ceph_entity_type_name(service_id) << " id " << secret_id << " " << secret << dendl; return true; } @@ -77,12 +77,13 @@ bool KeyServerData::get_service_secret(CephContext *cct, uint32_t service_id, if (riter == secrets.secrets.end()) { ldout(cct, 10) << "get_service_secret service " << ceph_entity_type_name(service_id) - << " secret " << secret_id << " not found; i have:" << dendl; + << " secret " << secret_id << " not found" << dendl; + ldout(cct, 30) << " I have:" << dendl; for (map<uint64_t, ExpiringCryptoKey>::const_iterator iter = secrets.secrets.begin(); iter != secrets.secrets.end(); ++iter) - ldout(cct, 10) << " id " << iter->first << " " << iter->second << dendl; + ldout(cct, 30) << " id " << iter->first << " " << iter->second << dendl; return false; } @@ -170,7 +171,7 @@ bool KeyServer::_check_rotating_secrets() void KeyServer::_dump_rotating_secrets() { - ldout(cct, 10) << "_dump_rotating_secrets" << dendl; + ldout(cct, 30) << "_dump_rotating_secrets" << dendl; for (map<uint32_t, RotatingSecrets>::iterator iter = data.rotating_secrets.begin(); iter != data.rotating_secrets.end(); ++iter) { @@ -178,7 +179,7 @@ void KeyServer::_dump_rotating_secrets() for (map<uint64_t, ExpiringCryptoKey>::iterator mapiter = key.secrets.begin(); mapiter != key.secrets.end(); ++mapiter) - ldout(cct, 10) << "service " << ceph_entity_type_name(iter->first) + ldout(cct, 30) << "service " << ceph_entity_type_name(iter->first) << " id " << mapiter->first << " key " << mapiter->second << dendl; } @@ -203,7 +204,8 @@ int KeyServer::_rotate_secret(uint32_t service_id) } ek.expiration += ttl; uint64_t secret_id = r.add(ek); - ldout(cct, 10) << "_rotate_secret adding " << ceph_entity_type_name(service_id) + ldout(cct, 10) << "_rotate_secret adding " << ceph_entity_type_name(service_id) << dendl; + ldout(cct, 30) << "_rotate_secret adding " << ceph_entity_type_name(service_id) << " id " << secret_id << " " << ek << dendl; added++; diff --git a/src/auth/cephx/CephxProtocol.cc b/src/auth/cephx/CephxProtocol.cc index 9c262634e7b..34f31f70c72 100644 --- a/src/auth/cephx/CephxProtocol.cc +++ b/src/auth/cephx/CephxProtocol.cc @@ -118,7 +118,7 @@ bool cephx_build_service_ticket_reply(CephContext *cct, } ::encode(blob, service_ticket_bl); - ldout(cct, 20) << "service_ticket_blob is "; + ldout(cct, 30) << "service_ticket_blob is "; service_ticket_bl.hexdump(*_dout); *_dout << dendl; diff --git a/src/ceph.conf.twoosds b/src/ceph.conf.twoosds index c0cfc68f1a0..3417cf68fce 100644 --- a/src/ceph.conf.twoosds +++ b/src/ceph.conf.twoosds @@ -67,16 +67,17 @@ debug journal = 20 log dir = /data/cosd$id osd data = /mnt/osd$id - btrfs options = "flushoncommit,usertrans" ; user = root ; osd journal = /mnt/osd$id/journal ; osd journal size = 1000 osd journal = "/dev/disk/by-path/pci-0000:05:02.0-scsi-6:0:0:0" + osd mkfs type = btrfs + osd mount options btrfs = "flushoncommit,usertrans" ; filestore max sync interval = 1 - btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0" -; btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 \ + devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0" +; devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 \ ; /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 \ ; /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 \ ; /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 \ diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc new file mode 100644 index 00000000000..54f6ab4a2a4 --- /dev/null +++ b/src/common/OutputDataSocket.cc @@ -0,0 +1,418 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/Thread.h" +#include "common/OutputDataSocket.h" +#include "common/config.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/pipe.h" +#include "common/safe_io.h" +#include "common/version.h" +#include "common/Formatter.h" + +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#include <map> +#include <poll.h> +#include <set> +#include <sstream> +#include <stdint.h> +#include <string.h> +#include <string> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> +#include <unistd.h> + +#include "include/compat.h" + +#define dout_subsys ceph_subsys_asok +#undef dout_prefix +#define dout_prefix *_dout << "asok(" << (void*)m_cct << ") " + +using std::ostringstream; + +/* + * UNIX domain sockets created by an application persist even after that + * application closes, unless they're explicitly unlinked. This is because the + * directory containing the socket keeps a reference to the socket. + * + * This code makes things a little nicer by unlinking those dead sockets when + * the application exits normally. + */ +static pthread_mutex_t cleanup_lock = PTHREAD_MUTEX_INITIALIZER; +static std::vector <const char*> cleanup_files; +static bool cleanup_atexit = false; + +static void remove_cleanup_file(const char *file) +{ + pthread_mutex_lock(&cleanup_lock); + TEMP_FAILURE_RETRY(unlink(file)); + for (std::vector <const char*>::iterator i = cleanup_files.begin(); + i != cleanup_files.end(); ++i) { + if (strcmp(file, *i) == 0) { + free((void*)*i); + cleanup_files.erase(i); + break; + } + } + pthread_mutex_unlock(&cleanup_lock); +} + +static void remove_all_cleanup_files() +{ + pthread_mutex_lock(&cleanup_lock); + for (std::vector <const char*>::iterator i = cleanup_files.begin(); + i != cleanup_files.end(); ++i) { + TEMP_FAILURE_RETRY(unlink(*i)); + free((void*)*i); + } + cleanup_files.clear(); + pthread_mutex_unlock(&cleanup_lock); +} + +static void add_cleanup_file(const char *file) +{ + char *fname = strdup(file); + if (!fname) + return; + pthread_mutex_lock(&cleanup_lock); + cleanup_files.push_back(fname); + if (!cleanup_atexit) { + atexit(remove_all_cleanup_files); + cleanup_atexit = true; + } + pthread_mutex_unlock(&cleanup_lock); +} + + +OutputDataSocket::OutputDataSocket(CephContext *cct, uint64_t _backlog) + : m_cct(cct), + data_max_backlog(_backlog), + m_sock_fd(-1), + m_shutdown_rd_fd(-1), + m_shutdown_wr_fd(-1), + going_down(false), + m_lock("OutputDataSocket::m_lock") +{ +} + +OutputDataSocket::~OutputDataSocket() +{ + shutdown(); +} + +/* + * This thread listens on the UNIX domain socket for incoming connections. + * It only handles one connection at a time at the moment. All I/O is nonblocking, + * so that we can implement sensible timeouts. [TODO: make all I/O nonblocking] + * + * This thread also listens to m_shutdown_rd_fd. If there is any data sent to this + * pipe, the thread terminates itself gracefully, allowing the + * OutputDataSocketConfigObs class to join() it. + */ + +#define PFL_SUCCESS ((void*)(intptr_t)0) +#define PFL_FAIL ((void*)(intptr_t)1) + +std::string OutputDataSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr) +{ + int pipefd[2]; + int ret = pipe_cloexec(pipefd); + if (ret < 0) { + ostringstream oss; + oss << "OutputDataSocket::create_shutdown_pipe error: " << cpp_strerror(ret); + return oss.str(); + } + + *pipe_rd = pipefd[0]; + *pipe_wr = pipefd[1]; + return ""; +} + +std::string OutputDataSocket::bind_and_listen(const std::string &sock_path, int *fd) +{ + ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl; + + struct sockaddr_un address; + if (sock_path.size() > sizeof(address.sun_path) - 1) { + ostringstream oss; + oss << "OutputDataSocket::bind_and_listen: " + << "The UNIX domain socket path " << sock_path << " is too long! The " + << "maximum length on this system is " + << (sizeof(address.sun_path) - 1); + return oss.str(); + } + int sock_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (sock_fd < 0) { + int err = errno; + ostringstream oss; + oss << "OutputDataSocket::bind_and_listen: " + << "failed to create socket: " << cpp_strerror(err); + return oss.str(); + } + int r = fcntl(sock_fd, F_SETFD, FD_CLOEXEC); + if (r < 0) { + r = errno; + TEMP_FAILURE_RETRY(::close(sock_fd)); + ostringstream oss; + oss << "OutputDataSocket::bind_and_listen: failed to fcntl on socket: " << cpp_strerror(r); + return oss.str(); + } + memset(&address, 0, sizeof(struct sockaddr_un)); + address.sun_family = AF_UNIX; + snprintf(address.sun_path, sizeof(address.sun_path), + "%s", sock_path.c_str()); + if (bind(sock_fd, (struct sockaddr*)&address, + sizeof(struct sockaddr_un)) != 0) { + int err = errno; + if (err == EADDRINUSE) { + // The old UNIX domain socket must still be there. + // Let's unlink it and try again. + TEMP_FAILURE_RETRY(unlink(sock_path.c_str())); + if (bind(sock_fd, (struct sockaddr*)&address, + sizeof(struct sockaddr_un)) == 0) { + err = 0; + } + else { + err = errno; + } + } + if (err != 0) { + ostringstream oss; + oss << "OutputDataSocket::bind_and_listen: " + << "failed to bind the UNIX domain socket to '" << sock_path + << "': " << cpp_strerror(err); + close(sock_fd); + return oss.str(); + } + } + if (listen(sock_fd, 5) != 0) { + int err = errno; + ostringstream oss; + oss << "OutputDataSocket::bind_and_listen: " + << "failed to listen to socket: " << cpp_strerror(err); + close(sock_fd); + TEMP_FAILURE_RETRY(unlink(sock_path.c_str())); + return oss.str(); + } + *fd = sock_fd; + return ""; +} + +void* OutputDataSocket::entry() +{ + ldout(m_cct, 5) << "entry start" << dendl; + while (true) { + struct pollfd fds[2]; + memset(fds, 0, sizeof(fds)); + fds[0].fd = m_sock_fd; + fds[0].events = POLLIN | POLLRDBAND; + fds[1].fd = m_shutdown_rd_fd; + fds[1].events = POLLIN | POLLRDBAND; + + int ret = poll(fds, 2, -1); + if (ret < 0) { + int err = errno; + if (err == EINTR) { + continue; + } + lderr(m_cct) << "OutputDataSocket: poll(2) error: '" + << cpp_strerror(err) << dendl; + return PFL_FAIL; + } + + if (fds[0].revents & POLLIN) { + // Send out some data + do_accept(); + } + if (fds[1].revents & POLLIN) { + // Parent wants us to shut down + return PFL_SUCCESS; + } + } + ldout(m_cct, 5) << "entry exit" << dendl; + + return PFL_SUCCESS; // unreachable +} + + +bool OutputDataSocket::do_accept() +{ + struct sockaddr_un address; + socklen_t address_length = sizeof(address); + ldout(m_cct, 30) << "OutputDataSocket: calling accept" << dendl; + int connection_fd = accept(m_sock_fd, (struct sockaddr*) &address, + &address_length); + ldout(m_cct, 30) << "OutputDataSocket: finished accept" << dendl; + if (connection_fd < 0) { + int err = errno; + lderr(m_cct) << "OutputDataSocket: do_accept error: '" + << cpp_strerror(err) << dendl; + return false; + } + + handle_connection(connection_fd); + close_connection(connection_fd); + + return 0; +} + +void OutputDataSocket::handle_connection(int fd) +{ + bufferlist bl; + + m_lock.Lock(); + init_connection(bl); + m_lock.Unlock(); + + if (bl.length()) { + /* need to special case the connection init buffer output, as it needs + * to be dumped before any data, including older data that was sent + * before the connection was established, or before we identified + * older connection was broken + */ + int ret = safe_write(fd, bl.c_str(), bl.length()); + if (ret < 0) { + return; + } + } + + int ret = dump_data(fd); + if (ret < 0) + return; + + do { + m_lock.Lock(); + cond.Wait(m_lock); + + if (going_down) { + m_lock.Unlock(); + break; + } + m_lock.Unlock(); + + ret = dump_data(fd); + } while (ret >= 0); +} + +int OutputDataSocket::dump_data(int fd) +{ + m_lock.Lock(); + list<bufferlist> l; + l = data; + data.clear(); + data_size = 0; + m_lock.Unlock(); + + for (list<bufferlist>::iterator iter = l.begin(); iter != l.end(); ++iter) { + bufferlist& bl = *iter; + int ret = safe_write(fd, bl.c_str(), bl.length()); + if (ret >= 0) { + ret = safe_write(fd, delim.c_str(), delim.length()); + } + if (ret < 0) { + for (; iter != l.end(); ++iter) { + bufferlist& bl = *iter; + data.push_back(bl); + data_size += bl.length(); + } + return ret; + } + } + + return 0; +} + +void OutputDataSocket::close_connection(int fd) +{ + TEMP_FAILURE_RETRY(close(fd)); +} + +bool OutputDataSocket::init(const std::string &path) +{ + ldout(m_cct, 5) << "init " << path << dendl; + + /* Set up things for the new thread */ + std::string err; + int pipe_rd = -1, pipe_wr = -1; + err = create_shutdown_pipe(&pipe_rd, &pipe_wr); + if (!err.empty()) { + lderr(m_cct) << "OutputDataSocketConfigObs::init: error: " << err << dendl; + return false; + } + int sock_fd; + err = bind_and_listen(path, &sock_fd); + if (!err.empty()) { + lderr(m_cct) << "OutputDataSocketConfigObs::init: failed: " << err << dendl; + close(pipe_rd); + close(pipe_wr); + return false; + } + + /* Create new thread */ + m_sock_fd = sock_fd; + m_shutdown_rd_fd = pipe_rd; + m_shutdown_wr_fd = pipe_wr; + m_path = path; + create(); + add_cleanup_file(m_path.c_str()); + return true; +} + +void OutputDataSocket::shutdown() +{ + m_lock.Lock(); + going_down = true; + cond.Signal(); + m_lock.Unlock(); + + if (m_shutdown_wr_fd < 0) + return; + + ldout(m_cct, 5) << "shutdown" << dendl; + + // Send a byte to the shutdown pipe that the thread is listening to + char buf[1] = { 0x0 }; + int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf)); + TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd)); + m_shutdown_wr_fd = -1; + + if (ret == 0) { + join(); + } else { + lderr(m_cct) << "OutputDataSocket::shutdown: failed to write " + "to thread shutdown pipe: error " << ret << dendl; + } + + remove_cleanup_file(m_path.c_str()); + m_path.clear(); +} + +void OutputDataSocket::append_output(bufferlist& bl) +{ + Mutex::Locker l(m_lock); + + if (data_size + bl.length() > data_max_backlog) { + ldout(m_cct, 20) << "dropping data output, max backlog reached" << dendl; + } + data.push_back(bl); + + data_size += bl.length(); + + cond.Signal(); +} diff --git a/src/common/OutputDataSocket.h b/src/common/OutputDataSocket.h new file mode 100644 index 00000000000..f581a56bf03 --- /dev/null +++ b/src/common/OutputDataSocket.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_COMMON_OUTPUTDATASOCKET_H +#define CEPH_COMMON_OUTPUTDATASOCKET_H + +#include "common/Thread.h" +#include "common/Mutex.h" +#include "common/Cond.h" + +#include <string> +#include <map> +#include <list> +#include "include/buffer.h" + +class CephContext; + +class OutputDataSocket : public Thread +{ +public: + OutputDataSocket(CephContext *cct, uint64_t _backlog); + virtual ~OutputDataSocket(); + + bool init(const std::string &path); + + void append_output(bufferlist& bl); + +protected: + virtual void init_connection(bufferlist& bl) {} + void shutdown(); + + std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr); + std::string bind_and_listen(const std::string &sock_path, int *fd); + + void *entry(); + bool do_accept(); + + void handle_connection(int fd); + void close_connection(int fd); + + int dump_data(int fd); + + CephContext *m_cct; + uint64_t data_max_backlog; + std::string m_path; + int m_sock_fd; + int m_shutdown_rd_fd; + int m_shutdown_wr_fd; + bool going_down; + + uint64_t data_size; + + std::list<bufferlist> data; + + Mutex m_lock; + Cond cond; + + bufferlist delim; +}; + +#endif diff --git a/src/common/config_opts.h b/src/common/config_opts.h index cc05095bb3c..51f569c7e1f 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -470,6 +470,9 @@ OPTION(rgw_usage_max_shards, OPT_INT, 32) OPTION(rgw_usage_max_user_shards, OPT_INT, 1) OPTION(rgw_enable_ops_log, OPT_BOOL, true) // enable logging every rgw operation OPTION(rgw_enable_usage_log, OPT_BOOL, true) // enable logging bandwidth usage +OPTION(rgw_ops_log_rados, OPT_BOOL, true) // whether ops log should go to rados +OPTION(rgw_ops_log_socket_path, OPT_STR, "") // path to unix domain socket where ops log can go +OPTION(rgw_ops_log_data_backlog, OPT_INT, 5 << 20) // max data backlog for ops log OPTION(rgw_usage_log_flush_threshold, OPT_INT, 1024) // threshold to flush pending log data OPTION(rgw_usage_log_tick_interval, OPT_INT, 30) // flush pending log data every X seconds OPTION(rgw_intent_log_object_name, OPT_STR, "%Y-%m-%d-%i-%n") // man date to see codes (a subset are supported) diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h index 3ad8c9e928d..dc90b890c93 100644 --- a/src/common/sync_filesystem.h +++ b/src/common/sync_filesystem.h @@ -16,6 +16,7 @@ #define CEPH_SYNC_FILESYSTEM_H #include <unistd.h> +#include <syscall.h> #ifndef __CYGWIN__ # ifndef DARWIN @@ -35,6 +36,11 @@ inline int sync_filesystem(int fd) return 0; #endif +#ifdef SYS_syncfs + if (syscall(SYS_syncfs, fd) == 0) + return 0; +#endif + #ifdef BTRFS_IOC_SYNC if (::ioctl(fd, BTRFS_IOC_SYNC) == 0) return 0; diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 8d33839bbb7..f38a7698376 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -20,6 +20,25 @@ void CrushWrapper::find_roots(set<int>& roots) const } } +bool CrushWrapper::subtree_contains(int root, int item) const +{ + if (root == item) + return true; + + if (root >= 0) + return false; // root is a leaf + + const crush_bucket *b = get_bucket(root); + if (!b) + return false; + + for (unsigned j=0; j<b->size; j++) { + if (subtree_contains(b->items[j], item)) + return true; + } + return false; +} + int CrushWrapper::remove_item(CephContext *cct, int item) { @@ -230,9 +249,23 @@ int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string n return -EINVAL; } + // check that we aren't creating a cycle. + if (subtree_contains(id, cur)) { + ldout(cct, 1) << "insert_item item " << cur << " already exists beneath " << id << dendl; + return -EINVAL; + } + crush_bucket *b = get_bucket(id); assert(b); + if (p->first != b->type) { + ldout(cct, 1) << "insert_item existing bucket has type " + << "'" << type_map[b->type] << "' != " + << "'" << type_map[p->first] << "'" << dendl; + return -EINVAL; + } + + // make sure the item doesn't already exist in this bucket for (unsigned j=0; j<b->size; j++) if (b->items[j] == cur) { diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index e8120931103..5ea68b9a83d 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -212,6 +212,15 @@ public: void find_roots(set<int>& roots) const; /** + * see if an item is contained within a subtree + * + * @param root haystack + * @param item needle + * @return true if the item is located beneath the given node + */ + bool subtree_contains(int root, int item) const; + + /** * see if item is located where we think it is * * This verifies that the given item is located at a particular diff --git a/src/init-ceph.in b/src/init-ceph.in index 31aeb287223..2bafa995f75 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -100,8 +100,8 @@ docrun= allhosts=0 debug=0 monaddr= -dobtrfs=1 -dobtrfsumount=0 +dofsmount=1 +dofsumount=0 verbose=0 while echo $1 | grep -q '^-'; do # FIXME: why not '^-'? @@ -130,14 +130,14 @@ case $1 in shift MON_ADDR=$1 ;; - --btrfs) - dobtrfs=1 + --btrfs | --fsmount) + dofsmount=1 ;; - --nobtrfs) - dobtrfs=0 + --nobtrfs | --nofsmount) + dofsmount=0 ;; - --btrfsumount) - dobtrfsumount=1 + --btrfsumount | --fsumount) + dofsumount=1 ;; --conf | -c) [ -z "$2" ] && usage_exit @@ -222,9 +222,18 @@ for name in $what; do if echo $name | grep -q ^osd; then get_conf osd_data "" "osd data" - get_conf btrfs_path "$osd_data" "btrfs path" # mount point defaults so osd data - get_conf btrfs_devs "" "btrfs devs" - first_dev=`echo $btrfs_devs | cut '-d ' -f 1` + get_conf fs_path "$osd_data" "fs path" # mount point defaults so osd data + get_conf fs_devs "" "devs" + if [ -z "$fs_devs" ]; then + # try to fallback to old keys + get_conf tmp_btrfs_devs "" "btrfs devs" + if [ -n "$tmp_btrfs_devs" ]; then + fs_devs="$tmp_btrfs_devs" + else + echo No osd devs defined! + fi + fi + first_dev=`echo $fs_devs | cut '-d ' -f 1` fi # do lockfile, if RH @@ -262,13 +271,44 @@ for name in $what; do cmd="$wrap $cmd $runmode" - if [ $dobtrfs -eq 1 ] && [ -n "$btrfs_devs" ]; then + if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then get_conf pre_mount "true" "pre mount command" - get_conf btrfs_opt "noatime" "btrfs options" - [ -n "$btrfs_opt" ] && btrfs_opt="-o $btrfs_opt" + get_conf fs_type "" "osd mkfs type" + + if [ -z "$fs_type" ]; then + # try to fallback to to old keys + get_conf tmp_devs "" "btrfs devs" + if [ -n "$tmp_devs" ]; then + fs_type = "btrfs" + else + echo No filesystem type defined! + exit 0 + fi + fi + + get_conf fs_opt "" "osd mount options $fs_type" + if [ -z "$fs_opt" ]; then + if [ "$fs_type" = "btrfs" ]; then + #try to fallback to old keys + get_conf fs_opt "" "btrfs options" + fi + + if [ -z "$fs_opt" ]; then + #fallback to use at least noatime + fs_opt="rw,noatime" + fi + fi + + [ -n "$fs_opt" ] && fs_opt="-o $fs_opt" [ -n "$pre_mount" ] && do_cmd "$pre_mount" - echo Mounting Btrfs on $host:$btrfs_path - do_root_cmd "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $btrfs_path' /proc/mounts || mount -t btrfs $btrfs_opt $first_dev $btrfs_path" + + if [ "$fs_type" == "btrfs" ]; then + echo Mounting Btrfs on $host:$fs_path + do_root_cmd "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path' /proc/mounts || mount -t btrfs $fs_opt $first_dev $fs_path" + else + echo Mounting $fs_type on $host:$fs_path + do_root_cmd "modprobe $fs_type ; egrep -q '^[^ ]+ $fs_path' /proc/mounts || mount -t $fs_type $fs_opt $first_dev $fs_path" + fi fi echo Starting Ceph $name on $host... mkdir -p $RUN_DIR @@ -289,9 +329,9 @@ for name in $what; do stop_daemon $name ceph-$type $pid_file [ -n "$post_stop" ] && do_cmd "$post_stop" [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile - if [ $dobtrfsumount -eq 1 ] && [ -n "$btrfs_devs" ]; then - echo Unmounting Btrfs on $host:$btrfs_path - do_root_cmd "umount $btrfs_path || true" + if [ $dofsumount -eq 1 ] && [ -n "$fs_devs" ]; then + echo Unmounting OSD volume on $host:$fs_path + do_root_cmd "umount $fs_path || true" fi ;; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 4642a1346ff..b0fd9ec0c15 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1754,7 +1754,7 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, u in->inode.gid = diri->inode.gid; if (S_ISDIR(mode)) { dout(10) << " new dir also sticky" << dendl; - mode |= S_ISGID; + in->inode.mode |= S_ISGID; } } else in->inode.gid = mdr->client_request->get_caller_gid(); diff --git a/src/mkcephfs.in b/src/mkcephfs.in index aae616c68a0..1cb135e7929 100644 --- a/src/mkcephfs.in +++ b/src/mkcephfs.in @@ -60,7 +60,7 @@ else fi usage_exit() { - echo "usage: $0 -a -c ceph.conf [-k adminkeyring] [--mkbtrfs]" + echo "usage: $0 -a -c ceph.conf [-k adminkeyring] [--mkfs]" echo " to generate a new ceph cluster on all nodes; for advanced usage see man page" echo " ** be careful, this WILL clobber old data; check your ceph.conf carefully **" exit @@ -70,7 +70,7 @@ usage_exit() { allhosts=0 -mkbtrfs=0 +mkfs=0 preparemonmap=0 prepareosdfs="" initdaemon="" @@ -130,8 +130,8 @@ case $1 in preparemon=1 manual_action=1 ;; - --mkbtrfs) - mkbtrfs=1 + --mkbtrfs | --mkfs) + mkfs=1 ;; --no-copy-conf) nocopyconf=1 @@ -307,21 +307,48 @@ if [ -n "$prepareosdfs" ]; then get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data" get_conf osd_journal "$osd_data/journal" "osd journal" - get_conf btrfs_path "$osd_data" "btrfs path" # mount point defaults so osd data - get_conf btrfs_devs "" "btrfs devs" - - if [ -z "$btrfs_devs" ]; then - echo "no btrfs devs defined for $name" - exit 0 + get_conf fs_path "$osd_data" "fs path" # mount point defaults so osd data + get_conf fs_devs "" "devs" + get_conf fs_type "" "osd mkfs type" + + if [ -z "$fs_devs" ]; then + # try to fallback to old keys + get_conf tmp_btrfs_devs "" "btrfs devs" + if [ -n "$tmp_btrfs_devs" ]; then + fs_devs="$tmp_btrfs_devs" + else + echo "no devs defined for $name" + exit 0 + fi + fi + if [ -z "$fs_type" ]; then + # try to fallback to to old keys + get_conf tmp_devs "" "btrfs devs" + if [ -n "$tmp_devs" ]; then + fs_type = "btrfs" + else + echo No filesystem type defined! + exit 0 + fi fi - first_dev=`echo $btrfs_devs | cut '-d ' -f 1` - get_conf btrfs_opt "noatime" "btrfs options" - [ -n "$btrfs_opt" ] && btrfs_opt="-o $btrfs_opt" + first_dev=`echo $fs_devs | cut '-d ' -f 1` + get_conf fs_opt "" "osd mount options $fs_type" + if [ -z "$fs_opt" ]; then + if [ "$fs_type" = "btrfs" ]; then + #try to fallback to old keys + get_conf fs_opt "" "btrfs options" + fi + if [ -z "$fs_opt" ]; then + #fallback to use at least rw,noatime + fs_opt="rw,noatime" + fi + fi + [ -n "$fs_opt" ] && fs_opt="-o $fs_opt" get_conf osd_user "root" "user" - if [ -n "$osd_journal" ] && echo "$btrfs_devs" | grep -q -w "$osd_journal" ; then - echo "ERROR: osd journal device ($osd_journal) also used by btrfs devs ($btrfs_devs)" + if [ -n "$osd_journal" ] && echo "fs_devs" | grep -q -w "$osd_journal" ; then + echo "ERROR: osd journal device ($osd_journal) also used by devs ($fs_devs)" exit 1 fi @@ -331,19 +358,22 @@ if [ -n "$prepareosdfs" ]; then test -d $osd_journal || mkdir -p `dirname $osd_journal` fi - umount $btrfs_path || true - for f in $btrfs_devs ; do + umount $fs_path || true + for f in $fs_devs ; do umount $f || true done - modprobe btrfs || true - mkfs.btrfs $btrfs_devs - btrfs device scan || btrfsctl -a - sync # seems to fix problems for some people... - mount -t btrfs $btrfs_opt $first_dev $btrfs_path - chown $osd_user $btrfs_path - chmod +w $btrfs_path - + get_conf mkfs_opt "" "osd mkfs options $fs_type" + if [ "$fs_type" == "xfs" ] && [ -z "$mkfs_opt" ]; then + echo Xfs filesystem found add missing -f mkfs option! + mkfs_opt="-f" + fi + modprobe $fs_type || true + mkfs.$fs_type $mkfs_opt $fs_devs + mount -t $fs_type $fs_opt $first_dev $fs_path + chown $osd_user $fs_path + chmod +w $fs_path + exit 0 fi @@ -460,7 +490,7 @@ if [ $allhosts -eq 1 ]; then fi fi - if [ $mkbtrfs -eq 1 ] && [ "$type" = "osd" ]; then + if [ $mkfs -eq 1 ] && [ "$type" = "osd" ]; then do_root_cmd "$0 -d $rdir --prepare-osdfs $name" fi diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index 030153e17df..c43d7e988db 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -549,7 +549,8 @@ void AuthMonitor::import_keyring(KeyRing& keyring) auth_inc.name = p->first; auth_inc.auth = p->second; auth_inc.op = KeyServerData::AUTH_INC_ADD; - dout(10) << " importing " << auth_inc.name << " " << auth_inc.auth << dendl; + dout(10) << " importing " << auth_inc.name << dendl; + dout(30) << " " << auth_inc.auth << dendl; push_cephx_inc(auth_inc); } } @@ -627,7 +628,8 @@ bool AuthMonitor::prepare_command(MMonCommand *m) for (unsigned i=3; i+1<m->cmd.size(); i += 2) ::encode(m->cmd[i+1], auth_inc.auth.caps[m->cmd[i]]); - dout(10) << " importing " << auth_inc.name << " " << auth_inc.auth << dendl; + dout(10) << " importing " << auth_inc.name << dendl; + dout(30) << " " << auth_inc.auth << dendl; push_cephx_inc(auth_inc); ss << "added key for " << auth_inc.name; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 469cbd03aa5..4a64e48de8d 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -207,14 +207,26 @@ void Monitor::recovered_leader(int id) require_gv_onwire(); } - for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) + for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) { + if (!(*p)->is_active()) + continue; finish_contexts(g_ceph_context, (*p)->waiting_for_active); - for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) + } + for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) { + if (!(*p)->is_active()) + continue; finish_contexts(g_ceph_context, (*p)->waiting_for_commit); - for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) + } + for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) { + if (!(*p)->is_readable()) + continue; finish_contexts(g_ceph_context, (*p)->waiting_for_readable); - for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) + } + for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++) { + if (!(*p)->is_writeable()) + continue; finish_contexts(g_ceph_context, (*p)->waiting_for_writeable); + } } } diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 2bdfbc4b5eb..dcf9380613b 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -991,12 +991,17 @@ bool PGMonitor::preprocess_command(MMonCommand *m) r = -EINVAL; if (pgid.parse(m->cmd[2].c_str())) { vector<int> up, acting; - pg_t mpgid = mon->osdmon()->osdmap.raw_pg_to_pg(pgid); - mon->osdmon()->osdmap.pg_to_up_acting_osds(pgid, up, acting); - ss << "osdmap e" << mon->osdmon()->osdmap.get_epoch() - << " pg " << pgid << " (" << mpgid << ")" - << " -> up " << up << " acting " << acting; - r = 0; + if (mon->osdmon()->osdmap.have_pg_pool(pgid.pool())) { + pg_t mpgid = mon->osdmon()->osdmap.raw_pg_to_pg(pgid); + mon->osdmon()->osdmap.pg_to_up_acting_osds(pgid, up, acting); + ss << "osdmap e" << mon->osdmon()->osdmap.get_epoch() + << " pg " << pgid << " (" << mpgid << ")" + << " -> up " << up << " acting " << acting; + r = 0; + } else { + r = -ENOENT; + ss << "pg '" << m->cmd[2] << "' does not exist"; + } } else ss << "invalid pgid '" << m->cmd[2] << "'"; } diff --git a/src/msg/Accepter.cc b/src/msg/Accepter.cc index b95d26afb08..ac180e21441 100644 --- a/src/msg/Accepter.cc +++ b/src/msg/Accepter.cc @@ -138,9 +138,11 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po assert(msgr->get_need_addr()); // should still be true. if (msgr->get_myaddr().get_port() == 0) { - listen_addr.nonce = nonce; msgr->set_myaddr(listen_addr); } + entity_addr_t addr = msgr->get_myaddr(); + addr.nonce = nonce; + msgr->set_myaddr(addr); msgr->init_local_connection(); diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index b8f01c2e1ac..1d6797c2b32 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -25,6 +25,7 @@ #if defined(__linux__) #include <linux/fs.h> +#include <syscall.h> #endif #include <iostream> @@ -1227,8 +1228,17 @@ int FileStore::_detect_fs() dout(0) << "mount syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; } #else +#ifdef SYS_syncfs + if (syscall(SYS_syncfs, fd) == 0) { + dout(0) << "mount syscall(SYS_syncfs, fd) fully supported" << dendl; + have_syncfs = true; + } else { + dout(0) << "mount syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; + } +#else dout(0) << "mount syncfs(2) syscall not support by glibc" << dendl; #endif +#endif if (!have_syncfs) { if (btrfs) { dout(0) << "mount no syncfs(2), but the btrfs SYNC ioctl will suffice" << dendl; diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 8edf3a55c4b..c933bad164a 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -703,6 +703,7 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start, if (r == -ENOENT) { ldout(cct, 10) << "bh_read_finish removing " << *bh << dendl; bh_remove(ob, bh); + delete bh; continue; } diff --git a/src/osdc/Striper.cc b/src/osdc/Striper.cc index cdbfe61674d..26e2b917bb7 100644 --- a/src/osdc/Striper.cc +++ b/src/osdc/Striper.cc @@ -231,12 +231,15 @@ void Striper::StripedReadResult::add_partial_sparse_result(CephContext *cct, if (s->first > bl_off) { // gap in sparse read result pair<bufferlist, uint64_t>& r = partial[tofs]; - size_t gap = s->first - bl_off; + size_t gap = MIN(s->first - bl_off, tlen); ldout(cct, 20) << " s gap " << gap << ", skipping" << dendl; r.second = gap; bl_off += gap; tofs += gap; tlen -= gap; + if (tlen == 0) { + continue; + } } assert(s->first <= bl_off); diff --git a/src/rbd.cc b/src/rbd.cc index fa7648a2148..09d506f1ef7 100644 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -472,7 +472,11 @@ static int do_purge_snaps(librbd::Image& image) } for (size_t i = 0; i < snaps.size(); ++i) { - image.snap_remove(snaps[i].name.c_str()); + r = image.snap_remove(snaps[i].name.c_str()); + if (r < 0) { + pc.fail(); + return r; + } pc.update_progress(i + 1, snaps.size()); } diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 5a5b7406aaf..665ac5516e8 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -1361,25 +1361,8 @@ int main(int argc, char **argv) goto next; if (show_log_entries) { - formatter->open_object_section("log_entry"); - formatter->dump_string("bucket", entry.bucket); - entry.time.gmtime(formatter->dump_stream("time")); // UTC - entry.time.localtime(formatter->dump_stream("time_local")); - formatter->dump_string("remote_addr", entry.remote_addr); - if (entry.object_owner.length()) - formatter->dump_string("object_owner", entry.object_owner); - formatter->dump_string("user", entry.user); - formatter->dump_string("operation", entry.op); - formatter->dump_string("uri", entry.uri); - formatter->dump_string("http_status", entry.http_status); - formatter->dump_string("error_code", entry.error_code); - formatter->dump_int("bytes_sent", entry.bytes_sent); - formatter->dump_int("bytes_received", entry.bytes_received); - formatter->dump_int("object_size", entry.obj_size); - formatter->dump_int("total_time", total_time); - formatter->dump_string("user_agent", entry.user_agent); - formatter->dump_string("referrer", entry.referrer); - formatter->close_section(); + + rgw_format_ops_log_entry(entry, formatter); formatter->flush(cout); } next: diff --git a/src/rgw/rgw_client_io.cc b/src/rgw/rgw_client_io.cc index 740418c9ab1..46385f41f33 100644 --- a/src/rgw/rgw_client_io.cc +++ b/src/rgw/rgw_client_io.cc @@ -53,8 +53,7 @@ int RGWClientIO::read(char *buf, int max, int *actual) *actual = ret; - if (account) - bytes_received += *actual; + bytes_received += *actual; return 0; } diff --git a/src/rgw/rgw_client_io.h b/src/rgw/rgw_client_io.h index d4842e97e6e..546b16d51f7 100644 --- a/src/rgw/rgw_client_io.h +++ b/src/rgw/rgw_client_io.h @@ -3,6 +3,8 @@ #include <stdlib.h> +#include "include/types.h" + class RGWClientIO { bool account; @@ -27,6 +29,9 @@ public: void set_account(bool _account) { account = _account; } + + uint64_t get_bytes_sent() { return bytes_sent; } + uint64_t get_bytes_received() { return bytes_received; } }; #endif diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index db8013e6120..35b31be60c0 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -106,8 +106,6 @@ req_state::req_state(CephContext *_cct, struct RGWEnv *e) : cct(_cct), cio(NULL) object = NULL; header_ended = false; - bytes_sent = 0; - bytes_received = 0; obj_size = 0; prot_flags = 0; diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index c55907a38d3..f43a2c78a1f 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -582,8 +582,6 @@ struct req_state { struct rgw_err err; bool expect_cont; bool header_ended; - uint64_t bytes_sent; // bytes sent as a response, excluding header - uint64_t bytes_received; // data received uint64_t obj_size; bool enable_ops_log; bool enable_usage_log; diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc index 361e1b04ae0..d7861e61250 100644 --- a/src/rgw/rgw_gc.cc +++ b/src/rgw/rgw_gc.cc @@ -34,9 +34,7 @@ void RGWGC::initialize(CephContext *_cct, RGWRados *_store) { void RGWGC::finalize() { - for (int i = 0; i < max_objs; i++) { - delete[] obj_names; - } + delete[] obj_names; } int RGWGC::tag_index(const string& tag) diff --git a/src/rgw/rgw_gc.h b/src/rgw/rgw_gc.h index db1753a100f..b19afc55dd5 100644 --- a/src/rgw/rgw_gc.h +++ b/src/rgw/rgw_gc.h @@ -36,6 +36,10 @@ class RGWGC { GCWorker *worker; public: RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {} + ~RGWGC() { + stop_processor(); + finalize(); + } void add_chain(librados::ObjectWriteOperation& op, cls_rgw_obj_chain& chain, const string& tag); int send_chain(cls_rgw_obj_chain& chain, const string& tag, bool sync); diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc index 5406a2c924b..e999f623a01 100644 --- a/src/rgw/rgw_log.cc +++ b/src/rgw/rgw_log.cc @@ -1,10 +1,13 @@ #include "common/Clock.h" #include "common/Timer.h" #include "common/utf8.h" +#include "common/OutputDataSocket.h" +#include "common/Formatter.h" #include "rgw_log.h" #include "rgw_acl.h" #include "rgw_rados.h" +#include "rgw_client_io.h" #define dout_subsys ceph_subsys_rgw @@ -175,7 +178,10 @@ static void log_usage(struct req_state *s, const string& op_name) rgw_usage_log_entry entry(user, s->bucket.name); - rgw_usage_data data(s->bytes_sent, s->bytes_received); + uint64_t bytes_sent = s->cio->get_bytes_sent(); + uint64_t bytes_received = s->cio->get_bytes_received(); + + rgw_usage_data data(bytes_sent, bytes_received); data.ops = 1; if (!s->err.is_err()) @@ -188,7 +194,67 @@ static void log_usage(struct req_state *s, const string& op_name) usage_logger->insert(ts, entry); } -int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name) +void rgw_format_ops_log_entry(struct rgw_log_entry& entry, Formatter *formatter) +{ + formatter->open_object_section("log_entry"); + formatter->dump_string("bucket", entry.bucket); + entry.time.gmtime(formatter->dump_stream("time")); // UTC + entry.time.localtime(formatter->dump_stream("time_local")); + formatter->dump_string("remote_addr", entry.remote_addr); + if (entry.object_owner.length()) + formatter->dump_string("object_owner", entry.object_owner); + formatter->dump_string("user", entry.user); + formatter->dump_string("operation", entry.op); + formatter->dump_string("uri", entry.uri); + formatter->dump_string("http_status", entry.http_status); + formatter->dump_string("error_code", entry.error_code); + formatter->dump_int("bytes_sent", entry.bytes_sent); + formatter->dump_int("bytes_received", entry.bytes_received); + formatter->dump_int("object_size", entry.obj_size); + uint64_t total_time = entry.total_time.sec() * 1000000LL * entry.total_time.usec(); + + formatter->dump_int("total_time", total_time); + formatter->dump_string("user_agent", entry.user_agent); + formatter->dump_string("referrer", entry.referrer); + formatter->close_section(); +} + +void OpsLogSocket::formatter_to_bl(bufferlist& bl) +{ + stringstream ss; + formatter->flush(ss); + const string& s = ss.str(); + + bl.append(s); +} + +void OpsLogSocket::init_connection(bufferlist& bl) +{ + bl.append("["); +} + +OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog) +{ + formatter = new JSONFormatter; + delim.append(",\n"); +} + +OpsLogSocket::~OpsLogSocket() +{ + delete formatter; +} + +void OpsLogSocket::log(struct rgw_log_entry& entry) +{ + bufferlist bl; + + rgw_format_ops_log_entry(entry, formatter); + formatter_to_bl(bl); + + append_output(bl); +} + +int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsLogSocket *olog) { struct rgw_log_entry entry; string bucket_id; @@ -240,10 +306,13 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name) entry.object_owner = s->object_acl->get_owner().get_id(); entry.bucket_owner = s->bucket_owner; + uint64_t bytes_sent = s->cio->get_bytes_sent(); + uint64_t bytes_received = s->cio->get_bytes_received(); + entry.time = s->time; entry.total_time = ceph_clock_now(s->cct) - s->time; - entry.bytes_sent = s->bytes_sent; - entry.bytes_received = s->bytes_received; + entry.bytes_sent = bytes_sent; + entry.bytes_received = bytes_received; if (s->err.http_ret) { char buf[16]; snprintf(buf, sizeof(buf), "%d", s->err.http_ret); @@ -263,19 +332,27 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name) gmtime_r(&t, &bdt); else localtime_r(&t, &bdt); - - string oid = render_log_object_name(s->cct->_conf->rgw_log_object_name, &bdt, - s->bucket.bucket_id, entry.bucket.c_str()); - rgw_obj obj(store->params.log_pool, oid); + int ret = 0; + + if (s->cct->_conf->rgw_ops_log_rados) { + string oid = render_log_object_name(s->cct->_conf->rgw_log_object_name, &bdt, + s->bucket.bucket_id, entry.bucket.c_str()); + + rgw_obj obj(store->params.log_pool, oid); - int ret = store->append_async(obj, bl.length(), bl); - if (ret == -ENOENT) { - ret = store->create_pool(store->params.log_pool); - if (ret < 0) - goto done; - // retry ret = store->append_async(obj, bl.length(), bl); + if (ret == -ENOENT) { + ret = store->create_pool(store->params.log_pool); + if (ret < 0) + goto done; + // retry + ret = store->append_async(obj, bl.length(), bl); + } + } + + if (olog) { + olog->log(entry); } done: if (ret < 0) diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h index f832c1cadeb..823f0b1767f 100644 --- a/src/rgw/rgw_log.h +++ b/src/rgw/rgw_log.h @@ -3,6 +3,8 @@ #include "rgw_common.h" #include "include/utime.h" +#include "common/Formatter.h" +#include "common/OutputDataSocket.h" class RGWRados; @@ -115,11 +117,27 @@ struct rgw_intent_log_entry { }; WRITE_CLASS_ENCODER(rgw_intent_log_entry) -int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name); +class OpsLogSocket : public OutputDataSocket { + Formatter *formatter; + + void formatter_to_bl(bufferlist& bl); + +protected: + void init_connection(bufferlist& bl); + +public: + OpsLogSocket(CephContext *cct, uint64_t _backlog); + ~OpsLogSocket(); + + void log(struct rgw_log_entry& entry); +}; + +int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsLogSocket *olog); int rgw_log_intent(RGWRados *store, rgw_obj& obj, RGWIntentEvent intent, const utime_t& timestamp, bool utc); int rgw_log_intent(RGWRados *store, struct req_state *s, rgw_obj& obj, RGWIntentEvent intent); void rgw_log_usage_init(CephContext *cct, RGWRados *store); void rgw_log_usage_finalize(); +void rgw_format_ops_log_entry(struct rgw_log_entry& entry, Formatter *formatter); #endif diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 944b59a5c8d..5f52dde228d 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -61,20 +61,12 @@ static sighandler_t sighandler_usr1; static sighandler_t sighandler_alrm; static sighandler_t sighandler_term; +class RGWProcess; -#define SOCKET_BACKLOG 1024 +static RGWProcess *pprocess = NULL; -static void godown_handler(int signum) -{ - FCGX_ShutdownPending(); - signal(signum, sighandler_usr1); - alarm(5); -} -static void godown_alarm(int signum) -{ - _exit(0); -} +#define SOCKET_BACKLOG 1024 struct RGWRequest { @@ -127,10 +119,12 @@ struct RGWRequest class RGWProcess { RGWRados *store; + OpsLogSocket *olog; deque<RGWRequest *> m_req_queue; ThreadPool m_tp; Throttle req_throttle; RGWREST *rest; + int sock_fd; struct RGWWQ : public ThreadPool::WorkQueue<RGWRequest> { RGWProcess *process; @@ -185,20 +179,25 @@ class RGWProcess { uint64_t max_req_id; public: - RGWProcess(CephContext *cct, RGWRados *rgwstore, int num_threads, RGWREST *_rest) - : store(rgwstore), m_tp(cct, "RGWProcess::m_tp", num_threads), + RGWProcess(CephContext *cct, RGWRados *rgwstore, OpsLogSocket *_olog, int num_threads, RGWREST *_rest) + : store(rgwstore), olog(_olog), m_tp(cct, "RGWProcess::m_tp", num_threads), req_throttle(cct, "rgw_ops", num_threads * 2), - rest(_rest), + rest(_rest), sock_fd(-1), req_wq(this, g_conf->rgw_op_thread_timeout, g_conf->rgw_op_thread_suicide_timeout, &m_tp), max_req_id(0) {} void run(); void handle_request(RGWRequest *req); + + void close_fd() { + if (sock_fd >= 0) + close(sock_fd); + } }; void RGWProcess::run() { - int s = 0; + sock_fd = 0; if (!g_conf->rgw_socket_path.empty()) { string path_str = g_conf->rgw_socket_path; @@ -216,9 +215,9 @@ void RGWProcess::run() } const char *path = path_str.c_str(); - s = FCGX_OpenSocket(path, SOCKET_BACKLOG); - if (s < 0) { - dout(0) << "ERROR: FCGX_OpenSocket (" << path << ") returned " << s << dendl; + sock_fd = FCGX_OpenSocket(path, SOCKET_BACKLOG); + if (sock_fd < 0) { + dout(0) << "ERROR: FCGX_OpenSocket (" << path << ") returned " << sock_fd << dendl; return; } if (chmod(path, 0777) < 0) { @@ -232,7 +231,7 @@ void RGWProcess::run() RGWRequest *req = new RGWRequest; req->id = ++max_req_id; dout(10) << "allocated request req=" << hex << req << dec << dendl; - FCGX_InitRequest(&req->fcgx, s, 0); + FCGX_InitRequest(&req->fcgx, sock_fd, 0); req_throttle.get(1); int ret = FCGX_Accept_r(&req->fcgx); if (ret < 0) { @@ -248,6 +247,19 @@ void RGWProcess::run() m_tp.stop(); } +static void godown_handler(int signum) +{ + FCGX_ShutdownPending(); + pprocess->close_fd(); + signal(signum, sighandler_usr1); + alarm(5); +} + +static void godown_alarm(int signum) +{ + _exit(0); +} + static int call_log_intent(RGWRados *store, void *ctx, rgw_obj& obj, RGWIntentEvent intent) { struct req_state *s = (struct req_state *)ctx; @@ -331,7 +343,7 @@ void RGWProcess::handle_request(RGWRequest *req) op->execute(); op->complete(); done: - rgw_log_op(store, s, (op ? op->name() : "unknown")); + rgw_log_op(store, s, (op ? op->name() : "unknown"), olog); int http_ret = s->err.http_ret; @@ -485,8 +497,18 @@ int main(int argc, const char **argv) rest.register_resource(g_conf->rgw_admin_entry, admin_resource); } - RGWProcess process(g_ceph_context, store, g_conf->rgw_thread_pool_size, &rest); - process.run(); + OpsLogSocket *olog = NULL; + + if (!g_conf->rgw_ops_log_socket_path.empty()) { + olog = new OpsLogSocket(g_ceph_context, g_conf->rgw_ops_log_data_backlog); + olog->init(g_conf->rgw_ops_log_socket_path); + } + + pprocess = new RGWProcess(g_ceph_context, store, olog, g_conf->rgw_thread_pool_size, &rest); + + pprocess->run(); + + delete pprocess; if (do_swift) { swift_finalize(); @@ -494,12 +516,22 @@ int main(int argc, const char **argv) rgw_log_usage_finalize(); + delete olog; + rgw_perf_stop(g_ceph_context); unregister_async_signal_handler(SIGHUP, sighup_handler); RGWStoreManager::close_storage(store); + rgw_tools_cleanup(); + curl_global_cleanup(); + g_ceph_context->put(); + + shutdown_async_signal_handler(); + + ceph::crypto::shutdown(); + return 0; } diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 5da22d2ad7f..49863c3046d 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1747,7 +1747,6 @@ void RGWPutACLs::execute() RGWACLXMLParser_S3 parser(s->cct); RGWAccessControlPolicy_S3 new_policy(s->cct); stringstream ss; - char *orig_data = data; char *new_data = NULL; ACLOwner owner; rgw_obj obj; @@ -1756,27 +1755,29 @@ void RGWPutACLs::execute() if (!parser.init()) { ret = -EINVAL; - goto done; + return; } owner.set_id(s->user.user_id); owner.set_name(s->user.display_name); - if (get_params() < 0) - goto done; + ret = get_params(); + if (ret < 0) + return; ldout(s->cct, 15) << "read len=" << len << " data=" << (data ? data : "") << dendl; if (!s->canned_acl.empty() && len) { ret = -EINVAL; - goto done; + return; } if (!s->canned_acl.empty()) { ret = get_canned_policy(owner, ss); if (ret < 0) - goto done; + return; new_data = strdup(ss.str().c_str()); + free(data); data = new_data; len = ss.str().size(); } @@ -1784,12 +1785,12 @@ void RGWPutACLs::execute() if (!parser.parse(data, len, 1)) { ret = -EACCES; - goto done; + return; } policy = (RGWAccessControlPolicy_S3 *)parser.find_first("AccessControlPolicy"); if (!policy) { ret = -EINVAL; - goto done; + return; } if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { @@ -1800,7 +1801,7 @@ void RGWPutACLs::execute() ret = policy->rebuild(store, &owner, new_policy); if (ret < 0) - goto done; + return; if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) { ldout(s->cct, 15) << "New AccessControlPolicy:"; @@ -1812,10 +1813,6 @@ void RGWPutACLs::execute() obj.init(s->bucket, s->object_str); store->set_atomic(s->obj_ctx, obj); ret = store->set_attr(s->obj_ctx, obj, RGW_ATTR_ACL, bl); - -done: - free(orig_data); - free(new_data); } int RGWInitMultipart::verify_permission() diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 52676b92d71..043bad03f3f 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -493,6 +493,9 @@ public: len = 0; data = NULL; } + virtual ~RGWPutACLs() { + free(data); + } int verify_permission(); void execute(); @@ -540,6 +543,9 @@ public: data = NULL; len = 0; } + virtual ~RGWCompleteMultipart() { + free(data); + } int verify_permission(); void execute(); diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 6255010785a..aa25d507dfb 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -166,8 +166,11 @@ void RGWRadosCtx::set_prefetch_data(rgw_obj& obj) { void RGWRados::finalize() { - if (use_gc_thread) + if (use_gc_thread) { gc->stop_processor(); + delete gc; + gc = NULL; + } } /** diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 4e0b52d97b7..d8963e2a91e 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -382,7 +382,12 @@ public: RGWRadosParams params; - virtual ~RGWRados() {} + virtual ~RGWRados() { + if (rados) { + rados->shutdown(); + delete rados; + } + } void tick(); diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc index 4201bd42d06..b6d9f284771 100644 --- a/src/rgw/rgw_tools.cc +++ b/src/rgw/rgw_tools.cc @@ -161,3 +161,8 @@ int rgw_tools_init(CephContext *cct) return 0; } + +void rgw_tools_cleanup() +{ + ext_mime_map.clear(); +} diff --git a/src/rgw/rgw_tools.h b/src/rgw/rgw_tools.h index 7860ea3819e..6553d9d1465 100644 --- a/src/rgw/rgw_tools.h +++ b/src/rgw/rgw_tools.h @@ -12,6 +12,7 @@ int rgw_put_system_obj(RGWRados *rgwstore, rgw_bucket& bucket, string& oid, cons int rgw_get_obj(RGWRados *rgwstore, void *ctx, rgw_bucket& bucket, string& key, bufferlist& bl, map<string, bufferlist> *pattrs = NULL); int rgw_tools_init(CephContext *cct); +void rgw_tools_cleanup(); const char *rgw_find_mime_by_ext(string& ext); #endif diff --git a/src/rgw/rgw_xml.cc b/src/rgw/rgw_xml.cc index c94b82e30fa..4347b06115c 100644 --- a/src/rgw/rgw_xml.cc +++ b/src/rgw/rgw_xml.cc @@ -131,13 +131,16 @@ static void xml_start(void *data, const char *el, const char **attr) { } RGWXMLParser:: -RGWXMLParser() : p(NULL), buf(NULL), buf_len(0), cur_obj(NULL), success(true) +RGWXMLParser() : buf(NULL), buf_len(0), cur_obj(NULL), success(true) { + p = XML_ParserCreate(NULL); } RGWXMLParser:: ~RGWXMLParser() { + XML_ParserFree(p); + free(buf); vector<XMLObj *>::iterator iter; for (iter = objs.begin(); iter != objs.end(); ++iter) { @@ -194,7 +197,6 @@ void RGWXMLParser::handle_data(const char *s, int len) bool RGWXMLParser::init() { - p = XML_ParserCreate(NULL); if (!p) { return false; } @@ -221,8 +223,5 @@ bool RGWXMLParser::parse(const char *_buf, int len, int done) success = false; } - if (done || !success) - XML_ParserFree(p); - return success; } diff --git a/src/sample.ceph.conf b/src/sample.ceph.conf index 88f7f02d992..dd121f2ee5c 100644 --- a/src/sample.ceph.conf +++ b/src/sample.ceph.conf @@ -131,27 +131,33 @@ ;debug filestore = 20 ;debug journal = 20 + ; The filesystem used on the volumes + osd mkfs type = btrfs + ; If you want to specify some other mount options, you can do so. + ; for other filesystems use 'osd mount options $fstype' + osd mount options btrfs = rw,noatime + ; The options used to format the filesystem via mkfs.$fstype + ; for other filesystems use 'osd mkfs options $fstype' + ; osd mkfs options btrfs = + + [osd.0] host = delta - ; if 'btrfs devs' is not specified, you're responsible for + ; if 'devs' is not specified, you're responsible for ; setting up the 'osd data' dir. if it is not btrfs, things ; will behave up until you try to recover from a crash (which ; usually fine for basic testing). - btrfs devs = /dev/sdx - - ; If you want to specify some other mount options, you can do so. - ; The default values are rw,noatime - ;btrfs options = rw,noatime + devs = /dev/sdx [osd.1] host = epsilon - btrfs devs = /dev/sdy + devs = /dev/sdy [osd.2] host = zeta - btrfs devs = /dev/sdx + devs = /dev/sdx [osd.3] host = eta - btrfs devs = /dev/sdy + devs = /dev/sdy diff --git a/src/test/cli/osdmaptool/ceph.conf.withracks b/src/test/cli/osdmaptool/ceph.conf.withracks index 1e14411c9da..09399e9554a 100644 --- a/src/test/cli/osdmaptool/ceph.conf.withracks +++ b/src/test/cli/osdmaptool/ceph.conf.withracks @@ -42,7 +42,9 @@ keyring = /mnt/osd.$id/keyring osd data = /mnt/osd.$id osd journal = /dev/disk/by-label/osd.$id.journal - btrfs devs = /dev/disk/by-label/osd.$id.data + osd mkfs type = btrfs + osd mount options btrfs = rw,noatime + devs = /dev/disk/by-label/osd.$id.data ; temp sage debug osd = 20 debug ms = 1 diff --git a/src/test/librbd/fsx.c b/src/test/librbd/fsx.c index f895b5ffa10..d884173b0cf 100644 --- a/src/test/librbd/fsx.c +++ b/src/test/librbd/fsx.c @@ -93,7 +93,8 @@ int logcount = 0; /* total ops */ #define OP_PUNCH_HOLE 6 /* rbd-specific operations */ #define OP_CLONE 7 -#define OP_MAX_FULL 8 +#define OP_FLATTEN 8 +#define OP_MAX_FULL 9 /* operation modifiers */ #define OP_CLOSEOPEN 100 @@ -334,6 +335,9 @@ logdump(void) case OP_CLONE: prt("CLONE"); break; + case OP_FLATTEN: + prt("FLATTEN"); + break; case OP_SKIPPED: prt("SKIPPED (no operation)"); break; @@ -408,7 +412,7 @@ report_failure(int status) *(((unsigned char *)(cp)) + 1))) void -check_buffers(unsigned offset, unsigned size) +check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size) { unsigned char c, t; unsigned i = 0; @@ -586,7 +590,7 @@ doread(unsigned offset, unsigned size) ret, size); report_failure(141); } - check_buffers(offset, size); + check_buffers(good_buf, temp_buf, offset, size); } @@ -789,6 +793,8 @@ void clone_imagename(char *buf, size_t len, int clones) strncpy(buf, iname, len); } +void check_clone(int clonenum); + void do_clone() { @@ -819,6 +825,10 @@ do_clone() exit(163); } + clone_imagename(imagename, sizeof(imagename), num_clones); + clone_imagename(lastimagename, sizeof(lastimagename), + num_clones - 1); + if ((ret = rbd_snap_create(image, "snap")) < 0) { simple_err("do_clone: rbd create snap", ret); exit(164); @@ -829,10 +839,6 @@ do_clone() exit(164); } - clone_imagename(imagename, sizeof(imagename), num_clones); - clone_imagename(lastimagename, sizeof(lastimagename), - num_clones - 1); - ret = rbd_clone2(ioctx, lastimagename, "snap", ioctx, imagename, RBD_FEATURES_ALL, &order, stripe_unit, stripe_count); if (ret < 0) { @@ -844,58 +850,58 @@ do_clone() simple_err("do_clone: rbd open", ret); exit(166); } + + if (num_clones > 1) + check_clone(num_clones - 2); } void -check_clones() +check_clone(int clonenum) { - char filename[1024]; - char imagename[1024]; + char filename[128]; + char imagename[128]; int ret, fd; rbd_image_t cur_image; struct stat file_info; - while (num_clones > 0) { - prt("checking clone #%d\n", num_clones); - --num_clones; - - clone_imagename(imagename, sizeof(imagename), num_clones); - if ((ret = rbd_open(ioctx, imagename, &cur_image, NULL)) < 0) { - simple_err("check_clones: rbd open", ret); - exit(167); - } + char *good_buf, *temp_buf; - clone_filename(filename, sizeof(filename), num_clones + 1); - if ((fd = open(filename, O_RDONLY)) < 0) { - simple_err("check_clones: open", -errno); - exit(168); - } + clone_imagename(imagename, sizeof(imagename), clonenum); + if ((ret = rbd_open(ioctx, imagename, &cur_image, NULL)) < 0) { + simple_err("check_clone: rbd open", ret); + exit(167); + } - prt("checking image %s against file %s\n", imagename, filename); - if ((ret = fstat(fd, &file_info)) < 0) { - simple_err("check_clones: fstat", -errno); - exit(169); - } + clone_filename(filename, sizeof(filename), clonenum + 1); + if ((fd = open(filename, O_RDONLY)) < 0) { + simple_err("check_clone: open", -errno); + exit(168); + } - if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) { - simple_err("check_clones: pread", -errno); - exit(170); - } + prt("checking clone #%d, image %s against file %s\n", + clonenum, imagename, filename); + if ((ret = fstat(fd, &file_info)) < 0) { + simple_err("check_clone: fstat", -errno); + exit(169); + } - if ((ret = rbd_read(cur_image, 0, file_info.st_size, temp_buf)) < 0) { - simple_err("check_clones: rbd_read", ret); - exit(171); - } - close(fd); - check_buffers(0, file_info.st_size); - - unlink(filename); - /* remove the snapshot if it exists, ignore - the error from the last clone. */ - rbd_snap_unprotect(cur_image, "snap"); - rbd_snap_remove(cur_image, "snap"); - rbd_close(cur_image); - rbd_remove(ioctx, imagename); + good_buf = malloc(file_info.st_size); + temp_buf = malloc(file_info.st_size); + + if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) { + simple_err("check_clone: pread", -errno); + exit(170); + } + if ((ret = rbd_read(cur_image, 0, file_info.st_size, temp_buf)) < 0) { + simple_err("check_clone: rbd_read", ret); + exit(171); } + close(fd); + check_buffers(good_buf, temp_buf, 0, file_info.st_size); + + unlink(filename); + + free(good_buf); + free(temp_buf); } void @@ -919,6 +925,25 @@ writefileimage() } } +void +do_flatten() +{ + int ret; + + if (num_clones == 0 || + (rbd_get_parent_info(image, NULL, 0, NULL, 0, NULL, 0) + == -ENOENT)) { + log4(OP_SKIPPED, OP_FLATTEN, 0, 0); + return; + } + log4(OP_FLATTEN, 0, 0, 0); + prt("%lu flatten\n", testcalls); + + if ((ret = rbd_flatten(image)) < 0) { + simple_err("do_flatten: rbd flatten", ret); + exit(177); + } +} void docloseopen(void) @@ -1048,6 +1073,10 @@ test(void) do_clone(); break; + case OP_FLATTEN: + do_flatten(); + break; + default: prterr("test: unknown operation"); report_failure(42); @@ -1279,7 +1308,6 @@ main(int argc, char **argv) char *endp; char goodfile[1024]; char logfile[1024]; - char finaliname[1024]; goodfile[0] = 0; logfile[0] = 0; @@ -1514,7 +1542,7 @@ main(int argc, char **argv) maxfilelen); exit(98); } - } else + } else check_trunc_hack(); //test_fallocate(); @@ -1527,14 +1555,48 @@ main(int argc, char **argv) report_failure(99); } - clone_imagename(finaliname, sizeof(finaliname), num_clones); - if ((ret = rbd_remove(ioctx, finaliname)) < 0) { - prterrcode("rbd_remove final image", ret); - report_failure(100); - } + if (num_clones > 0) + check_clone(num_clones - 1); + + while (num_clones >= 0) { + static int first = 1; + char clonename[128]; + char errmsg[128]; + clone_imagename(clonename, 128, num_clones); + if ((ret = rbd_open(ioctx, clonename, &image, NULL)) < 0) { + sprintf(errmsg, "rbd_open %s", clonename); + prterrcode(errmsg, ret); + report_failure(101); + } + if (!first) { + if ((ret = rbd_snap_unprotect(image, "snap")) < 0) { + sprintf(errmsg, "rbd_snap_unprotect %s@snap", + clonename); + prterrcode(errmsg, ret); + report_failure(102); + } + if ((ret = rbd_snap_remove(image, "snap")) < 0) { + sprintf(errmsg, "rbd_snap_remove %s@snap", + clonename); + prterrcode(errmsg, ret); + report_failure(103); + } + } + if ((ret = rbd_close(image)) < 0) { + sprintf(errmsg, "rbd_close %s", clonename); + prterrcode(errmsg, ret); + report_failure(104); + } - if (clone_calls) - check_clones(); + if ((ret = rbd_remove(ioctx, clonename)) < 0) { + sprintf(errmsg, "rbd_remove %s", clonename); + prterrcode(errmsg, ret); + report_failure(105); + } + + first = 0; + num_clones--; + } rados_ioctx_destroy(ioctx); rados_shutdown(cluster); |